[Pkg-opencl-devel] [beignet] 57/66: Imported Upstream version 0.8

Andreas Beckmann anbe at moszumanska.debian.org
Fri Oct 31 07:27:09 UTC 2014


This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch master
in repository beignet.

commit 1632e10238b43f4904df4b09ea10d7851fd3aad3
Author: Simon Richter <sjr at debian.org>
Date:   Sat Apr 19 15:52:26 2014 +0200

    Imported Upstream version 0.8
---
 CMake/FindDRM.cmake                                |     7 +-
 CMake/FindDRMIntel.cmake                           |     7 +-
 CMake/FindEGL.cmake                                |     1 +
 CMake/FindGBE.cmake                                |    37 -
 CMake/FindLLVM.cmake                               |    20 +-
 CMakeLists.txt                                     |    28 +-
 README.md                                          |     3 +-
 backend/CMakeLists.txt                             |     6 +
 backend/src/CMakeLists.txt                         |    72 +-
 backend/src/GBEConfig.h.in                         |     1 +
 backend/src/backend/context.cpp                    |    46 +-
 backend/src/backend/gen/gen_mesa_disasm.c          |     2 +
 backend/src/backend/gen_context.cpp                |   388 +-
 backend/src/backend/gen_context.hpp                |    18 +-
 backend/src/backend/gen_defs.hpp                   |     2 +
 backend/src/backend/gen_encoder.cpp                |    41 +-
 backend/src/backend/gen_encoder.hpp                |     2 +
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |     2 +-
 backend/src/backend/gen_insn_selection.cpp         |   530 +-
 backend/src/backend/gen_insn_selection.hpp         |    15 +-
 backend/src/backend/gen_insn_selection.hxx         |     5 +-
 backend/src/backend/gen_program.cpp                |    30 +-
 backend/src/backend/gen_program.h                  |     2 -
 backend/src/backend/gen_reg_allocation.cpp         |   419 +-
 backend/src/backend/gen_reg_allocation.hpp         |     7 +
 backend/src/backend/gen_register.hpp               |    17 +-
 backend/src/backend/program.cpp                    |   276 +-
 backend/src/backend/program.h                      |    20 +-
 backend/src/backend/program.hpp                    |    19 +-
 backend/src/builtin_vector_proto.def               |    28 +-
 backend/src/gbe_bin_generater.cpp                  |     2 +-
 backend/src/gen_as.sh                              |    24 +-
 backend/src/gen_builtin_vector.py                  |    30 +-
 backend/src/gen_convert.sh                         |   351 +-
 backend/src/genconfig.sh                           |     1 +
 backend/src/ir/context.cpp                         |     4 +-
 backend/src/ir/context.hpp                         |     9 +-
 backend/src/ir/function.cpp                        |     4 +-
 backend/src/ir/function.hpp                        |    20 +-
 backend/src/ir/image.cpp                           |    23 +
 backend/src/ir/image.hpp                           |     5 +
 backend/src/ir/instruction.cpp                     |   260 +-
 backend/src/ir/instruction.hpp                     |    62 +-
 backend/src/ir/instruction.hxx                     |     6 +
 backend/src/ir/liveness.cpp                        |   174 +-
 backend/src/ir/liveness.hpp                        |    28 +-
 backend/src/ir/lowering.cpp                        |    33 +-
 backend/src/ir/profile.cpp                         |     7 +-
 backend/src/ir/profile.hpp                         |     6 +-
 backend/src/ir/register.hpp                        |     5 +
 backend/src/ir/sampler.cpp                         |    68 +-
 backend/src/ir/sampler.hpp                         |    19 +-
 backend/src/llvm/llvm_gen_backend.cpp              |   382 +-
 backend/src/llvm/llvm_gen_backend.hpp              |     4 +
 backend/src/llvm/llvm_gen_ocl_function.hxx         |    31 +-
 backend/src/llvm/llvm_intrinsic_lowering.cpp       |   172 +
 backend/src/llvm/llvm_passes.cpp                   |    42 +-
 backend/src/llvm/llvm_scalarize.cpp                |    33 +-
 backend/src/llvm/llvm_to_gen.cpp                   |   130 +-
 backend/src/llvm/llvm_to_gen.hpp                   |     5 +-
 backend/src/ocl_as.h                               |  1285 +-
 backend/src/ocl_barrier.ll                         |    39 +
 backend/src/ocl_convert.h                          | 13679 ++++++++++++++++++-
 backend/src/ocl_memcpy.ll                          |   336 +
 backend/src/ocl_memset.ll                          |   127 +
 backend/src/ocl_stdlib.tmpl.h                      |  2487 +++-
 backend/src/update_blob_ocl_header.py              |     8 +-
 docs/Beignet.mdwn                                  |   146 +-
 docs/Beignet/Backend/TODO.mdwn                     |    42 +-
 docs/Beignet/Backend/compiler_backend.mdwn         |    30 +-
 docs/Beignet/Backend/gen_ir.mdwn                   |    12 +-
 intel-beignet.icd.in                               |     1 +
 kernels/builtin_exp.cl                             |    10 +
 kernels/builtin_pow.cl                             |     7 +
 kernels/compiler_function_argument3.cl             |    71 +
 kernels/compiler_global_constant.cl                |     2 +-
 kernels/compiler_long.cl                           |     7 +-
 kernels/compiler_menger_sponge_no_shadow.cl        |     2 +-
 kernels/compiler_private_data_overflow.cl          |    10 +
 src/CMakeLists.txt                                 |     7 +-
 src/OCLConfig.h.in                                 |     1 +
 src/cl_api.c                                       |   226 +-
 src/cl_command_queue.c                             |    34 +-
 src/cl_command_queue.h                             |     9 +-
 src/cl_command_queue_gen7.c                        |    54 +-
 src/cl_context.c                                   |     2 +
 src/cl_device_data.h                               |    70 +-
 src/cl_device_id.c                                 |   181 +-
 src/cl_device_id.h                                 |    13 +-
 src/cl_driver.h                                    |    22 +-
 src/cl_driver_defs.c                               |     5 +-
 src/cl_enqueue.c                                   |    48 +-
 src/cl_enqueue.h                                   |     2 +-
 src/cl_event.c                                     |    48 +-
 src/cl_event.h                                     |     3 +-
 src/cl_gen75_device.h                              |     2 +-
 src/cl_gen7_device.h                               |     2 +-
 src/cl_gt_device.h                                 |    17 +-
 src/cl_kernel.c                                    |    17 +-
 src/cl_kernel.h                                    |     5 +-
 src/cl_khr_icd.c                                   |     9 +-
 src/cl_mem.c                                       |   200 +-
 src/cl_mem.h                                       |    10 +-
 src/cl_platform_id.h                               |     3 +-
 src/cl_program.c                                   |    48 +-
 src/cl_program.h                                   |     3 +
 src/cl_sampler.c                                   |    10 +-
 src/cl_thread.c                                    |   127 +
 src/cl_thread.h                                    |    44 +
 src/intel/intel_driver.c                           |     9 +-
 src/intel/intel_gpgpu.c                            |    71 +-
 utests/.gitignore                                  |     2 +
 utests/CMakeLists.txt                              |    45 +-
 utests/builtin_exp.cpp                             |   102 +
 utests/builtin_pow.cpp                             |    92 +
 utests/compiler_abs.cpp                            |     9 +-
 utests/compiler_abs_diff.cpp                       |    15 +-
 utests/compiler_basic_arithmetic.cpp               |    40 +-
 utests/compiler_bool_cross_basic_block.cpp         |     2 +-
 utests/compiler_function_argument3.cpp             |    45 +
 utests/compiler_global_constant.cpp                |     6 +-
 utests/compiler_group_size.cpp                     |     6 +-
 utests/compiler_long.cpp                           |     2 +
 utests/compiler_long_cmp.cpp                       |     5 +
 utests/compiler_long_convert.cpp                   |     4 +-
 utests/compiler_private_data_overflow.cpp          |    15 +
 utests/compiler_step.cpp                           |    10 +-
 utests/compiler_vector_load_store.cpp              |    36 +-
 utests/get_cl_info.cpp                             |    25 +-
 utests/runtime_createcontext.cpp                   |     5 +-
 utests/setenv.sh.in                                |     5 +
 utests/sub_buffer.cpp                              |   135 +
 utests/utest.cpp                                   |    11 +-
 utests/utest.hpp                                   |     9 +-
 utests/utest_file_map.cpp                          |     2 +-
 utests/utest_generator.py                          |   374 +
 utests/utest_helper.cpp                            |    90 +-
 utests/utest_helper.hpp                            |    13 +-
 utests/utest_math_gen.py                           |   519 +
 utests/utest_run.cpp                               |     4 +-
 140 files changed, 23483 insertions(+), 1739 deletions(-)

diff --git a/CMake/FindDRM.cmake b/CMake/FindDRM.cmake
index 3de35bf..a5a4ebc 100644
--- a/CMake/FindDRM.cmake
+++ b/CMake/FindDRM.cmake
@@ -7,7 +7,11 @@
 # DRM_LIBRARY
 # 
 
-FIND_PATH(DRM_INCLUDE_PATH drm.h
+FIND_PATH(DRM_INCLUDE_PATH
+  NAMES
+  drm.h
+  PATHS
+  ${CMAKE_INCLUDE_PATH}/include/libdrm/
   ~/include/libdrm/
   /usr/include/libdrm/
   /usr/local/include/libdrm/
@@ -17,6 +21,7 @@ FIND_PATH(DRM_INCLUDE_PATH drm.h
 FIND_LIBRARY(DRM_LIBRARY
   NAMES DRM drm
   PATHS
+  ${CMAKE_LIBRARY_PATH}/lib/
   ~/lib/
   /usr/lib64
   /usr/lib
diff --git a/CMake/FindDRMIntel.cmake b/CMake/FindDRMIntel.cmake
index 2d45c64..0aab1c7 100644
--- a/CMake/FindDRMIntel.cmake
+++ b/CMake/FindDRMIntel.cmake
@@ -6,7 +6,11 @@
 # DRM_INTEL_INCLUDE_PATH
 # 
 
-FIND_PATH(DRM_INTEL_INCLUDE_PATH intel_bufmgr.h
+FIND_PATH(DRM_INTEL_INCLUDE_PATH
+  NAMES
+  intel_bufmgr.h
+  PATHS
+  ${CMAKE_INCLUDE_PATH}/include/libdrm/
   ~/include/libdrm/
   /usr/include/libdrm/
   /usr/local/include/libdrm/
@@ -17,6 +21,7 @@ FIND_PATH(DRM_INTEL_INCLUDE_PATH intel_bufmgr.h
 FIND_LIBRARY(DRM_INTEL_LIBRARY
   NAMES DRM_INTEL drm_intel
   PATHS
+  ${CMAKE_LIBRARY_PATH}/lib/
   ~/lib/
   /usr/lib64
   /usr/lib
diff --git a/CMake/FindEGL.cmake b/CMake/FindEGL.cmake
index d84ef95..597b4a5 100644
--- a/CMake/FindEGL.cmake
+++ b/CMake/FindEGL.cmake
@@ -45,6 +45,7 @@ SET(MESA_SOURCE_INCLUDES ${MESA_SOURCE_PREFIX}/src/mesa
                          ${MESA_SOURCE_PREFIX}/include
                          ${MESA_SOURCE_PREFIX}/src/mapi
                          ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/i965/
+                         ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/intel/
                          ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/common/)
 SET(MESA_SOURCE_FOUND 1 CACHE STRING "Set to 1 if mesa source code is found, 0 otherwise")
 ELSE(MESA_SOURCE_PREFIX)
diff --git a/CMake/FindGBE.cmake b/CMake/FindGBE.cmake
deleted file mode 100644
index db938c7..0000000
--- a/CMake/FindGBE.cmake
+++ /dev/null
@@ -1,37 +0,0 @@
-#
-# Try to find X library and include path.
-# Once done this will define
-#
-# GBE_FOUND
-# GBE_INCLUDE_PATH
-# GBE_LIBRARY
-# 
-
-FIND_PATH(GBE_INCLUDE_PATH gen/program.h
-  ~/include/
-  /usr/include/
-  /usr/local/include/
-  /sw/include/
-  /opt/local/include/
-  DOC "The directory where gen/program.h resides")
-FIND_LIBRARY(GBE_LIBRARY
-  NAMES GBE gbe
-  PATHS
-  ~/lib/
-  /usr/lib64
-  /usr/lib
-  /usr/local/lib64
-  /usr/local/lib
-  /sw/lib
-  /opt/local/lib
-  DOC "The GBE library")
-
-IF(GBE_INCLUDE_PATH)
-  INCLUDE_DIRECTORIES(${GBE_INCLUDE_PATH})
-  SET(GBE_FOUND 1 CACHE STRING "Set to 1 if GBE is found, 0 otherwise")
-ELSE(GBE_INCLUDE_PATH)
-  SET(GBE_FOUND 0 CACHE STRING "Set to 1 if GBE is found, 0 otherwise")
-ENDIF(GBE_INCLUDE_PATH)
-
-MARK_AS_ADVANCED(GBE_FOUND)
-
diff --git a/CMake/FindLLVM.cmake b/CMake/FindLLVM.cmake
index 3fa9ad9..97ee7db 100644
--- a/CMake/FindLLVM.cmake
+++ b/CMake/FindLLVM.cmake
@@ -7,9 +7,14 @@
 # LLVM_MODULE_LIBS - list of llvm libs for working with modules.
 # LLVM_FOUND       - True if llvm found.
 if (LLVM_INSTALL_DIR)
-  find_program(LLVM_CONFIG_EXECUTABLE NAMES llvm-config-32 llvm-config-3.2 llvm-config-31 llvm-config-3.1 llvm-config-3.4 llvm-config DOC "llvm-config executable" PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
+  find_program(LLVM_CONFIG_EXECUTABLE
+               NAMES llvm-config-33 llvm-config-3.3 llvm-config-35 llvm-config-3.5 llvm-config-34 llvm-config-3.4 llvm-config
+               DOC "llvm-config executable"
+               PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
 else (LLVM_INSTALL_DIR)
-  find_program(LLVM_CONFIG_EXECUTABLE NAMES llvm-config-32 llvm-config-3.2 llvm-config-31 llvm-config-3.1 llvm-config-3.4 llvm-config DOC "llvm-config executable")
+  find_program(LLVM_CONFIG_EXECUTABLE
+               NAMES llvm-config-33 llvm-config-3.3 llvm-config-35 llvm-config-3.5 llvm-config-34 llvm-config-3.4 llvm-config
+               DOC "llvm-config executable")
 endif (LLVM_INSTALL_DIR)
 
 if (LLVM_CONFIG_EXECUTABLE)
@@ -24,7 +29,7 @@ if (LLVM_FIND_VERSION_MAJOR AND LLVM_FIND_VERSION_MINOR)
     COMMAND ${LLVM_CONFIG_EXECUTABLE} --version
     OUTPUT_VARIABLE LLVM_VERSION
   )
-  string(REGEX REPLACE "([0-9]*)\\.([0-9]*)[^0-9]*" "\\1\\2 " LLVM_VERSION_NODOT ${LLVM_VERSION})
+  string(REGEX REPLACE "([0-9])\\.([0-9]*).*" "\\1\\2 " LLVM_VERSION_NODOT ${LLVM_VERSION})
   if (LLVM_VERSION_NODOT VERSION_LESS LLVM_FIND_VERSION_NODOT)
     message(FATAL_ERROR "imcompatible LLVM version ${LLVM_VERSION} required ${LLVM_FIND_VERSION}")
   else (LLVM_VERSION_NODOT VERSION_LESS LLVM_FIND_VERSION_NODOT)
@@ -67,6 +72,15 @@ execute_process(
   OUTPUT_STRIP_TRAILING_WHITESPACE
 )
 
+if (LLVM_VERSION_NODOT VERSION_GREATER 34)
+execute_process(
+  COMMAND ${LLVM_CONFIG_EXECUTABLE} --system-libs
+  OUTPUT_VARIABLE LLVM_SYSTEM_LIBS_ORIG
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+string(REGEX REPLACE " *\n" "" LLVM_SYSTEM_LIBS ${LLVM_SYSTEM_LIBS_ORIG})
+endif (LLVM_VERSION_NODOT VERSION_GREATER 34)
+
 macro(add_one_lib name)
   FIND_LIBRARY(CLANG_LIB
     NAMES ${name}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2ec6c08..4ed27b5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,7 +9,8 @@
 CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0)
 PROJECT(OCL)
 set (LIBCL_DRIVER_VERSION_MAJOR 0)
-set (LIBCL_DRIVER_VERSION_MINOR 3)
+set (LIBCL_DRIVER_VERSION_MINOR 8)
+set (LIBCL_DRIVER_VERSION_PATCH 0)
 set (LIBCL_C_VERSION_MAJOR 1)
 set (LIBCL_C_VERSION_MINOR 1)
 
@@ -22,6 +23,9 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
 
 SET(CMAKE_VERBOSE_MAKEFILE "false")
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMake/")
+if (NOT LIB_INSTALL_DIR)
+  set (LIB_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/lib")
+endif (NOT LIB_INSTALL_DIR)
 SET(EMULATE_IVB false CACHE BOOL "To emulate IVB")
 SET(EMULATE_SNB false CACHE BOOL "To emulate SNB")
 SET(EMULATE_HSW false CACHE BOOL "To emulate HSW")
@@ -65,7 +69,7 @@ SET(CMAKE_C_FLAGS "-Wall -mfpmath=sse -msse2 -Wcast-align -msse2 -msse3 -mssse3
 
 # Front end stuff we need
 #INCLUDE(CMake/FindLLVM.cmake)
-Find_Package(LLVM 3.1)
+Find_Package(LLVM 3.3)
 
 # XLib
 Find_Package(X11)
@@ -111,14 +115,6 @@ ELSE(XFIXES_FOUND)
   MESSAGE(STATUS "Looking for Xfixes - not found")
 ENDIF(XFIXES_FOUND)
 
-# Gen-backend (compiler)
-Find_Package(GBE)
-IF(GBE_FOUND)
-  MESSAGE(STATUS "Looking for Gen-Backend - found")
-ELSE(GBE_FOUND)
-  MESSAGE(STATUS "Looking for Gen-Backend - not found")
-ENDIF(GBE_FOUND)
-
 Find_Package(EGL)
 IF(EGL_FOUND)
   MESSAGE(STATUS "Looking for EGL - found")
@@ -135,6 +131,11 @@ ENDIF(MESA_SOURCE_FOUND)
 Find_Package(OCLIcd)
 IF(OCLIcd_FOUND)
   MESSAGE(STATUS "Looking for OCL ICD header file - found")
+  configure_file (
+    "intel-beignet.icd.in"
+    "intel-beignet.icd"
+  )
+  install (FILES ${CMAKE_CURRENT_BINARY_DIR}/intel-beignet.icd DESTINATION /etc/OpenCL/vendors)
 ELSE(OCLIcd_FOUND)
   MESSAGE(STATUS "Looking for OCL ICD header file - not found")
 ENDIF(OCLIcd_FOUND)
@@ -146,3 +147,10 @@ ADD_SUBDIRECTORY(backend)
 ADD_SUBDIRECTORY(src)
 ADD_SUBDIRECTORY(utests)
 
+SET(CPACK_PACKAGE_VERSION_MAJOR "${LIBCL_DRIVER_VERSION_MAJOR}")
+SET(CPACK_PACKAGE_VERSION_MINOR "${LIBCL_DRIVER_VERSION_MINOR}")
+SET(CPACK_PACKAGE_VERSION_PATCH "${LIBCL_DRIVER_VERSION_PATCH}")
+SET(CPACK_SOURCE_GENERATOR "TGZ;TZ")
+SET(CPACK_PACKAGE_NAME "Beignet")
+SET(CPACK_PACKAGE_VENDOR "Intel Open Source Technology Center")
+INCLUDE(CPack)
diff --git a/README.md b/README.md
deleted file mode 100644
index cbaeaa9..0000000
--- a/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-We host documents at the following wiki page:
-[http://wiki.freedesktop.org/www/Software/Beignet](http://wiki.freedesktop.org/www/Software/Beignet)
diff --git a/README.md b/README.md
new file mode 120000
index 0000000..b9f23a8
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+docs/Beignet.mdwn
\ No newline at end of file
diff --git a/backend/CMakeLists.txt b/backend/CMakeLists.txt
index 476c6f2..dd55a4a 100644
--- a/backend/CMakeLists.txt
+++ b/backend/CMakeLists.txt
@@ -91,9 +91,15 @@ elseif (COMPILER STREQUAL "ICC")
   set (CCMAKE_CXX_FLAGS_MINSIZEREL "-Os -DGBE_DEBUG=0")
   set (CMAKE_EXE_LINKER_FLAGS "")
 endif ()
+
 include_directories (${CMAKE_CURRENT_BINARY_DIR})
 ##############################################################
 # Project source code
 ##############################################################
 add_subdirectory (src)
+set(LOCAL_PCH_OBJECT_DIR ${LOCAL_PCH_OBJECT_DIR} PARENT_SCOPE)
+set(LOCAL_PCM_OBJECT_DIR ${LOCAL_PCM_OBJECT_DIR} PARENT_SCOPE)
+set (GBE_BIN_GENERATER
+     OCL_PCM_PATH=${LOCAL_PCM_OBJECT_DIR} OCL_PCH_PATH=${LOCAL_PCH_OBJECT_DIR} ${CMAKE_CURRENT_BINARY_DIR}/src/gbe_bin_generater
+     PARENT_SCOPE)
 
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index 36bf688..33494a0 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -1,10 +1,13 @@
+set (beignet_install_path ${LIB_INSTALL_DIR}/beignet/)
+
+
 set (ocl_vector_spec_file ${GBE_SOURCE_DIR}/src/builtin_vector_proto.def)
 set (ocl_vector_file ${GBE_SOURCE_DIR}/src/ocl_vector.h)
 set (ocl_as_file ${GBE_SOURCE_DIR}/src/ocl_as.h)
 set (ocl_convert_file ${GBE_SOURCE_DIR}/src/ocl_convert.h)
 set (ocl_stdlib_tmpl_file ${GBE_SOURCE_DIR}/src/ocl_stdlib.tmpl.h)
 set (ocl_common_header_file ${GBE_SOURCE_DIR}/src/ocl_common_defines.h)
-set (ocl_blob_file ${CMAKE_CURRENT_BINARY_DIR}/ocl_stdlib.h)
+set (ocl_blob_file ${CMAKE_CURRENT_BINARY_DIR}${beignet_install_path}ocl_stdlib.h)
 set (ocl_blob_cpp_file ${GBE_SOURCE_DIR}/src/ocl_stdlib_str.cpp)
 set (ocl_gen_blob_cmd ${GBE_SOURCE_DIR}/src/update_blob_ocl_header.py)
 set (ocl_gen_vector_cmd ${GBE_SOURCE_DIR}/src/gen_builtin_vector.py)
@@ -23,28 +26,31 @@ add_custom_command(
     COMMAND echo "" >> ${ocl_blob_cpp_file}
     DEPENDS ${ocl_blob_file})
 
-set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES "ocl_vector.h;ocl_stdlib.h")
+set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES "ocl_vector.h;${ocl_blob_file}")
 
 add_custom_command(
   OUTPUT ${ocl_vector_file}
   COMMAND ${PYTHON_EXECUTABLE} ${ocl_gen_vector_cmd} ${ocl_vector_spec_file} ${ocl_vector_file}
-  DEPENDS ${ocl_gen_vector_cmd} ${ocl_vector_spec_file})
+  DEPENDS ${ocl_gen_vector_cmd} ${ocl_vector_spec_file}
+  )
 
 add_custom_command(
   OUTPUT ${ocl_blob_file}
+  COMMAND mkdir -p ${CMAKE_CURRENT_BINARY_DIR}/${beignet_install_path}
   COMMAND ${PYTHON_EXECUTABLE} ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_blob_file}
-  DEPENDS ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_common_header_file} ${ocl_vector_file} ${ocl_as_file} ${ocl_convert_file})
-
+  DEPENDS ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_common_header_file} ${ocl_vector_file} ${ocl_as_file} ${ocl_convert_file}
+  )
 
 set (pch_object ${ocl_blob_file}.pch)
+set (local_pch_object ${ocl_blob_file}.local.pch)
 # generate pch object
 if (LLVM_VERSION_NODOT VERSION_GREATER 32)
-    set (clang_cmd -cc1 -x cl -triple spir -ffp-contract=off -emit-pch)
+    set (clang_cmd -cc1 -x cl -triple spir -ffp-contract=off)
 else (LLVM_VERSION_NODOT VERSION_GREATER 32)
     if (LLVM_VERSION_NODOT VERSION_GREATER 31)
-        set (clang_cmd -cc1 -x cl -triple nvptx -ffp-contract=off -emit-pch)
+        set (clang_cmd -cc1 -x cl -triple nvptx -ffp-contract=off)
     else (LLVM_VERSION_NODOT VERSION_GREATER 31)
-        set (clang_cmd -cc1 -x cl -triple ptx32 -emit-pch)
+        set (clang_cmd -cc1 -x cl -triple ptx32)
     endif (LLVM_VERSION_NODOT VERSION_GREATER 31)
 endif (LLVM_VERSION_NODOT VERSION_GREATER 32)
 set (clang_cmd ${clang_cmd} -fno-builtin -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
@@ -52,20 +58,40 @@ set (clang_cmd ${clang_cmd} -fno-builtin -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
 add_custom_command(
      OUTPUT ${pch_object}
      COMMAND rm -f ${pch_object}
-     COMMAND clang ${clang_cmd} ${ocl_blob_file} -o ${pch_object}
+     COMMAND clang ${clang_cmd} --relocatable-pch -emit-pch -isysroot ${CMAKE_CURRENT_BINARY_DIR} ${ocl_blob_file} -o ${pch_object}
+     COMMAND clang ${clang_cmd} -emit-pch ${ocl_blob_file} -o ${local_pch_object}
      DEPENDS ${ocl_blob_file}
      )
 
 add_custom_target(pch_object
                   DEPENDS ${pch_object})
 
+macro(ll_add_library ll_lib ll_sources)
+  foreach (ll ${${ll_sources}})
+  add_custom_command(
+       OUTPUT  ${ll}.bc
+       COMMAND rm -f ${ll}.bc
+       COMMAND llvm-as -o ${ll}.bc ${GBE_SOURCE_DIR}/src/${ll}
+       DEPENDS ${ll}
+       )
+  set (ll_objects ${ll_objects} ${ll}.bc)
+  endforeach (ll ${ll_sources})
+  add_custom_command(
+       OUTPUT ${ll_lib}
+       COMMAND llvm-link -o ${ll_lib} ${ll_objects}
+       DEPENDS ${ll_objects}
+       )
+  add_custom_target(${ll_lib}
+                    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${ll_lib})
+endmacro(ll_add_library)
+
 if (GBE_USE_BLOB)
   set (GBE_SRC
        blob.cpp
        backend/gen/gen_mesa_disasm.c)
 else (GBE_USE_BLOB)
   set (GBE_SRC
-    ocl_stdlib.h
+    ${ocl_blob_file}
     ocl_stdlib_str.cpp  # this file is auto-generated.
     sys/vector.hpp
     sys/hash_map.hpp
@@ -117,6 +143,7 @@ else (GBE_USE_BLOB)
     llvm/llvm_gen_backend.cpp
     llvm/llvm_passes.cpp
     llvm/llvm_scalarize.cpp
+    llvm/llvm_intrinsic_lowering.cpp
     llvm/llvm_to_gen.cpp
     llvm/llvm_gen_backend.hpp
     llvm/llvm_gen_ocl_function.hxx
@@ -142,9 +169,14 @@ endif (GBE_USE_BLOB)
 include_directories (.)
 link_directories (${LLVM_LIBRARY_DIRS})
 include_directories(${LLVM_INCLUDE_DIRS})
-add_library (gbe SHARED ${GBE_SRC})
+add_library (gbe STATIC ${GBE_SRC})
+
+# for pre compiled module library.
+set (pcm_lib "beignet.bc")
+set (pcm_sources ocl_barrier.ll ocl_memset.ll ocl_memcpy.ll)
+ll_add_library (${pcm_lib} pcm_sources)
 
-ADD_DEPENDENCIES (gbe pch_object)
+ADD_DEPENDENCIES (gbe pch_object ${pcm_lib})
 target_link_libraries(
                       gbe
                       ${DRM_INTEL_LIBRARY}
@@ -152,6 +184,7 @@ target_link_libraries(
                       ${OPENGL_LIBRARIES}
                       ${CLANG_LIBRARIES}
                       ${LLVM_MODULE_LIBS}
+                      ${LLVM_SYSTEM_LIBS}
                       ${CMAKE_THREAD_LIBS_INIT}
                       ${CMAKE_DL_LIBS})
 
@@ -159,11 +192,18 @@ link_directories (${LLVM_LIBRARY_DIR})
 ADD_EXECUTABLE(gbe_bin_generater gbe_bin_generater.cpp)
 TARGET_LINK_LIBRARIES(gbe_bin_generater gbe)
 
-install (TARGETS gbe LIBRARY DESTINATION lib)
-install (FILES ${pch_object} DESTINATION lib)
-install (FILES backend/program.h DESTINATION include/gen)
+#install (TARGETS gbe LIBRARY DESTINATION lib)
+#install (FILES backend/program.h DESTINATION include/gen)
+install (FILES ${ocl_blob_file} DESTINATION ${LIB_INSTALL_DIR}/beignet)
+install (FILES ${pch_object} DESTINATION ${LIB_INSTALL_DIR}/beignet)
+install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${pcm_lib} DESTINATION ${LIB_INSTALL_DIR}/beignet)
+# When building beignet itself, we need to export the local precompiled header file and precompiled module
+# file to libcl and utests.
+set (LOCAL_PCH_OBJECT_DIR "${local_pch_object}:${beignet_install_path}/ocl_stdlib.h.pch" PARENT_SCOPE)
+set (LOCAL_PCM_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/${pcm_lib}:${beignet_install_path}/${pcm_lib}" PARENT_SCOPE)
 
-set (PCH_OBJECT_DIR "${pch_object};${CMAKE_INSTALL_PREFIX}/lib/ocl_stdlib.h.pch")
+set (PCH_OBJECT_DIR "${beignet_install_path}/ocl_stdlib.h.pch")
+set (PCM_OBJECT_DIR "${beignet_install_path}/${pcm_lib}")
 configure_file (
   "GBEConfig.h.in"
   "GBEConfig.h"
diff --git a/backend/src/GBEConfig.h.in b/backend/src/GBEConfig.h.in
index 74bef3f..5bc09b8 100644
--- a/backend/src/GBEConfig.h.in
+++ b/backend/src/GBEConfig.h.in
@@ -2,3 +2,4 @@
 #define LIBGBE_VERSION_MAJOR @LIBGBE_VERSION_MAJOR@
 #define LIBGBE_VERSION_MINOR @LIBGBE_VERSION_MINOR@
 #define PCH_OBJECT_DIR "@PCH_OBJECT_DIR@"
+#define PCM_OBJECT_DIR "@PCM_OBJECT_DIR@"
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 25d4f9c..2125bd1 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -389,7 +389,12 @@ namespace gbe
       return;
     // Be sure that the stack pointer is set
     GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0);
-    this->kernel->stackSize = 1*KB; // XXX compute that in a better way
+    uint32_t stackSize = 1*KB;
+    while (stackSize < fn.getStackSize()) {
+      stackSize <<= 1;
+      GBE_ASSERT(stackSize <= 64*KB);
+    }
+    this->kernel->stackSize = stackSize;
   }
 
   uint32_t Context::newCurbeEntry(gbe_curbe_type value,
@@ -409,7 +414,7 @@ namespace gbe
   {
     int32_t offset = fn.getImageSet()->getInfoOffset(key);
     if (offset >= 0)
-      return offset;
+      return offset + GEN_REG_SIZE;
     newCurbeEntry(GBE_CURBE_IMAGE_INFO, key.data, size, 4);
     std::sort(kernel->patches.begin(), kernel->patches.end());
 
@@ -430,6 +435,9 @@ namespace gbe
 
     // We insert the block IP mask first
     this->insertCurbeReg(ir::ocl::blockip, this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t)));
+    this->insertCurbeReg(ir::ocl::emask, this->newCurbeEntry(GBE_CURBE_EMASK, 0,  sizeof(uint32_t)));
+    this->insertCurbeReg(ir::ocl::notemask, this->newCurbeEntry(GBE_CURBE_NOT_EMASK, 0, sizeof(uint32_t)));
+    this->insertCurbeReg(ir::ocl::barriermask, this->newCurbeEntry(GBE_CURBE_BARRIER_MASK, 0, sizeof(uint32_t)));
 
     // Go over the arguments and find the related patch locations
     const uint32_t argNum = fn.argNum();
@@ -451,7 +459,6 @@ namespace gbe
     insertCurbeReg(ir::ocl::lid0, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_X, 0, localIDSize));
     insertCurbeReg(ir::ocl::lid1, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Y, 0, localIDSize));
     insertCurbeReg(ir::ocl::lid2, this->newCurbeEntry(GBE_CURBE_LOCAL_ID_Z, 0, localIDSize));
-            insertCurbeReg(ir::ocl::samplerinfo, this->newCurbeEntry(GBE_CURBE_SAMPLER_INFO, 0, 32));
 
     // Go over all the instructions and find the special register we need
     // to push
@@ -468,26 +475,20 @@ namespace gbe
         const ir::Register reg = insn.getSrc(srcID);
         if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) {
           if (srcID != 0) continue;
-          const unsigned char bti = fn.getImageSet()->getIdx(insn.getSrc(srcID));
+          const unsigned char bti = ir::cast<ir::GetImageInfoInstruction>(insn).getImageIndex();
           const unsigned char type =  ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();;
-          ir::ImageInfoKey key;
-          key.index = bti;
-          key.type = type;
-          const ir::Register imageInfo(key.data | 0x8000);
-          ir::Register realImageInfo;
+          ir::ImageInfoKey key(bti, type);
+          const ir::Register imageInfo = insn.getSrc(0);
           if (curbeRegs.find(imageInfo) == curbeRegs.end()) {
             uint32_t offset = this->getImageInfoCurbeOffset(key, 4);
-            realImageInfo = insn.getSrc(1);
-            insertCurbeReg(realImageInfo, offset);
-            insertCurbeReg(imageInfo, (uint32_t)realImageInfo);
-          } else
-            realImageInfo = ir::Register(curbeRegs.find(imageInfo)->second);
-          insn.setSrc(srcID, realImageInfo);
+            insertCurbeReg(imageInfo, offset);
+          }
           continue;
         } else if (insn.getOpcode() == ir::OP_GET_SAMPLER_INFO) {
           /* change the src to sampler information register. */
-          if (curbeRegs.find(ir::ocl::samplerinfo) == curbeRegs.end())
-            insertCurbeReg(ir::ocl::samplerinfo, this->newCurbeEntry(GBE_CURBE_SAMPLER_INFO, 0, 32));
+          GBE_ASSERT(insn.getSrc(0) == ir::ocl::samplerinfo);
+          if (curbeRegs.find(insn.getSrc(0)) == curbeRegs.end())
+            insertCurbeReg(insn.getSrc(0), this->newCurbeEntry(GBE_CURBE_SAMPLER_INFO, 0, 32));
           continue;
         }
         if (fn.isSpecialReg(reg) == false) continue;
@@ -512,9 +513,6 @@ namespace gbe
     });
 #undef INSERT_REG
 
-    // Insert the number of threads
-    insertCurbeReg(ir::ocl::threadn, this->newCurbeEntry(GBE_CURBE_THREAD_NUM, 0, sizeof(uint32_t)));
-
     // Insert the stack buffer if used
     if (useStackPtr)
       insertCurbeReg(ir::ocl::stackptr, this->newCurbeEntry(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER, ptrSize));
@@ -534,6 +532,8 @@ namespace gbe
       kernel->args = NULL;
     for (uint32_t argID = 0; argID < kernel->argNum; ++argID) {
       const auto &arg = fn.getArg(argID);
+
+      kernel->args[argID].align = arg.align;
       switch (arg.type) {
         case ir::FunctionArgument::VALUE:
         case ir::FunctionArgument::STRUCTURE:
@@ -684,7 +684,11 @@ namespace gbe
         reg == ir::ocl::goffset0  ||
         reg == ir::ocl::goffset1  ||
         reg == ir::ocl::goffset2  ||
-        reg == ir::ocl::workdim)
+        reg == ir::ocl::workdim   ||
+        reg == ir::ocl::emask     ||
+        reg == ir::ocl::notemask  ||
+        reg == ir::ocl::barriermask
+      )
       return true;
     return false;
   }
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index f911e7c..1f5adc9 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -65,6 +65,8 @@ static const struct {
   [GEN_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
   [GEN_OPCODE_FBH] = { .name = "fbh", .nsrc = 1, .ndst = 1 },
   [GEN_OPCODE_FBL] = { .name = "fbl", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_F16TO32] = { .name = "f16to32", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_F32TO16] = { .name = "f32to16", .nsrc = 1, .ndst = 1 },
 
   [GEN_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 },
   [GEN_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 },
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 43b3bc7..d72b19b 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -33,6 +33,8 @@
 #include "ir/function.hpp"
 #include "sys/cvar.hpp"
 #include <cstring>
+#include <iostream>
+#include <iomanip>
 
 namespace gbe
 {
@@ -91,15 +93,50 @@ namespace gbe
   void GenContext::clearFlagRegister(void) {
     // when group size not aligned to simdWidth, flag register need clear to
     // make prediction(any8/16h) work correctly
+    const GenRegister emaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::emask));
+    const GenRegister notEmaskReg = ra->genReg(GenRegister::uw1grf(ir::ocl::notemask));
+    uint32_t execWidth = p->curr.execWidth; // saved so the CMP below can run at the original width
     p->push();
     p->curr.predicate = GEN_PREDICATE_NONE;
     p->curr.noMask = 1;
+    /* clear all the bits in f0.0. */
     p->curr.execWidth = 1;
-    p->MOV(GenRegister::retype(GenRegister::flag(0,0), GEN_TYPE_UD), GenRegister::immud(0x0));
-    p->MOV(GenRegister::retype(GenRegister::flag(1,0), GEN_TYPE_UD), GenRegister::immud(0x0));
+    p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), GenRegister::immuw(0x0000));
+    /* switch back to masked execution at the saved width, with f0.0 as the flag. */
+    p->curr.noMask = 0;
+    p->curr.useFlag(0, 0);
+    p->curr.execWidth = execWidth;
+    /* set the flag bit of every active lane to 1. Inactive lanes remain 0. */
+    p->CMP(GEN_CONDITIONAL_EQ, GenRegister::ud16grf(126, 0), GenRegister::ud16grf(126, 0));
+    p->curr.noMask = 1;
+    p->curr.execWidth = 1;
+    p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW)); // emask = the flag word just computed
+    p->XOR(notEmaskReg, emaskReg, GenRegister::immuw(0xFFFF)); // notemask = emask with all 16 bits inverted
+    p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), notEmaskReg); // barriermask starts out equal to notemask
     p->pop();
   }
 
+  //Returns the flag register to use for flagReg. Each emit function should use only one flag register; otherwise it must handle the case where both end up on f0.1.
+  GenRegister GenContext::checkFlagRegister(GenRegister flagReg) {
+    uint32_t nr=0, subnr=0;
+    if(flagReg.file == GEN_ARCHITECTURE_REGISTER_FILE) {
+      assert(flagReg.nr >= GEN_ARF_FLAG && flagReg.nr < GEN_ARF_MASK);
+      return flagReg;
+    }
+
+    //flagReg is not an architecture register here, so f0.1 is used instead and must not be in use.
+    //Only curr of the GenInstructionState stack is checked, which seems to be enough for now.
+    //The other GenInstructionState entries in the stack should be checked too if needed in future.
+    if(p->curr.predicate == GEN_PREDICATE_NORMAL) {
+      nr = p->curr.flag;
+      subnr = p->curr.subFlag;
+      //TODO: Add mov to save/restore if f0.1 is in use. NOTE(review): this tests subnr == 2 while flag(0, 1) is returned below -- confirm the subFlag encoding.
+      assert(!(nr == 0 && subnr == 2));
+    }
+
+    return GenRegister::flag(0, 1);
+  }
+
   void GenContext::emitStackPointer(void) {
     using namespace ir;
 
@@ -158,6 +195,8 @@ namespace gbe
       case SEL_OP_RNDU: p->RNDU(dst, src); break;
       case SEL_OP_RNDE: p->RNDE(dst, src); break;
       case SEL_OP_RNDZ: p->RNDZ(dst, src); break;
+      case SEL_OP_F16TO32: p->F16TO32(dst, src); break;
+      case SEL_OP_F32TO16: p->F32TO16(dst, src); break;
       case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src.value.i64); break;
       case SEL_OP_CONVI64_TO_I:
        {
@@ -169,7 +208,7 @@ namespace gbe
           p->curr.chooseNib(i);
           p->MOV(xdst, xsrc);
           xdst = GenRegister::suboffset(xdst, 4);
-          xsrc = GenRegister::suboffset(xsrc, 8);
+          xsrc = GenRegister::suboffset(xsrc, 4);
         }
         p->pop();
         break;
@@ -189,20 +228,6 @@ namespace gbe
       case SEL_OP_MOV_DF:
         p->MOV_DF(dst, src, tmp);
         break;
-      case SEL_OP_CONVF_TO_I64:
-       {
-        tmp.type = GEN_TYPE_F;
-        GenRegister d = GenRegister::retype(tmp, GEN_TYPE_D);
-        float c = (1.f / 65536.f) * (1.f / 65536.f);
-        p->MUL(tmp, src, GenRegister::immf(c));
-        p->RNDZ(tmp, tmp);
-        p->MOV(d, tmp);
-        storeTopHalf(dst, d);
-        d.type = GEN_TYPE_UD;
-        p->MOV(d, GenRegister::abs(src));
-        storeBottomHalf(dst, d);
-        break;
-       }
       case SEL_OP_CONVI_TO_I64: {
         GenRegister middle;
         if (src.type == GEN_TYPE_B || src.type == GEN_TYPE_D) {
@@ -509,7 +534,7 @@ namespace gbe
     GenRegister g = ra->genReg(insn.dst(7));
     GenRegister h = ra->genReg(insn.dst(8));
     GenRegister i = ra->genReg(insn.dst(9));
-    GenRegister flagReg = ra->genReg(insn.dst(10));
+    GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(10)));
     loadTopHalf(a, x);
     loadBottomHalf(b, x);
     loadTopHalf(c, y);
@@ -555,7 +580,7 @@ namespace gbe
     GenRegister g = ra->genReg(insn.dst(7));
     GenRegister h = ra->genReg(insn.dst(8));
     GenRegister i = ra->genReg(insn.dst(9));
-    GenRegister flagReg = ra->genReg(insn.dst(10));
+    GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(10)));
     GenRegister zero = GenRegister::immud(0), one = GenRegister::immud(1);
     loadTopHalf(a, x);
     loadBottomHalf(b, x);
@@ -736,7 +761,7 @@ namespace gbe
     GenRegister e = ra->genReg(insn.dst(5));
     GenRegister f = ra->genReg(insn.dst(6));
     a.type = b.type = c.type = d.type = e.type = f.type = GEN_TYPE_UD;
-    GenRegister flagReg = ra->genReg(insn.dst(7));
+    GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(7)));
     GenRegister zero = GenRegister::immud(0);
     switch(insn.opcode) {
       case SEL_OP_I64SHL:
@@ -811,12 +836,13 @@ namespace gbe
         p->SEL(d, d, e);
         p->curr.predicate = GEN_PREDICATE_NONE;
         p->AND(a, a, GenRegister::immud(32));
+        p->ASR(f, f, GenRegister::immd(31));
         p->MOV(flagReg, GenRegister::immuw(0xFFFF));
         p->curr.predicate = GEN_PREDICATE_NORMAL;
         p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
         p->CMP(GEN_CONDITIONAL_Z, a, zero);
         p->SEL(d, d, c);
-        p->SEL(c, c, GenRegister::immd(-1));
+        p->SEL(c, c, f);
         p->pop();
         storeBottomHalf(dest, d);
         storeTopHalf(dest, c);
@@ -833,12 +859,95 @@ namespace gbe
     p->pop();
   }
 
-  void GenContext::UnsignedI64ToFloat(GenRegister dst, GenRegister high, GenRegister low, GenRegister tmp) {
-    p->MOV(dst, high);
-    p->MUL(dst, dst, GenRegister::immf(65536.f * 65536.f));
-    tmp.type = GEN_TYPE_F;
-    p->MOV(tmp, low);
-    p->ADD(dst, dst, tmp);
+  void GenContext::UnsignedI64ToFloat(GenRegister dst, GenRegister high, GenRegister low, GenRegister exp,
+                                            GenRegister mantissa, GenRegister tmp, GenRegister flag) { //emits the high:low -> dst conversion; exp, mantissa, tmp, high and low are all overwritten
+    uint32_t jip0, jip1;
+    GenRegister dst_ud = GenRegister::retype(dst, GEN_TYPE_UD);
+    p->FBH(exp, high);
+    p->ADD(exp, GenRegister::negate(exp), GenRegister::immud(31));  //exp = 32 when high == 0
+    p->push();
+      p->curr.useFlag(flag.flag_nr(), flag.flag_subnr());
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(32));   //high == 0
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(dst, low);
+      p->push();
+        if (simdWidth == 8)
+          p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+        else if (simdWidth == 16)
+          p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+        else
+          NOT_IMPLEMENTED;
+        p->curr.execWidth = 1;
+        p->curr.noMask = 1;
+        jip0 = p->n_instruction();
+        p->JMPI(GenRegister::immud(0)); //distance is patched at the end of this function (jip0)
+      p->pop();
+
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_G, exp, GenRegister::immud(23));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->CMP(GEN_CONDITIONAL_L, exp, GenRegister::immud(32));  //exp>23 && high!=0
+      p->ADD(tmp, exp, GenRegister::immud(-23));
+      p->SHR(mantissa, high, tmp);
+      p->AND(mantissa, mantissa, GenRegister::immud(0x7fffff));
+      p->SHR(dst_ud, low, tmp);   //dst is a temp register here
+      p->ADD(tmp, GenRegister::negate(tmp), GenRegister::immud(32));
+      p->SHL(high, high, tmp);
+      p->OR(high, high, dst_ud);
+      p->SHL(low, low, tmp);
+      p->push();
+        if (simdWidth == 8)
+          p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+        else if (simdWidth == 16)
+          p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+        else
+          NOT_IMPLEMENTED;
+        p->curr.execWidth = 1;
+        p->curr.noMask = 1;
+        jip1 = p->n_instruction();
+        p->JMPI(GenRegister::immud(0)); //distance is patched just before the exponent/mantissa merge (jip1)
+      p->pop();
+
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(23));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(dst_ud, GenRegister::immud(0));   //exp==23: the exp+9 shift used below would be 32, so the result is 0
+
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_L, exp, GenRegister::immud(23));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->ADD(tmp, exp, GenRegister::immud(9));
+      p->SHR(dst_ud, low, tmp);   //dst is a temp register here
+
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_LE, exp, GenRegister::immud(23));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->ADD(tmp, GenRegister::negate(exp), GenRegister::immud(23));
+      p->SHL(mantissa, high, tmp);
+      p->OR(mantissa, mantissa, dst_ud);
+      p->AND(mantissa, mantissa, GenRegister::immud(0x7fffff));
+      p->SHL(high, low, tmp);
+      p->MOV(low, GenRegister::immud(0));
+
+      p->patchJMPI(jip1, (p->n_instruction() - (jip1 + 1)) * 2);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_LE, exp, GenRegister::immud(31));  //update dst where high != 0
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->ADD(exp, exp, GenRegister::immud(159)); //159 == 127 + 32: presumably the float bias plus the high word's bit offset
+      p->SHL(exp, exp, GenRegister::immud(23));
+      p->OR(dst_ud, exp, mantissa);
+
+      p->CMP(GEN_CONDITIONAL_GE, high, GenRegister::immud(0x80000000)); //NOTE(review): looks like round-up when the leftover bits in high are >= half -- confirm
+      p->ADD(dst_ud, dst_ud, GenRegister::immud(1));
+
+      p->CMP(GEN_CONDITIONAL_EQ, high, GenRegister::immud(0x80000000)); //NOTE(review): looks like the exact-half tie case, clearing bit 0 below -- confirm
+      p->CMP(GEN_CONDITIONAL_EQ, low, GenRegister::immud(0x0));
+      p->AND(dst_ud, dst_ud, GenRegister::immud(0xfffffffe));
+      p->patchJMPI(jip0, (p->n_instruction() - (jip0 + 1)) * 2);
+
+    p->pop();
+
   }
 
   void GenContext::emitI64ToFloatInstruction(const SelectionInstruction &insn) {
@@ -846,17 +955,21 @@ namespace gbe
     GenRegister dest = ra->genReg(insn.dst(0));
     GenRegister high = ra->genReg(insn.dst(1));
     GenRegister low = ra->genReg(insn.dst(2));
-    GenRegister tmp = ra->genReg(insn.dst(3));
-    GenRegister flagReg = ra->genReg(insn.dst(4));
+    GenRegister exp = ra->genReg(insn.dst(3));
+    GenRegister mantissa = ra->genReg(insn.dst(4));
+    GenRegister tmp = ra->genReg(insn.dst(5));
+    GenRegister tmp_high = ra->genReg(insn.dst(6));
+    GenRegister f0 = checkFlagRegister(ra->genReg(insn.dst(7)));
     loadTopHalf(high, src);
     loadBottomHalf(low, src);
     if(!src.is_signed_int()) {
-      UnsignedI64ToFloat(dest, high, low, tmp);
+      UnsignedI64ToFloat(dest, high, low, exp, mantissa, tmp, f0);
     } else {
+      p->MOV(tmp_high, high);
       p->push();
       p->curr.predicate = GEN_PREDICATE_NONE;
-      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
-      p->CMP(GEN_CONDITIONAL_GE, high, GenRegister::immud(0x80000000));
+      p->curr.useFlag(f0.flag_nr(), f0.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_GE, tmp_high, GenRegister::immud(0x80000000));
       p->curr.predicate = GEN_PREDICATE_NORMAL;
       p->NOT(high, high);
       p->NOT(low, low);
@@ -864,15 +977,55 @@ namespace gbe
       addWithCarry(low, low, tmp);
       p->ADD(high, high, tmp);
       p->pop();
-      UnsignedI64ToFloat(dest, high, low, tmp);
+      UnsignedI64ToFloat(dest, high, low, exp, mantissa, tmp, f0);
       p->push();
-      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.useFlag(f0.flag_nr(), f0.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_GE, tmp_high, GenRegister::immud(0x80000000));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
       dest.type = GEN_TYPE_UD;
       p->OR(dest, dest, GenRegister::immud(0x80000000));
       p->pop();
     }
   }
 
+
+  void GenContext::emitFloatToI64Instruction(const SelectionInstruction &insn) { //float src -> 64-bit dst, built as separate high and low 32-bit halves
+    GenRegister src = ra->genReg(insn.src(0));
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister high = ra->genReg(insn.dst(1));
+    GenRegister tmp = ra->genReg(insn.dst(2));
+    GenRegister flag0 = checkFlagRegister(ra->genReg(insn.dst(3)));
+
+    if(dst.is_signed_int())
+      high = GenRegister::retype(high, GEN_TYPE_D);
+    GenRegister low = GenRegister::retype(tmp, GEN_TYPE_UD); //low aliases tmp's register, viewed as UD
+    float c = (1.f / 65536.f) * (1.f / 65536.f);
+    p->MUL(tmp, src, GenRegister::immf(c));
+    p->RNDZ(tmp, tmp);
+    p->MOV(high, tmp);
+    c = 65536.f * 65536.f;
+    p->MOV(tmp, high);  //read high back: the converted value may not equal tmp
+    //the float to int/uint mov saturates, so high * (65536 * 65536) must be subtracted from src
+    p->MUL(tmp, tmp, GenRegister::immf(c));
+    p->ADD(tmp, src, GenRegister::negate(tmp));
+    p->MOV(low, GenRegister::abs(tmp));
+    if(dst.is_signed_int()) {
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.useFlag(flag0.flag_nr(), flag0.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_L, src, GenRegister::immf(0x0)); //flag the lanes whose src is negative
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->CMP(GEN_CONDITIONAL_NEQ, low, GenRegister::immud(0x0)); //NOTE(review): presumably narrows the flag to negative lanes with a non-zero low word -- confirm
+      p->ADD(high, high, GenRegister::immd(-1));
+      p->NOT(low, low);
+      p->ADD(low, low, GenRegister::immud(1));
+      p->pop();
+    }
+    storeTopHalf(dst, high);
+    storeBottomHalf(dst, low);
+  }
+
   void GenContext::emitI64CompareInstruction(const SelectionInstruction &insn) {
     GenRegister src0 = ra->genReg(insn.src(0));
     GenRegister src1 = ra->genReg(insn.src(1));
@@ -882,10 +1035,12 @@ namespace gbe
     tmp0.type = (src0.type == GEN_TYPE_L) ? GEN_TYPE_D : GEN_TYPE_UD;
     tmp1.type = (src1.type == GEN_TYPE_L) ? GEN_TYPE_D : GEN_TYPE_UD;
     int flag = p->curr.flag, subFlag = p->curr.subFlag;
-    GenRegister f1 = GenRegister::retype(tmp2, GEN_TYPE_UW),
-                f2 = GenRegister::suboffset(f1, 1),
-                f3 = GenRegister::suboffset(f1, 2),
-                f4 = GenRegister::suboffset(f1, 3);
+    GenRegister f1 = GenRegister::retype(tmp2, GEN_TYPE_UW);
+                f1.width = GEN_WIDTH_1;
+    GenRegister f2 = GenRegister::suboffset(f1, 1);
+    GenRegister f3 = GenRegister::suboffset(f1, 2);
+    GenRegister f4 = GenRegister::suboffset(f1, 3);
+
     p->push();
     p->curr.predicate = GEN_PREDICATE_NONE;
     saveFlag(f4, flag, subFlag);
@@ -962,7 +1117,7 @@ namespace gbe
     GenRegister c = ra->genReg(insn.dst(3));
     GenRegister d = ra->genReg(insn.dst(4));
     GenRegister e = ra->genReg(insn.dst(5));
-    GenRegister flagReg = ra->genReg(insn.dst(6));
+    GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(6)));
     loadTopHalf(a, x);
     loadBottomHalf(b, x);
     loadTopHalf(c, y);
@@ -1008,7 +1163,7 @@ namespace gbe
     GenRegister c = ra->genReg(insn.dst(3));
     GenRegister d = ra->genReg(insn.dst(4));
     GenRegister e = ra->genReg(insn.dst(5));
-    GenRegister flagReg = ra->genReg(insn.dst(6));
+    GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(6)));
     loadTopHalf(a, x);
     loadBottomHalf(b, x);
     loadTopHalf(c, y);
@@ -1052,10 +1207,10 @@ namespace gbe
     p->curr.predicate = GEN_PREDICATE_NONE;
     p->curr.execWidth = 8;
     p->MOV(dest, src);
-    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 8));
+    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
     if (execWidth == 16) {
-      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 16));
-      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 24));
+      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
+      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
     }
     p->pop();
   }
@@ -1067,13 +1222,13 @@ namespace gbe
     p->curr.execWidth = 8;
     p->MOV(dest, src);
     p->curr.nibControl = 1;
-    p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 4));
+    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
     if (execWidth == 16) {
       p->curr.quarterControl = 1;
       p->curr.nibControl = 0;
-      p->MOV(GenRegister::suboffset(dest, 16), GenRegister::suboffset(src, 8));
+      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
       p->curr.nibControl = 1;
-      p->MOV(GenRegister::suboffset(dest, 24), GenRegister::suboffset(src, 12));
+      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
     }
     p->pop();
   }
@@ -1085,10 +1240,10 @@ namespace gbe
     p->curr.predicate = GEN_PREDICATE_NONE;
     p->curr.execWidth = 8;
     p->MOV(dest, src);
-    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 8));
+    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
     if (execWidth == 16) {
-      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 16));
-      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 24));
+      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
+      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
     }
     p->pop();
   }
@@ -1100,13 +1255,13 @@ namespace gbe
     p->curr.execWidth = 8;
     p->MOV(dest, src);
     p->curr.nibControl = 1;
-    p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 4));
+    p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4));
     if (execWidth == 16) {
       p->curr.quarterControl = 1;
       p->curr.nibControl = 0;
-      p->MOV(GenRegister::suboffset(dest, 16), GenRegister::suboffset(src, 8));
+      p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, 8));
       p->curr.nibControl = 1;
-      p->MOV(GenRegister::suboffset(dest, 24), GenRegister::suboffset(src, 12));
+      p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, 12));
     }
     p->pop();
   }
@@ -1181,8 +1336,8 @@ namespace gbe
     loadBottomHalf(d, y);
     p->push();
     p->curr.predicate = GEN_PREDICATE_NONE;
-    I32FullMult(GenRegister::null(), e, b, c);
-    I32FullMult(GenRegister::null(), f, a, d);
+    I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), e, b, c);
+    I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), f, a, d);
     p->ADD(e, e, f);
     I32FullMult(f, a, b, d);
     p->ADD(e, e, f);
@@ -1208,7 +1363,7 @@ namespace gbe
     GenRegister k = ra->genReg(insn.dst(11));
     GenRegister l = ra->genReg(insn.dst(12));
     GenRegister m = ra->genReg(insn.dst(13));
-    GenRegister flagReg = ra->genReg(insn.dst(14));
+    GenRegister flagReg = checkFlagRegister(ra->genReg(insn.dst(14)));
     GenRegister zero = GenRegister::immud(0),
                 one = GenRegister::immud(1),
                 imm31 = GenRegister::immud(31);
@@ -1302,7 +1457,7 @@ namespace gbe
       p->curr.noMask = 1;
       int jip = -(int)(p->n_instruction() - loop_start + 1) * 2;
       p->JMPI(zero);
-      p->patchJMPI(p->n_instruction()-1, jip);
+      p->patchJMPI(p->n_instruction()-2, jip);
       p->pop();
       // end of loop
     }
@@ -1351,7 +1506,63 @@ namespace gbe
 
   void GenContext::emitBarrierInstruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
+    const GenRegister fenceDst = ra->genReg(insn.dst(0));
+    uint32_t barrierType = insn.extra.barrierType;
+    const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+    GenRegister blockIP;
+    uint32_t exeWidth = p->curr.execWidth;
+    ir::LabelIndex label = insn.parent->bb->getNextBlock()->getLabelIndex(); // label of the block following this one; target of the jump below
+
+    if (exeWidth == 16)
+      blockIP = ra->genReg(GenRegister::uw16grf(ir::ocl::blockip));
+    else if (exeWidth == 8)
+      blockIP = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
+
+    p->push();
+    /* Set block IP to 0xFFFF so that all the instructions after the barrier are skipped
+       if any lane still remains zero. NOTE(review): the flag0 clearing the old comment mentioned is not visible here. */
+    p->MOV(blockIP, GenRegister::immuw(0xFFFF));
+    p->curr.noMask = 1;
+    p->curr.execWidth = 1;
+    this->branchPos2.push_back(std::make_pair(label, p->n_instruction()));
+    if (exeWidth == 16)
+      p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+    else if (exeWidth == 8)
+      p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+    else
+      NOT_IMPLEMENTED;
+    p->curr.inversePredicate = 1;
+    // If not all channels are set to 1, the barrier is still waiting for other lanes to arrive,
+    // so jump to the next basic block.
+    p->JMPI(GenRegister::immud(0));
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->MOV(GenRegister::flag(0, 0), ra->genReg(GenRegister::uw1grf(ir::ocl::emask)));
+    p->pop();
+
+    p->push();
+    p->curr.useFlag(0, 0);
+    /* Restore the blockIP to the current block's label. */
+    p->MOV(blockIP, GenRegister::immuw(insn.parent->bb->getLabelIndex()));
+    if (barrierType == ir::syncGlobalBarrier) {
+      p->FENCE(fenceDst);
+      p->MOV(fenceDst, fenceDst);
+    }
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    // As only payload.2 is used and all the other regions are ignored,
+    // SIMD8 mode is safe here.
+    p->curr.execWidth = 8;
+    p->curr.physicalFlag = 0;
+    p->curr.noMask = 1;
+    // Copy the barrier id from r0.
+    p->AND(src, barrierId, GenRegister::immud(0x0f000000));
+    // A barrier is OK to start the thread synchronization *and* SLM fence
     p->BARRIER(src);
+    // Now we wait for the other threads
+    p->curr.execWidth = 1;
+    p->WAIT();
+    // The barrier has been executed, so restore the barrier soft mask to its initial value (notemask).
+    p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), ra->genReg(GenRegister::uw1grf(ir::ocl::notemask)));
+    p->pop();
   }
 
   void GenContext::emitFenceInstruction(const SelectionInstruction &insn) {
@@ -1451,8 +1662,9 @@ namespace gbe
     GenRegister payload = src;
     payload.nr = header + 1;
     payload.subnr = 0;
-
-    p->MOV(payload, src);
+    GBE_ASSERT(src.subnr == 0);
+    if (payload.nr != src.nr)
+      p->MOV(payload, src);
     uint32_t regType = insn.src(0).type;
     uint32_t size = typeSize(regType);
     assert(size <= 4);
@@ -1541,11 +1753,10 @@ namespace gbe
   void GenContext::emitSampleInstruction(const SelectionInstruction &insn) {
     const GenRegister dst = ra->genReg(insn.dst(0));
     const GenRegister msgPayload = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_F);
-    const unsigned char bti = insn.extra.function;
-    const unsigned char sampler = insn.extra.elem;
+    const unsigned char bti = insn.extra.rdbti;
+    const unsigned char sampler = insn.extra.sampler;
     const GenRegister ucoord = ra->genReg(insn.src(4));
     const GenRegister vcoord = ra->genReg(insn.src(5));
-    const GenRegister wcoord = ra->genReg(insn.src(6));
     uint32_t simdWidth = p->curr.execWidth;
     uint32_t coord_cnt = 2;
     p->push();
@@ -1555,8 +1766,8 @@ namespace gbe
     /* Prepare message payload. */
     p->MOV(GenRegister::f8grf(nr , 0), ucoord);
     p->MOV(GenRegister::f8grf(nr + (simdWidth/8), 0), vcoord);
-    if (insn.src(6).reg() != 0) {
-      p->MOV(GenRegister::f8grf(nr + (simdWidth/4), 0), wcoord);
+    if (insn.extra.is3DRead) {
+      p->MOV(GenRegister::f8grf(nr + (simdWidth/4), 0), ra->genReg(insn.src(6)));
       coord_cnt++;
     }
     p->SAMPLE(dst, msgPayload, false, bti, sampler, coord_cnt, simdWidth, -1, 0);
@@ -1596,14 +1807,13 @@ namespace gbe
 
   void GenContext::emitTypedWriteInstruction(const SelectionInstruction &insn) {
     const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
-    const GenRegister ucoord = ra->genReg(insn.src(insn.extra.elem));
-    const GenRegister vcoord = ra->genReg(insn.src(1 + insn.extra.elem));
-    const GenRegister wcoord = ra->genReg(insn.src(2 + insn.extra.elem));
-    const GenRegister R = ra->genReg(insn.src(3 + insn.extra.elem));
-    const GenRegister G = ra->genReg(insn.src(4 + insn.extra.elem));
-    const GenRegister B = ra->genReg(insn.src(5 + insn.extra.elem));
-    const GenRegister A = ra->genReg(insn.src(6 + insn.extra.elem));
-    const unsigned char bti = insn.extra.function;
+    const GenRegister ucoord = ra->genReg(insn.src(insn.extra.msglen));
+    const GenRegister vcoord = ra->genReg(insn.src(1 + insn.extra.msglen));
+    const GenRegister R = ra->genReg(insn.src(3 + insn.extra.msglen));
+    const GenRegister G = ra->genReg(insn.src(4 + insn.extra.msglen));
+    const GenRegister B = ra->genReg(insn.src(5 + insn.extra.msglen));
+    const GenRegister A = ra->genReg(insn.src(6 + insn.extra.msglen));
+    const unsigned char bti = insn.extra.bti;
 
     p->push();
     uint32_t simdWidth = p->curr.execWidth;
@@ -1641,8 +1851,8 @@ namespace gbe
         p->curr.quarterControl = GEN_COMPRESSION_Q2;
       QUARTER_MOV0(nr + 1, ucoord);
       QUARTER_MOV0(nr + 2, vcoord);
-      if (insn.src(2 + insn.extra.elem).reg() != 0)
-        QUARTER_MOV0(nr + 3, wcoord);
+      if (insn.extra.is3DWrite)
+        QUARTER_MOV0(nr + 3, ra->genReg(insn.src(2 + insn.extra.msglen)));
       QUARTER_MOV1(nr + 5, R);
       QUARTER_MOV1(nr + 6, G);
       QUARTER_MOV1(nr + 7, B);
@@ -1653,23 +1863,6 @@ namespace gbe
     p->pop();
   }
 
-  void GenContext::emitGetImageInfoInstruction(const SelectionInstruction &insn) {
-    const unsigned char bti = insn.extra.function;
-    const unsigned char type = insn.extra.elem;
-    const uint32_t dstNum = ir::GetImageInfoInstruction::getDstNum4Type(type);
-    ir::ImageInfoKey key;
-    key.index = bti;
-    key.type = type;
-
-    uint32_t offset = this->getImageInfoCurbeOffset(key, dstNum * 4) + GEN_REG_SIZE;
-    for(uint32_t i = 0; i < dstNum; i++) {
-      const uint32_t nr = offset / GEN_REG_SIZE;
-      const uint32_t subnr = (offset % GEN_REG_SIZE) / sizeof(uint32_t);
-      p->MOV(ra->genReg(insn.dst(i)), GenRegister::ud1grf(nr, subnr));
-      offset += 32;
-    }
-  }
-
   BVAR(OCL_OUTPUT_REG_ALLOC, false);
   BVAR(OCL_OUTPUT_ASM, false);
   bool GenContext::emitCode(void) {
@@ -1688,9 +1881,20 @@ namespace gbe
     genKernel->insnNum = p->store.size();
     genKernel->insns = GBE_NEW_ARRAY_NO_ARG(GenInstruction, genKernel->insnNum);
     std::memcpy(genKernel->insns, &p->store[0], genKernel->insnNum * sizeof(GenInstruction));
-    if (OCL_OUTPUT_ASM)
-      for (uint32_t insnID = 0; insnID < genKernel->insnNum; ++insnID)
+    if (OCL_OUTPUT_ASM) {
+      std::cout << genKernel->getName() << "'s disassemble begin:" << std::endl;
+      ir::LabelIndex curLabel = (ir::LabelIndex)0;
+      std::cout << "  L0:" << std::endl;
+      for (uint32_t insnID = 0; insnID < genKernel->insnNum; ++insnID) {
+        if (labelPos.find((ir::LabelIndex)(curLabel + 1))->second == insnID) {
+          std::cout << "  L" << curLabel + 1 << ":" << std::endl;
+          curLabel = (ir::LabelIndex)(curLabel + 1);
+        }
+        std::cout << "    (" << std::setw(8) << insnID * 2 << ")  ";
         gen_disasm(stdout, &p->store[insnID]);
+      }
+      std::cout << genKernel->getName() << "'s disassemble end." << std::endl;
+    }
     return true;
   }
 
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 10e0603..6cfc295 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -62,6 +62,8 @@ namespace gbe
     /*! Simd width chosen for the current function */
     INLINE uint32_t getSimdWidth(void) const { return simdWidth; }
     void clearFlagRegister(void);
+    /*! Check the flag register; if it is a GRF register, use f0.1 instead */
+    GenRegister checkFlagRegister(GenRegister flagReg);
     /*! Emit the per-lane stack pointer computation */
     void emitStackPointer(void);
     /*! Emit the instructions */
@@ -76,6 +78,19 @@ namespace gbe
     INLINE const ir::Liveness::LiveOut &getLiveOut(const ir::BasicBlock *bb) const {
       return this->liveness->getLiveOut(bb);
     }
+    /*! Get the LiveIn information (an ir::Liveness::UEVar) for the given block, as held by the liveness object */
+    INLINE const ir::Liveness::UEVar &getLiveIn(const ir::BasicBlock *bb) const {
+      return this->liveness->getLiveIn(bb);
+    }
+
+    /*! Get the extra LiveOut information (an ir::Liveness::LiveOut) for the given block, as held by the liveness object */
+    INLINE const ir::Liveness::LiveOut &getExtraLiveOut(const ir::BasicBlock *bb) const {
+      return this->liveness->getExtraLiveOut(bb);
+    }
+    /*! Get the extra LiveIn information (an ir::Liveness::UEVar) for the given block, as held by the liveness object */
+    INLINE const ir::Liveness::UEVar &getExtraLiveIn(const ir::BasicBlock *bb) const {
+      return this->liveness->getExtraLiveIn(bb);
+    }
 
     void collectShifter(GenRegister dest, GenRegister src);
     void loadTopHalf(GenRegister dest, GenRegister src);
@@ -92,7 +107,7 @@ namespace gbe
     void I32FullMult(GenRegister high, GenRegister low, GenRegister src0, GenRegister src1);
     void I64FullMult(GenRegister dst1, GenRegister dst2, GenRegister dst3, GenRegister dst4, GenRegister x_high, GenRegister x_low, GenRegister y_high, GenRegister y_low);
     void saveFlag(GenRegister dest, int flag, int subFlag);
-    void UnsignedI64ToFloat(GenRegister dst, GenRegister high, GenRegister low, GenRegister tmp);
+    void UnsignedI64ToFloat(GenRegister dst, GenRegister high, GenRegister low, GenRegister exp, GenRegister mantissa, GenRegister tmp, GenRegister flag);
 
     /*! Final Gen ISA emission helper functions */
     void emitLabelInstruction(const SelectionInstruction &insn);
@@ -110,6 +125,7 @@ namespace gbe
     void emitI64SATADDInstruction(const SelectionInstruction &insn);
     void emitI64SATSUBInstruction(const SelectionInstruction &insn);
     void emitI64ToFloatInstruction(const SelectionInstruction &insn);
+    void emitFloatToI64Instruction(const SelectionInstruction &insn);
     void emitCompareInstruction(const SelectionInstruction &insn);
     void emitJumpInstruction(const SelectionInstruction &insn);
     void emitIndirectMoveInstruction(const SelectionInstruction &insn);
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index 27ce58c..ffa38c0 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -125,6 +125,8 @@ enum opcode {
   GEN_OPCODE_ASR = 12,
   GEN_OPCODE_CMP = 16,
   GEN_OPCODE_CMPN = 17,
+  GEN_OPCODE_F32TO16 = 19,
+  GEN_OPCODE_F16TO32 = 20,
   GEN_OPCODE_JMPI = 32,
   GEN_OPCODE_IF = 34,
   GEN_OPCODE_IFF = 35,
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index b0cc931..aaf7dce 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -908,7 +908,7 @@ namespace gbe
       curr.execWidth = 8;
       curr.predicate = GEN_PREDICATE_NONE;
       MOV(r0, src0);
-      MOV(GenRegister::suboffset(r0, 8), GenRegister::suboffset(src0, 4));
+      MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 4));
       curr.predicate = GEN_PREDICATE_NORMAL;
       curr.quarterControl = 0;
       curr.nibControl = 0;
@@ -921,7 +921,7 @@ namespace gbe
         curr.execWidth = 8;
         curr.predicate = GEN_PREDICATE_NONE;
         MOV(r0, GenRegister::suboffset(src0, 8));
-        MOV(GenRegister::suboffset(r0, 8), GenRegister::suboffset(src0, 12));
+        MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 12));
         curr.predicate = GEN_PREDICATE_NORMAL;
         curr.quarterControl = 1;
         curr.nibControl = 0;
@@ -940,6 +940,8 @@ namespace gbe
   ALU1(RNDU)
   ALU1(FBH)
   ALU1(FBL)
+  ALU1(F16TO32)
+  ALU1(F32TO16)
   ALU2(SEL)
   ALU1(NOT)
   ALU2(AND)
@@ -1048,13 +1050,42 @@ namespace gbe
 
   void GenEncoder::JMPI(GenRegister src) {
     alu2(this, GEN_OPCODE_JMPI, GenRegister::ip(), GenRegister::ip(), src);
+    NOP();
   }
 
   void GenEncoder::patchJMPI(uint32_t insnID, int32_t jumpDistance) {
     GenInstruction &insn = this->store[insnID];
-    assert(insnID < this->store.size());
-    assert(insn.header.opcode == GEN_OPCODE_JMPI);
-    this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+    GBE_ASSERT(insnID < this->store.size());
+    GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI);
+    if ( jumpDistance > -32769 && jumpDistance < 32768 ) {
+        this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+    } else if ( insn.header.predicate_control == GEN_PREDICATE_NONE ) {
+      // When the jump distance is out of the S15 range, a conditional jump
+      // has to be implemented as an inverted jmp followed by an
+      // "add ip, ip, distance". This is a little hacky, as we need to change
+      // the nop instruction into an add instruction manually.
+      // If this is an unconditional jump, we just need to add to the IP directly.
+      // FIXME: a possible optimization is to insert the ADD instruction
+      // only on demand. But that needs some extra analysis of all the
+      // branching instructions, and the distance must be adjusted for
+      // every branch instruction whose start point and end point span
+      // this instruction.
+      insn.header.opcode = GEN_OPCODE_ADD;
+      this->setDst(&insn, GenRegister::ip());
+      this->setSrc0(&insn, GenRegister::ip());
+      this->setSrc1(&insn, GenRegister::immd((jumpDistance + 2) * 8));
+    } else {
+      insn.header.predicate_inverse ^= 1;
+      this->setSrc1(&insn, GenRegister::immd(2));
+      GenInstruction &insn2 = this->store[insnID+1];
+      GBE_ASSERT(insn2.header.opcode == GEN_OPCODE_NOP);
+      GBE_ASSERT(insnID < this->store.size());
+      insn2.header.predicate_control = GEN_PREDICATE_NONE;
+      insn2.header.opcode = GEN_OPCODE_ADD;
+      this->setDst(&insn2, GenRegister::ip());
+      this->setSrc0(&insn2, GenRegister::ip());
+      this->setSrc1(&insn2, GenRegister::immd(jumpDistance * 8));
+    }
   }
 
   void GenEncoder::CMP(uint32_t conditional, GenRegister src0, GenRegister src1) {
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index d518c4a..13db6ae 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -99,6 +99,8 @@ namespace gbe
     ALU1(RNDE)
     ALU1(RNDD)
     ALU1(RNDU)
+    ALU1(F16TO32)
+    ALU1(F32TO16)
     ALU2(SEL)
     ALU1(NOT)
     ALU2(AND)
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index b33112c..13cbd41 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -9,6 +9,7 @@ DECL_GEN7_SCHEDULE(I64Shift,        20,        4,        2)
 DECL_GEN7_SCHEDULE(I64HADD,         20,        4,        2)
 DECL_GEN7_SCHEDULE(I64RHADD,        20,        4,        2)
 DECL_GEN7_SCHEDULE(I64ToFloat,      20,        4,        2)
+DECL_GEN7_SCHEDULE(FloatToI64,      20,        4,        2)
 DECL_GEN7_SCHEDULE(I64MULHI,        20,        4,        2)
 DECL_GEN7_SCHEDULE(I64MADSAT,       20,        4,        2)
 DECL_GEN7_SCHEDULE(Compare,         20,        4,        2)
@@ -33,7 +34,6 @@ DECL_GEN7_SCHEDULE(Sample,          80,        1,        1)
 DECL_GEN7_SCHEDULE(TypedWrite,      80,        1,        1)
 DECL_GEN7_SCHEDULE(SpillReg,        80,        1,        1)
 DECL_GEN7_SCHEDULE(UnSpillReg,      80,        1,        1)
-DECL_GEN7_SCHEDULE(GetImageInfo,    20,        4,        2)
 DECL_GEN7_SCHEDULE(Atomic,          80,        1,        1)
 DECL_GEN7_SCHEDULE(I64MUL,          20,        4,        2)
 DECL_GEN7_SCHEDULE(I64SATADD,       20,        4,        2)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 7eae7ca..54e5ebe 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -316,7 +316,7 @@ namespace gbe
     /*! Implement public class */
     INLINE ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID);
     /*! spill a register (insert spill/unspill instructions) */
-    INLINE void spillReg(ir::Register reg, uint32_t registerPool);
+    INLINE bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool);
     /*! Implement public class */
     INLINE uint32_t getRegNum(void) const { return file.regNum(); }
     /*! Implements public interface */
@@ -426,6 +426,8 @@ namespace gbe
     ALU1(LOAD_INT64_IMM)
     ALU1(RNDZ)
     ALU1(RNDE)
+    ALU1(F16TO32)
+    ALU1(F32TO16)
     ALU2(SEL)
     ALU2(SEL_INT64)
     ALU1(NOT)
@@ -471,7 +473,9 @@ namespace gbe
 #undef ALU3
 #undef I64Shift
     /*! Convert 64-bit integer to 32-bit float */
-    void CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[4]);
+    void CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[7]);
+    /*! Convert 32-bit float to 64-bit integer */
+    void CONVF_TO_I64(Reg dst, Reg src, GenRegister tmp[3]);
     /*! Saturated 64bit x*y + z */
     void I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister tmp[10]);
     /*! High 64bit of x*y */
@@ -489,7 +493,7 @@ namespace gbe
     /*! Saturated subtraction of 64-bit integer */
     void I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
     /*! Encode a barrier instruction */
-    void BARRIER(GenRegister src);
+    void BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType);
     /*! Encode a barrier instruction */
     void FENCE(GenRegister dst);
     /*! Encode a label instruction */
@@ -539,9 +543,9 @@ namespace gbe
     /*! Encode ternary instructions */
     void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2);
     /*! Encode sample instructions */
-    void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *src, uint32_t srcNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler);
+    void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *src, uint32_t srcNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool is3D);
     /*! Encode typed write instructions */
-    void TYPED_WRITE(GenRegister *src, uint32_t srcNum, GenRegister *msgs, uint32_t msgNum, uint32_t bti);
+    void TYPED_WRITE(GenRegister *src, uint32_t srcNum, GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D);
     /*! Get image information */
     void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti);
     /*! Multiply 64-bit integers */
@@ -666,58 +670,136 @@ namespace gbe
     return vector;
   }
 
-  void Selection::Opaque::spillReg(ir::Register spilledReg, uint32_t registerPool) {
-    assert(registerPool != 0);
-    const uint32_t simdWidth = ctx.getSimdWidth();
+  // FIXME: there is a risk that needs to be fixed here.
+  // The instruction we spill here is at the gen IR level, not the final
+  // single instruction. If it is translated into multiple instructions
+  // at the gen_context stage, then, since the destination and source registers
+  // may be spilled to the same register in the current implementation,
+  // the source register may be modified within the final instruction, which
+  // may lead to an incorrect result.
+  bool Selection::Opaque::spillRegs(const SpilledRegs &spilledRegs,
+                                    uint32_t registerPool) {
+    GBE_ASSERT(registerPool != 0);
     const uint32_t dstStart = registerPool + 1;
     const uint32_t srcStart = registerPool + 1;
-    uint32_t ptr = ctx.allocateScratchMem(typeSize(GEN_TYPE_D)*simdWidth);
 
     for (auto &block : blockList)
       for (auto &insn : block.insnList) {
         // spill / unspill insn should be skipped when do spilling
-        if(insn.opcode == SEL_OP_SPILL_REG || insn.opcode == SEL_OP_UNSPILL_REG) continue;
+        if(insn.opcode == SEL_OP_SPILL_REG
+           || insn.opcode == SEL_OP_UNSPILL_REG)
+          continue;
 
         const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
-
+        struct RegSlot {
+          RegSlot(ir::Register _reg, uint8_t _srcID,
+                  bool _isTmp, uint32_t _addr)
+                 : reg(_reg), srcID(_srcID), isTmpReg(_isTmp), addr(_addr)
+          {};
+          ir::Register reg;
+          union {
+            uint8_t srcID;
+            uint8_t dstID;
+          };
+          bool isTmpReg;
+          int32_t addr;
+        };
+        vector <struct RegSlot> regSet;
         for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
           const GenRegister selReg = insn.src(srcID);
           const ir::Register reg = selReg.reg();
-          if(reg == spilledReg && selReg.file == GEN_GENERAL_REGISTER_FILE && selReg.physical == 0) {
-            GBE_ASSERT(srcID < 5);
+          auto it = spilledRegs.find(reg);
+          if(it != spilledRegs.end()
+             && selReg.file == GEN_GENERAL_REGISTER_FILE
+             && selReg.physical == 0) {
+            struct RegSlot regSlot(reg, srcID,
+                                   it->second.isTmpReg,
+                                   it->second.addr);
+            regSet.push_back(regSlot);
+          }
+        }
+
+        if (regSet.size() > 5)
+          return false;
+
+        while(!regSet.empty()) {
+          uint32_t scratchID = regSet.size() - 1;
+          struct RegSlot regSlot = regSet.back();
+          regSet.pop_back();
+          const GenRegister selReg = insn.src(regSlot.srcID);
+          if (!regSlot.isTmpReg) {
+          /* For temporary registers, we don't need to unspill. */
             SelectionInstruction *unspill = this->create(SEL_OP_UNSPILL_REG, 1, 0);
-            unspill->state  = GenInstructionState(simdWidth);
-            unspill->dst(0) = GenRegister(GEN_GENERAL_REGISTER_FILE, srcStart+srcID, 0,
-                                          selReg.type, selReg.vstride, selReg.width, selReg.hstride);
-            GenRegister src = insn.src(srcID);
-            // change nr/subnr, keep other register settings
-            src.nr = srcStart+srcID; src.subnr=0; src.physical=1;
-            insn.src(srcID) = src;
-            unspill->extra.scratchOffset = ptr;
+            unspill->state  = GenInstructionState(ctx.getSimdWidth());
+            unspill->dst(0) = GenRegister(GEN_GENERAL_REGISTER_FILE,
+                                          srcStart + scratchID, 0,
+                                          selReg.type, selReg.vstride,
+                                          selReg.width, selReg.hstride);
+            unspill->extra.scratchOffset = regSlot.addr;
             unspill->extra.scratchMsgHeader = registerPool;
             insn.prepend(*unspill);
           }
-        }
+
+          GenRegister src = insn.src(regSlot.srcID);
+          // change nr/subnr, keep other register settings
+          src.nr = srcStart + scratchID; src.subnr = 0; src.physical = 1;
+          insn.src(regSlot.srcID) = src;
+        };
+
+        /*
+          To save one register, registerPool + 1 is used both by
+          src0 as a source and by the other operands as payload. To avoid
+          side effects, we use a stack model: we push all the operand
+          registers and spill the 0th dest last. As all the spills
+          are appended to the current instruction, the last spill
+          instruction will be the first instruction after the current
+          instruction. Thus registerPool + 1 still contains valid
+          data.
+         */
 
         for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
           const GenRegister selReg = insn.dst(dstID);
           const ir::Register reg = selReg.reg();
-          if(reg == spilledReg && selReg.file == GEN_GENERAL_REGISTER_FILE && selReg.physical == 0) {
-            GBE_ASSERT(dstID < 5);
+          auto it = spilledRegs.find(reg);
+          if(it != spilledRegs.end()
+             && selReg.file == GEN_GENERAL_REGISTER_FILE
+             && selReg.physical == 0) {
+            struct RegSlot regSlot(reg, dstID,
+                                   it->second.isTmpReg,
+                                   it->second.addr);
+            regSet.push_back(regSlot);
+          }
+        }
+
+        if (regSet.size() > 5)
+          return false;
+
+        while(!regSet.empty()) {
+          uint32_t scratchID = regSet.size() - 1;
+          struct RegSlot regSlot = regSet.back();
+          regSet.pop_back();
+          const GenRegister selReg = insn.dst(regSlot.dstID);
+          if(!regSlot.isTmpReg) {
+            /* For temporary registers, we don't need to spill. */
             SelectionInstruction *spill = this->create(SEL_OP_SPILL_REG, 0, 1);
-            spill->state  = GenInstructionState(simdWidth);
-            spill->src(0) =GenRegister(GEN_GENERAL_REGISTER_FILE, dstStart + dstID, 0,
-                                              selReg.type, selReg.vstride, selReg.width, selReg.hstride);
-            GenRegister dst = insn.dst(dstID);
-            // change nr/subnr, keep other register settings
-            dst.physical =1; dst.nr = dstStart+dstID; dst.subnr = 0;
-            insn.dst(dstID)= dst;
-            spill->extra.scratchOffset = ptr;
+            spill->state  = GenInstructionState(ctx.getSimdWidth());
+            spill->src(0) = GenRegister(GEN_GENERAL_REGISTER_FILE,
+                                        dstStart + scratchID, 0,
+                                        selReg.type, selReg.vstride,
+                                        selReg.width, selReg.hstride);
+            spill->extra.scratchOffset = regSlot.addr;
             spill->extra.scratchMsgHeader = registerPool;
             insn.append(*spill);
           }
+
+          GenRegister dst = insn.dst(regSlot.dstID);
+          // change nr/subnr, keep other register settings
+          dst.physical =1; dst.nr = dstStart + scratchID; dst.subnr = 0;
+          insn.dst(regSlot.dstID)= dst;
+          scratchID++;
         }
       }
+    return true;
   }
 
   ir::Register Selection::Opaque::replaceSrc(SelectionInstruction *insn, uint32_t regID) {
@@ -814,9 +896,11 @@ namespace gbe
     insn->index = uint16_t(index);
   }
 
-  void Selection::Opaque::BARRIER(GenRegister src) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_BARRIER, 0, 1);
+  void Selection::Opaque::BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_BARRIER, 1, 1);
     insn->src(0) = src;
+    insn->dst(0) = fence;
+    insn->extra.barrierType = barrierType;
   }
 
   void Selection::Opaque::FENCE(GenRegister dst) {
@@ -1128,11 +1212,19 @@ namespace gbe
       insn->dst(i + 1) = tmp[i];
   }
 
-  void Selection::Opaque::CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[4]) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_CONVI64_TO_F, 5, 1);
+  void Selection::Opaque::CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[7]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_CONVI64_TO_F, 8, 1);
     insn->dst(0) = dst;
     insn->src(0) = src;
-    for(int i = 0; i < 4; i ++)
+    for(int i = 0; i < 7; i ++)
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::CONVF_TO_I64(Reg dst, Reg src, GenRegister tmp[3]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_CONVF_TO_I64, 4, 1);
+    insn->dst(0) = dst;
+    insn->src(0) = src;
+    for(int i = 0; i < 3; i ++)
       insn->dst(i + 1) = tmp[i];
   }
 
@@ -1319,7 +1411,7 @@ namespace gbe
   void Selection::Opaque::SAMPLE(GenRegister *dst, uint32_t dstNum,
                                  GenRegister *src, uint32_t srcNum,
                                  GenRegister *msgPayloads, uint32_t msgNum,
-                                 uint32_t bti, uint32_t sampler) {
+                                 uint32_t bti, uint32_t sampler, bool is3D) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_SAMPLE, dstNum, msgNum + srcNum);
     SelectionVector *dstVector = this->appendVector();
     SelectionVector *msgVector = this->appendVector();
@@ -1342,8 +1434,9 @@ namespace gbe
     msgVector->isSrc = 1;
     msgVector->reg = &insn->src(0);
 
-    insn->extra.function = bti;
-    insn->extra.elem = sampler;
+    insn->extra.rdbti = bti;
+    insn->extra.sampler = sampler;
+    insn->extra.is3DRead = is3D;
   }
 
   ///////////////////////////////////////////////////////////////////////////
@@ -1357,7 +1450,7 @@ namespace gbe
 
   void Selection::Opaque::TYPED_WRITE(GenRegister *src, uint32_t srcNum,
                                       GenRegister *msgs, uint32_t msgNum,
-                                      uint32_t bti) {
+                                      uint32_t bti, bool is3D) {
     uint32_t elemID = 0;
     uint32_t i;
     SelectionInstruction *insn = this->appendInsn(SEL_OP_TYPED_WRITE, 0, msgNum + srcNum);
@@ -1368,25 +1461,15 @@ namespace gbe
     for (i = 0; i < srcNum; ++i, ++elemID)
       insn->src(elemID) = src[i];
 
-    insn->extra.function = bti;
-    insn->extra.elem = msgNum;
+    insn->extra.bti = bti;
+    insn->extra.msglen = msgNum;
+    insn->extra.is3DWrite = is3D;
     // Sends require contiguous allocation
     msgVector->regNum = msgNum;
     msgVector->isSrc = 1;
     msgVector->reg = &insn->src(0);
   }
 
-  void Selection::Opaque::GET_IMAGE_INFO(uint32_t infoType, GenRegister *dst,
-                                    uint32_t dstNum, uint32_t bti) {
-    SelectionInstruction *insn = this->appendInsn(SEL_OP_GET_IMAGE_INFO, dstNum, 0);
-
-    for(uint32_t i = 0; i < dstNum; ++i)
-      insn->dst(i) = dst[i];
-
-    insn->extra.function = bti;
-    insn->extra.elem = infoType;
-  }
-
   Selection::~Selection(void) { GBE_DELETE(this->opaque); }
 
   void Selection::select(void) {
@@ -1425,8 +1508,8 @@ namespace gbe
   ir::Register Selection::replaceDst(SelectionInstruction *insn, uint32_t regID) {
     return this->opaque->replaceDst(insn, regID);
   }
-  void Selection::spillReg(ir::Register reg, uint32_t registerPool) {
-    this->opaque->spillReg(reg, registerPool);
+  bool Selection::spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool) {
+    return this->opaque->spillRegs(spilledRegs, registerPool);
   }
 
   SelectionInstruction *Selection::create(SelectionOpcode opcode, uint32_t dstNum, uint32_t srcNum) {
@@ -1458,6 +1541,7 @@ namespace gbe
       case TYPE_U8:  return GenRegister::immuw(imm.data.u8);
       case TYPE_S8:  return GenRegister::immw(imm.data.s8);
       case TYPE_DOUBLE: return GenRegister::immdf(imm.data.f64);
+      case TYPE_BOOL: return GenRegister::immuw(-imm.data.b);  //return 0xffff when true
       default: NOT_SUPPORTED; return GenRegister::immuw(0);
     }
   }
@@ -1502,6 +1586,8 @@ namespace gbe
         return ir::TYPE_U32;
       if (insnType == ir::TYPE_S16 || insnType == ir::TYPE_U16)
         return insnType;
+      if (insnType == ir::TYPE_BOOL)
+        return ir::TYPE_U16;
       return ir::TYPE_FLOAT;
     }
 
@@ -1522,7 +1608,25 @@ namespace gbe
           }
           break;
         case ir::OP_MOV:
-          if (dst.isdf()) {
+          if(insn.getType() == ir::TYPE_BOOL) {
+            GenRegister flagReg;
+            uint32_t predicate = sel.curr.predicate;
+            sel.push();
+              sel.curr.execWidth = 1;
+              sel.curr.predicate = GEN_PREDICATE_NONE;
+              sel.curr.noMask = 1;
+              if(predicate == GEN_PREDICATE_NONE)
+                sel.MOV(dst, src);
+              else {
+                if(sel.curr.physicalFlag)
+                  flagReg = GenRegister::flag(sel.curr.flag, sel.curr.subFlag);
+                else
+                  flagReg = sel.selReg(ir::Register(sel.curr.flagIndex), ir::TYPE_U16);
+
+                sel.AND(dst, flagReg, src);
+              }
+            sel.pop();
+          } else if (dst.isdf()) {
             ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
             sel.MOV_DF(dst, src, sel.selReg(r));
           } else
@@ -1537,6 +1641,7 @@ namespace gbe
         case ir::OP_COS: sel.MATH(dst, GEN_MATH_FUNCTION_COS, src); break;
         case ir::OP_SIN: sel.MATH(dst, GEN_MATH_FUNCTION_SIN, src); break;
         case ir::OP_LOG: sel.MATH(dst, GEN_MATH_FUNCTION_LOG, src); break;
+        case ir::OP_EXP: sel.MATH(dst, GEN_MATH_FUNCTION_EXP, src); break;
         case ir::OP_SQR: sel.MATH(dst, GEN_MATH_FUNCTION_SQRT, src); break;
         case ir::OP_RSQ: sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, src); break;
         case ir::OP_RCP: sel.MATH(dst, GEN_MATH_FUNCTION_INV, src); break;
@@ -1656,14 +1761,17 @@ namespace gbe
       SelectionDAG *dag1 = dag.child[1];
 
       // Right source can always be an immediate
-      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) {
+      // logical ops on bool shouldn't use 0xffff (they may use the flag reg), so we can't optimize
+      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI &&
+          canGetRegisterFromImmediate(dag1->insn) && type != TYPE_BOOL) {
         const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
         src0 = sel.selReg(insn.getSrc(0), type);
         src1 = getRegisterFromImmediate(childInsn.getImmediate());
         if (dag0) dag0->isRoot = 1;
       }
       // Left source cannot be immediate but it is OK if we can commute
-      else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && insn.commutes() && dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn)) {
+      else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && insn.commutes() && dag0->insn.getOpcode() == OP_LOADI &&
+               canGetRegisterFromImmediate(dag0->insn) && type != TYPE_BOOL) {
         const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
         src0 = sel.selReg(insn.getSrc(1), type);
         src1 = getRegisterFromImmediate(childInsn.getImmediate());
@@ -1856,6 +1964,9 @@ namespace gbe
     {
       using namespace ir;
 
+      // XXX TODO: we need clean support of FP_CONTRACT before the 'return false' below can be removed.
+      // If 'pragma FP_CONTRACT OFF' is used in a cl kernel, we should not do the mad optimization.
+      return false;
       // MAD tend to increase liveness of the sources (since there are three of
       // them). TODO refine this strategy. Well, we should be able at least to
       // evaluate per basic block register pressure and selectively enable
@@ -1931,6 +2042,7 @@ namespace gbe
       // OK, we merge the instructions
       const ir::CompareInstruction &cmpInsn = cast<CompareInstruction>(cmp->insn);
       const ir::Opcode opcode = cmpInsn.getOpcode();
+      if(opcode == OP_ORD) return false;
       const uint32_t genCmp = getGenCompare(opcode);
 
       // Like for regular selects, we need a temporary since we cannot predicate
@@ -2195,29 +2307,25 @@ namespace gbe
     {
       using namespace ir;
       const ir::Register reg = sel.reg(FAMILY_DWORD);
-
+      const GenRegister barrierMask = sel.selReg(ocl::barriermask, TYPE_BOOL);
+      const GenRegister tempFlag = sel.selReg(sel.reg(FAMILY_BOOL), TYPE_BOOL);
+      const GenRegister flagReg = GenRegister::flag(0, 0);
       const uint32_t params = insn.getParameters();
-      if(params == syncGlobalBarrier) {
-        const ir::Register fenceDst = sel.reg(FAMILY_DWORD);
-        sel.FENCE(sel.selReg(fenceDst, ir::TYPE_U32));
-      }
 
       sel.push();
         sel.curr.predicate = GEN_PREDICATE_NONE;
-
-        // As only the payload.2 is used and all the other regions are ignored
-        // SIMD8 mode here is safe.
-        sel.curr.execWidth = 8;
-        sel.curr.physicalFlag = 0;
         sel.curr.noMask = 1;
-        // Copy barrier id from r0.
-        sel.AND(GenRegister::ud8grf(reg), GenRegister::ud1grf(ir::ocl::barrierid), GenRegister::immud(0x0f000000));
-
-        // A barrier is OK to start the thread synchronization *and* SLM fence
-        sel.BARRIER(GenRegister::f8grf(reg));
-        // Now we wait for the other threads
         sel.curr.execWidth = 1;
-        sel.WAIT();
+        sel.OR(barrierMask, flagReg, barrierMask);
+        sel.MOV(tempFlag, barrierMask);
+      sel.pop();
+
+      // A barrier is OK to start the thread synchronization *and* SLM fence
+      sel.push();
+      //sel.curr.predicate = GEN_PREDICATE_NONE;
+      sel.curr.flagIndex = (uint16_t)tempFlag.value.reg;
+      sel.curr.physicalFlag = 0;
+      sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params);
       sel.pop();
       return true;
     }
@@ -2480,18 +2588,33 @@ namespace gbe
       const ir::CompareInstruction &insn = cast<CompareInstruction>(dag.insn);
       const Opcode opcode = insn.getOpcode();
       const Type type = insn.getType();
-      const uint32_t genCmp = getGenCompare(opcode);
       const Register dst = insn.getDst(0);
+      Register tmpDst;
+
+      const ir::BasicBlock *insnBlock = insn.getParent();
+      const ir::Liveness &liveness = sel.ctx.getLiveness();
+      const ir::Liveness::UEVar &livein = liveness.getLiveIn(insnBlock);
+      if (!livein.contains(dst))
+        tmpDst = dst;
+      else
+        tmpDst = sel.reg(FAMILY_BOOL);
 
       // Limit the compare to the active lanes. Use the same compare as for f0.0
       sel.push();
         const LabelIndex label = insn.getParent()->getLabelIndex();
         const GenRegister blockip = sel.selReg(ocl::blockip, TYPE_U16);
         const GenRegister labelReg = GenRegister::immuw(label);
+
         sel.curr.predicate = GEN_PREDICATE_NONE;
         sel.curr.physicalFlag = 0;
-        sel.curr.flagIndex = uint16_t(dst);
-        sel.CMP(GEN_CONDITIONAL_LE, blockip, labelReg);
+        sel.curr.flagIndex = uint16_t(tmpDst);
+        if (tmpDst != dst) {
+          sel.CMP(GEN_CONDITIONAL_G, blockip, labelReg);
+          sel.curr.execWidth = 1;
+          sel.AND(sel.selReg(dst, TYPE_BOOL), sel.selReg(dst, TYPE_BOOL), sel.selReg(tmpDst, TYPE_BOOL));
+          sel.XOR(sel.selReg(tmpDst, TYPE_BOOL), sel.selReg(tmpDst, TYPE_BOOL), GenRegister::immuw(0xFFFF));
+        } else
+          sel.CMP(GEN_CONDITIONAL_LE, blockip, labelReg);
       sel.pop();
 
       // Look for immediate values for the right source
@@ -2500,10 +2623,14 @@ namespace gbe
       SelectionDAG *dag1 = dag.child[1];
 
       // Right source can always be an immediate
-      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag1->insn)) {
+      if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI &&
+          canGetRegisterFromImmediate(dag1->insn) && opcode != OP_ORD) {
         const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
         src0 = sel.selReg(insn.getSrc(0), type);
-        src1 = getRegisterFromImmediate(childInsn.getImmediate());
+        Immediate imm = childInsn.getImmediate();
+        if(imm.type != type)
+          imm.type = type;
+        src1 = getRegisterFromImmediate(imm);
         if (dag0) dag0->isRoot = 1;
       } else {
         src0 = sel.selReg(insn.getSrc(0), type);
@@ -2513,19 +2640,105 @@ namespace gbe
 
       sel.push();
         sel.curr.physicalFlag = 0;
-        sel.curr.flagIndex = uint16_t(dst);
+        sel.curr.flagIndex = uint16_t(tmpDst);
         if (type == TYPE_S64 || type == TYPE_U64) {
           GenRegister tmp[3];
           for(int i=0; i<3; i++)
             tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
-          sel.I64CMP(genCmp, src0, src1, tmp);
+          sel.I64CMP(getGenCompare(opcode), src0, src1, tmp);
+        } else if(opcode == OP_ORD) {
+          sel.CMP(GEN_CONDITIONAL_EQ, src0, src0);
+          sel.CMP(GEN_CONDITIONAL_EQ, src1, src1);
         } else
-          sel.CMP(genCmp, src0, src1);
+          sel.CMP(getGenCompare(opcode), src0, src1);
       sel.pop();
+      if (tmpDst != dst) {
+        sel.push();
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.execWidth = 1;
+          sel.OR(sel.selReg(dst, TYPE_U16), sel.selReg(dst, TYPE_U16), sel.selReg(tmpDst, TYPE_U16));
+        sel.pop();
+      }
       return true;
     }
   };
 
+  /*! Bit cast instruction pattern */
+  DECL_PATTERN(BitCastInstruction)
+  {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::BitCastInstruction &insn) const
+    {
+      using namespace ir;
+      const Type dstType = insn.getDstType();
+      const Type srcType = insn.getSrcType();
+      const uint32_t dstNum = insn.getDstNum();
+      const uint32_t srcNum = insn.getSrcNum();
+      int index = 0, multiple, narrowNum;
+      bool narrowDst;
+      Type narrowType;
+
+      if(dstNum > srcNum) {
+        multiple = dstNum / srcNum;
+        narrowType = dstType;
+        narrowNum = dstNum;
+        narrowDst = 1;
+      } else {
+        multiple = srcNum / dstNum;
+        narrowType = srcType;
+        narrowNum = srcNum;
+        narrowDst = 0;
+      }
+
+      for(int i = 0; i < narrowNum; i++, index++) {
+        GenRegister narrowReg, wideReg;
+        if(narrowDst) {
+          narrowReg = sel.selReg(insn.getDst(i), narrowType);
+          wideReg = sel.selReg(insn.getSrc(index/multiple), narrowType);  //retype to narrow type
+        } else {
+          wideReg = sel.selReg(insn.getDst(index/multiple), narrowType);
+          narrowReg = sel.selReg(insn.getSrc(i), narrowType);  //retype to narrow type
+        }
+        if(wideReg.hstride != GEN_VERTICAL_STRIDE_0) {
+          if(multiple == 2) {
+            wideReg = GenRegister::unpacked_uw(wideReg.reg());
+            wideReg = GenRegister::retype(wideReg, getGenType(narrowType));
+          } else if(multiple == 4) {
+            wideReg = GenRegister::unpacked_ub(wideReg.reg());
+            wideReg = GenRegister::retype(wideReg, getGenType(narrowType));
+          } else if(multiple == 8) {  // long to char needs special handling
+            GBE_ASSERT(multiple == 8);
+          }
+        }
+        if(index % multiple) {
+          wideReg = GenRegister::offset(wideReg, 0, (index % multiple) * typeSize(wideReg.type));
+          wideReg.subphysical = 1;
+        }
+        GenRegister xdst = narrowDst ? narrowReg : wideReg;
+        GenRegister xsrc = narrowDst ? wideReg : narrowReg;
+
+        if((srcType == TYPE_S64 || srcType == TYPE_U64 || srcType == TYPE_DOUBLE) ||
+           (dstType == TYPE_S64 || dstType == TYPE_U64 || dstType == TYPE_DOUBLE)) {
+          const int simdWidth = sel.curr.execWidth;
+          sel.push();
+            sel.curr.execWidth = 8;
+            xdst.subphysical = 1;
+            xsrc.subphysical = 1;
+            for(int i = 0; i < simdWidth/4; i ++) {
+              sel.curr.chooseNib(i);
+              sel.MOV(xdst, xsrc);
+              xdst = GenRegister::offset(xdst, 0, 4 * typeSize(getGenType(dstType)));
+              xsrc = GenRegister::offset(xsrc, 0, 4 * typeSize(getGenType(srcType)));
+            }
+          sel.pop();
+        } else
+          sel.MOV(xdst, xsrc);
+      }
+
+      return true;
+    }
+    DECL_CTOR(BitCastInstruction, 1, 1);
+  };
+
   /*! Convert instruction pattern */
   DECL_PATTERN(ConvertInstruction)
   {
@@ -2538,14 +2751,22 @@ namespace gbe
       const RegisterFamily srcFamily = getFamily(srcType);
       const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
       const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
+      const Opcode opcode = insn.getOpcode();
 
-      if(insn.getOpcode() == ir::OP_SAT_CVT) {
+      if(opcode == ir::OP_SAT_CVT) {
         sel.push();
         sel.curr.saturate = 1;
       }
 
       // We need two instructions to make the conversion
-      if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && (srcFamily == FAMILY_DWORD || srcFamily == FAMILY_QWORD)) {
+      if (opcode == OP_F16TO32) {
+        sel.F16TO32(dst, src);
+      } else if (opcode == OP_F32TO16) {
+        GenRegister unpacked;
+        unpacked = GenRegister::unpacked_uw(sel.reg(FAMILY_DWORD));
+        sel.F32TO16(unpacked, src);
+        sel.MOV(dst, unpacked);
+      } else if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && (srcFamily == FAMILY_DWORD || srcFamily == FAMILY_QWORD)) {
         GenRegister unpacked;
         if (dstFamily == FAMILY_WORD) {
           const uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
@@ -2567,12 +2788,11 @@ namespace gbe
       } else if ((dstType == ir::TYPE_S32 || dstType == ir::TYPE_U32) && srcFamily == FAMILY_QWORD) {
         sel.CONVI64_TO_I(dst, src);
       } else if (dstType == ir::TYPE_FLOAT && srcFamily == FAMILY_QWORD) {
-        GenRegister tmp[4];
-        for(int i=0; i<3; i++) {
-          tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
-          tmp[i].type = GEN_TYPE_UD;
+        GenRegister tmp[7];
+        for(int i=0; i<6; i++) {
+          tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
         }
-        tmp[3] = sel.selReg(sel.reg(FAMILY_BOOL));
+        tmp[6] = sel.selReg(sel.reg(FAMILY_BOOL), TYPE_BOOL);
         sel.CONVI64_TO_F(dst, src, tmp);
       } else if (dst.isdf()) {
         ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
@@ -2580,8 +2800,14 @@ namespace gbe
       } else if (dst.isint64()) {
         switch(src.type) {
           case GEN_TYPE_F:
-            sel.CONVF_TO_I64(dst, src, sel.selReg(sel.reg(FAMILY_DWORD)));
+          {
+            GenRegister tmp[3];
+            tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+            tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_FLOAT);
+            tmp[2] = sel.selReg(sel.reg(FAMILY_BOOL), TYPE_BOOL);
+            sel.CONVF_TO_I64(dst, src, tmp);
             break;
+          }
           case GEN_TYPE_DF:
             NOT_IMPLEMENTED;
           default:
@@ -2590,7 +2816,7 @@ namespace gbe
       } else
         sel.MOV(dst, src);
 
-      if(insn.getOpcode() == ir::OP_SAT_CVT)
+      if(opcode == ir::OP_SAT_CVT)
         sel.pop();
 
       return true;
@@ -2704,6 +2930,11 @@ namespace gbe
           sel.I64MADSAT(dst, src0, src1, src2, tmp);
           break;
          }
+        case OP_MAD:
+         {
+          sel.MAD(dst, src2, src0, src1);
+          break;
+         }
         default:
           NOT_IMPLEMENTED;
       }
@@ -2741,6 +2972,14 @@ namespace gbe
       if (sel.ctx.hasJIP(&insn)) {
         const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
         sel.push();
+
+          sel.curr.noMask = 1;
+          sel.curr.execWidth = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
+          GenRegister flagReg = GenRegister::flag(0, 0);
+          sel.AND(flagReg, flagReg, emaskReg);
+
           if (simdWidth == 8)
             sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
           else if (simdWidth == 16)
@@ -2748,10 +2987,8 @@ namespace gbe
           else
             NOT_IMPLEMENTED;
           sel.curr.inversePredicate = 1;
-          sel.curr.execWidth = 1;
           sel.curr.flag = 0;
           sel.curr.subFlag = 0;
-          sel.curr.noMask = 1;
           sel.JMPI(GenRegister::immd(0), jip);
         sel.pop();
       }
@@ -2766,14 +3003,8 @@ namespace gbe
     {
       using namespace ir;
       GenRegister msgPayloads[4];
-      GenRegister dst[insn.getDstNum()], src[insn.getSrcNum() - 2];
+      GenRegister dst[insn.getDstNum()], src[insn.getSrcNum()];
       uint32_t srcNum = insn.getSrcNum();
-      uint32_t samplerOffset = 0;
-      if (srcNum == 6) {
-      /* We have the clamp border workaround. */
-        samplerOffset = insn.getSrc(srcNum - 1).value() * 8;
-        srcNum--;
-      }
 
       for( int i = 0; i < 4; ++i)
         msgPayloads[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
@@ -2781,15 +3012,14 @@ namespace gbe
       for (uint32_t valueID = 0; valueID < insn.getDstNum(); ++valueID)
         dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
 
-      for (uint32_t valueID = 0; valueID < srcNum - 2; ++valueID)
-        src[valueID] = sel.selReg(insn.getSrc(valueID + 2), insn.getSrcType());
+      for (uint32_t valueID = 0; valueID < srcNum; ++valueID)
+        src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
 
-      uint32_t bti = sel.ctx.getFunction().getImageSet()->getIdx
-                       (insn.getSrc(SampleInstruction::SURFACE_BTI));
-      uint32_t sampler = sel.ctx.getFunction().getSamplerSet()->getIdx
-                           (insn.getSrc(SampleInstruction::SAMPLER_BTI)) + samplerOffset;
+      uint32_t bti = insn.getImageIndex();
+      /* We have the clamp border workaround. */
+      uint32_t sampler = insn.getSamplerIndex() + insn.getSamplerOffset() * 8;
 
-      sel.SAMPLE(dst, insn.getDstNum(), src, srcNum - 2, msgPayloads, 4, bti, sampler);
+      sel.SAMPLE(dst, insn.getDstNum(), src, srcNum, msgPayloads, 4, bti, sampler, insn.is3D());
       return true;
     }
     DECL_CTOR(SampleInstruction, 1, 1);
@@ -2802,25 +3032,24 @@ namespace gbe
     {
       using namespace ir;
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
-      uint32_t valueID = 0;
+      uint32_t valueID;
       GenRegister msgs[9]; // (header + U + V + R + LOD + 4)
       GenRegister src[insn.getSrcNum()];
       uint32_t msgNum = (8 / (simdWidth / 8)) + 1;
-      uint32_t coordNum = (insn.getSrcNum() == 7) ? 2 : 3;
+      uint32_t coordNum = 3;
 
       for(uint32_t i = 0; i < msgNum; i++)
         msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
 
       // u, v, w coords should use coord type.
-      for (; valueID < coordNum; ++valueID)
-        src[valueID] = sel.selReg(insn.getSrc(valueID + 1), insn.getCoordType());
+      for (valueID = 0; valueID < coordNum; ++valueID)
+        src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getCoordType());
 
-      for (; (valueID + 1) < insn.getSrcNum(); ++valueID)
-        src[valueID] = sel.selReg(insn.getSrc(valueID + 1), insn.getSrcType());
+      for (; valueID < insn.getSrcNum(); ++valueID)
+        src[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
 
-      uint32_t bti = sel.ctx.getFunction().getImageSet()->getIdx
-                       (insn.getSrc(TypedWriteInstruction::SURFACE_BTI));
-      sel.TYPED_WRITE(src, insn.getSrcNum() - 1, msgs, msgNum, bti);
+      uint32_t bti = insn.getImageIndex();
+      sel.TYPED_WRITE(src, insn.getSrcNum(), msgs, msgNum, bti, insn.is3D());
       return true;
     }
     DECL_CTOR(TypedWriteInstruction, 1, 1);
@@ -2850,7 +3079,7 @@ namespace gbe
       using namespace ir;
       GenRegister dst, src;
       dst = sel.selReg(insn.getDst(0), TYPE_U16);
-      src = GenRegister::offset(GenRegister::uw1grf(ocl::samplerinfo), 0, sel.ctx.getFunction().getSamplerSet()->getIdx(insn.getSrc(0)) * 2);
+      src = GenRegister::offset(GenRegister::uw1grf(insn.getSrc(0)), 0, insn.getSamplerIndex() * 2);
       src.subphysical = 1;
       sel.MOV(dst, src);
       return true;
@@ -2861,6 +3090,38 @@ namespace gbe
   /*! Branch instruction pattern */
   DECL_PATTERN(BranchInstruction)
   {
+
+    // Return the predicate register a branch should use: pred itself, or pred ANDed with the current flag.
+    const ir::Register getActivePred(Selection::Opaque &sel,
+                       const ir::BranchInstruction &insn,
+                       const ir::Register pred) const
+    {
+        using namespace ir;
+        GenRegister flagReg;
+        Register activePred;
+        const ir::BasicBlock *insnBlock = insn.getParent();
+        const ir::Liveness &liveness = sel.ctx.getLiveness();
+        const ir::Liveness::UEVar &livein = liveness.getLiveIn(insnBlock);
+       
+        /* If the pred is not in the live-in set of the branch's block, then it should be
+           defined in this block; it needs no validation and is returned unchanged. */
+        if (!livein.contains(pred))
+          return pred;
+        // Otherwise build a fresh bool register holding (current flag AND pred).
+        activePred = sel.reg(FAMILY_BOOL);
+        sel.push();
+          sel.curr.predicate = GEN_PREDICATE_NONE; // the AND itself is emitted unpredicated,
+          sel.curr.execWidth = 1;                  // with execution width 1,
+          sel.curr.noMask = 1;                     // and with noMask set
+          if(sel.curr.physicalFlag) // the current flag is either a physical flag register...
+             flagReg = GenRegister::flag(sel.curr.flag, sel.curr.subFlag);
+          else                      // ...or the virtual register named by flagIndex
+             flagReg = sel.selReg(ir::Register(sel.curr.flagIndex), ir::TYPE_U16);
+          sel.AND(sel.selReg(activePred, TYPE_U16), flagReg, sel.selReg(pred, TYPE_U16));
+        sel.pop(); // restore the selection state saved by push()
+        return activePred;
+    }
+
     void emitForwardBranch(Selection::Opaque &sel,
                            const ir::BranchInstruction &insn,
                            ir::LabelIndex dst,
@@ -2878,11 +3139,12 @@ namespace gbe
 
       if (insn.isPredicated() == true) {
         const Register pred = insn.getPredicateIndex();
+        const Register activePred = getActivePred(sel, insn, pred);
 
         // Update the PcIPs
         sel.push();
           sel.curr.physicalFlag = 0;
-          sel.curr.flagIndex = uint16_t(pred);
+          sel.curr.flagIndex = uint16_t(activePred);
           sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
         sel.pop();
 
@@ -2891,9 +3153,12 @@ namespace gbe
         // It is slightly more complicated than for backward jump. We check that
         // all PcIPs are greater than the next block IP to be sure that we can
         // jump
+        // We set all the inactive channel to 1 as the GEN_PREDICATE_ALIGN1_ALL8/16
+        // will check those bits as well.
+
         sel.push();
           sel.curr.physicalFlag = 0;
-          sel.curr.flagIndex = uint16_t(pred);
+          sel.curr.flagIndex = uint16_t(activePred);
           sel.curr.predicate = GEN_PREDICATE_NONE;
           sel.CMP(GEN_CONDITIONAL_G, ip, GenRegister::immuw(nextLabel));
 
@@ -2901,14 +3166,19 @@ namespace gbe
           // XXX TODO: For group size not aligned to simdWidth, ALL8/16h may not
           // work correct, as flag register bits mapped to non-active lanes tend
           // to be zero.
+
+          sel.curr.execWidth = 1;
+          sel.curr.noMask = 1;
+          GenRegister notEmaskReg = GenRegister::uw1grf(ocl::notemask);
+          sel.OR(sel.selReg(activePred, TYPE_U16), sel.selReg(activePred, TYPE_U16), notEmaskReg);
+
           if (simdWidth == 8)
             sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
           else if (simdWidth == 16)
             sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
           else
             NOT_SUPPORTED;
-          sel.curr.execWidth = 1;
-          sel.curr.noMask = 1;
+
           sel.JMPI(GenRegister::immd(0), jip);
         sel.pop();
 
@@ -2942,6 +3212,7 @@ namespace gbe
 
       if (insn.isPredicated() == true) {
         const Register pred = insn.getPredicateIndex();
+        const Register activePred = getActivePred(sel, insn, pred);
 
         // Update the PcIPs for all the branches. Just put the IPs of the next
         // block. Next instruction will properly reupdate the IPs of the lanes
@@ -2952,9 +3223,17 @@ namespace gbe
         sel.push();
           // Re-update the PcIPs for the branches that takes the backward jump
           sel.curr.physicalFlag = 0;
-          sel.curr.flagIndex = uint16_t(pred);
+          sel.curr.flagIndex = uint16_t(activePred);
           sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
 
+        // We clear all the inactive channel to 0 as the GEN_PREDICATE_ALIGN1_ANY8/16
+        // will check those bits as well.
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.execWidth = 1;
+          sel.curr.noMask = 1;
+          GenRegister emaskReg = GenRegister::uw1grf(ocl::emask);
+          sel.AND(sel.selReg(activePred, TYPE_U16), sel.selReg(activePred, TYPE_U16), emaskReg);
+
           // Branch to the jump target
           if (simdWidth == 8)
             sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
@@ -2962,8 +3241,6 @@ namespace gbe
             sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
           else
             NOT_SUPPORTED;
-          sel.curr.execWidth = 1;
-          sel.curr.noMask = 1;
           sel.JMPI(GenRegister::immd(0), jip);
         sel.pop();
 
@@ -3021,6 +3298,7 @@ namespace gbe
     this->insert<StoreInstructionPattern>();
     this->insert<SelectInstructionPattern>();
     this->insert<CompareInstructionPattern>();
+    this->insert<BitCastInstructionPattern>();
     this->insert<ConvertInstructionPattern>();
     this->insert<AtomicInstructionPattern>();
     this->insert<TernaryInstructionPattern>();
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 2422b2b..09e6762 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -30,6 +30,7 @@
 #include "backend/gen_register.hpp"
 #include "backend/gen_encoder.hpp"
 #include "backend/gen_context.hpp"
+#include "backend/gen_reg_allocation.hpp"
 #include "sys/vector.hpp"
 #include "sys/intrusive_list.hpp"
 
@@ -111,6 +112,17 @@ namespace gbe
         uint16_t scratchOffset;
         uint16_t scratchMsgHeader;
       };
+      struct {
+        uint16_t bti:8;
+        uint16_t msglen:5;
+        uint16_t is3DWrite:1;
+      };
+      struct {
+        uint16_t rdbti:8;
+        uint16_t sampler:5;
+        uint16_t is3DRead:1;
+      };
+      uint32_t barrierType;
     } extra;
     /*! Gen opcode */
     uint8_t opcode;
@@ -174,7 +186,6 @@ namespace gbe
 
   /*! Owns the selection engine */
   class GenContext;
-
   /*! Selection engine produces the pre-ISA instruction blocks */
   class Selection
   {
@@ -202,7 +213,7 @@ namespace gbe
     /*! Replace a destination to the returned temporary register */
     ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID);
     /*! spill a register (insert spill/unspill instructions) */
-    void spillReg(ir::Register reg, uint32_t registerPool);
+    bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool);
     /*! Create a new selection instruction */
     SelectionInstruction *create(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
     /*! List of emitted blocks */
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 4499006..e44b9d4 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -10,6 +10,8 @@ DECL_SELECTION_IR(RNDE, UnaryInstruction)
 DECL_SELECTION_IR(RNDD, UnaryInstruction)
 DECL_SELECTION_IR(RNDU, UnaryInstruction)
 DECL_SELECTION_IR(FRC, UnaryInstruction)
+DECL_SELECTION_IR(F16TO32, UnaryInstruction)
+DECL_SELECTION_IR(F32TO16, UnaryInstruction)
 DECL_SELECTION_IR(SEL, BinaryInstruction)
 DECL_SELECTION_IR(SEL_INT64, BinaryInstruction)
 DECL_SELECTION_IR(AND, BinaryInstruction)
@@ -58,7 +60,6 @@ DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
 DECL_SELECTION_IR(DWORD_GATHER, DWordGatherInstruction)
 DECL_SELECTION_IR(SAMPLE, SampleInstruction)
 DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
-DECL_SELECTION_IR(GET_IMAGE_INFO, GetImageInfoInstruction)
 DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction)
 DECL_SELECTION_IR(UNSPILL_REG, UnSpillRegInstruction)
 DECL_SELECTION_IR(MUL_HI, BinaryWithTempInstruction)
@@ -73,7 +74,7 @@ DECL_SELECTION_IR(UPSAMPLE_SHORT, BinaryInstruction)
 DECL_SELECTION_IR(UPSAMPLE_INT, BinaryInstruction)
 DECL_SELECTION_IR(UPSAMPLE_LONG, BinaryInstruction)
 DECL_SELECTION_IR(CONVI_TO_I64, UnaryWithTempInstruction)
-DECL_SELECTION_IR(CONVF_TO_I64, UnaryWithTempInstruction)
 DECL_SELECTION_IR(CONVI64_TO_I, UnaryInstruction)
 DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction)
+DECL_SELECTION_IR(CONVF_TO_I64, FloatToI64Instruction)
 DECL_SELECTION_IR(I64MADSAT, I64MADSATInstruction)
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 781152d..22f4aa1 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -108,6 +108,7 @@ namespace gbe {
         break;
       }
       GBE_DELETE(ctx);
+      fn->getImageSet()->clearInfo();
     }
 
     // XXX spill must be implemented
@@ -131,16 +132,33 @@ namespace gbe {
     return reinterpret_cast<gbe_program>(program);
   }
 
+  /*! Copy the serialized program into a malloc'ed buffer; returns its size, or 0 with *binary NULL */
+  static size_t genProgramSerializeToBinary(gbe_program program, char **binary) {
+    using namespace gbe;
+    std::ostringstream oss;
+    GenProgram *prog = (GenProgram*)program;
+    const size_t sz = prog->serializeToBin(oss);
+
+    // Nothing was serialized, or the allocation failed: hand back no buffer.
+    *binary = (sz != 0) ? (char *)malloc(sizeof(char) * sz) : NULL;
+    if (*binary == NULL)
+      return 0;
+
+    memcpy(*binary, oss.str().c_str(), sz*sizeof(char));
+    return sz;
+  }
+
   static gbe_program genProgramNewFromLLVM(const char *fileName,
                                            size_t stringSize,
                                            char *err,
-                                           size_t *errSize)
+                                           size_t *errSize,
+                                           int optLevel)
   {
     using namespace gbe;
     GenProgram *program = GBE_NEW_NO_ARG(GenProgram);
     std::string error;
     // Try to compile the program
-    if (program->buildFromLLVMFile(fileName, error) == false) {
+    if (program->buildFromLLVMFile(fileName, error, optLevel) == false) {
       if (err != NULL && errSize != NULL && stringSize > 0u) {
         const size_t msgSize = std::min(error.size(), stringSize-1u);
         std::memcpy(err, error.c_str(), msgSize);
@@ -157,12 +175,6 @@ namespace gbe {
 void genSetupCallBacks(void)
 {
   gbe_program_new_from_binary = gbe::genProgramNewFromBinary;
+  gbe_program_serialize_to_binary = gbe::genProgramSerializeToBinary;
   gbe_program_new_from_llvm = gbe::genProgramNewFromLLVM;
 }
-
-sem_t llvm_semaphore;
-
-void genSetupLLVMSemaphore(void)
-{
-  sem_init(&llvm_semaphore, 0, 1);
-}
diff --git a/backend/src/backend/gen_program.h b/backend/src/backend/gen_program.h
index a498a5d..8d37a70 100644
--- a/backend/src/backend/gen_program.h
+++ b/backend/src/backend/gen_program.h
@@ -33,8 +33,6 @@
 
 /*! This will make the compiler output Gen ISA code */
 extern void genSetupCallBacks(void);
-extern sem_t llvm_semaphore;
-extern void genSetupLLVMSemaphore(void);
 
 #endif /* __GBE_GEN_PROGRAM_H__ */
 
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 30f9e38..2aafdb1 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -30,6 +30,8 @@
 #include "sys/exception.hpp"
 #include <algorithm>
 #include <climits>
+#include <iostream>
+#include <iomanip>
 
 #define RESERVED_REG_NUM_FOR_SPILL 6
 
@@ -41,6 +43,55 @@ namespace gbe
 
   /*! Provides the location of a register in a vector */
   typedef std::pair<SelectionVector*, uint32_t> VectorLocation;
+  /*! Interval as used in linear scan allocator. Basically, stores the first and
+   *  the last instruction where the register is alive
+   */
+  struct GenRegInterval {
+    INLINE GenRegInterval(ir::Register reg) :
+      reg(reg), minID(INT_MAX), maxID(-INT_MAX) {} // starts out empty: minID > maxID
+    ir::Register reg;     //!< (virtual) register of the interval
+    int32_t minID, maxID; //!< Starting and ending points
+  };
+
+  typedef struct GenRegIntervalKey { // packs (maxID, reg) into one 32-bit key: maxID high half, reg low half
+    GenRegIntervalKey(uint16_t reg, int32_t maxID) { // maxID taken at full width so the clamp below can work
+      if (maxID > 0xFFFF) // saturate (this covers INT_MAX) instead of wrapping modulo 2^16
+        maxID = 0xFFFF;
+      GBE_ASSERT(reg <= 0xFFFF && maxID <= 0xFFFF);
+      key = (uint32_t(maxID) << 16) | reg; // shift as unsigned: never moves a bit into an int's sign bit
+    }
+    const ir::Register getReg() const {
+      return (ir::Register)(key & 0xFFFF);
+    }
+    const uint16_t getMaxID() const {
+      return key >> 16;
+    }
+    uint32_t key;
+  } GenRegIntervalKey;
+
+  struct spillCmp { // orders GenRegIntervalKey values by their packed key, largest key first
+    bool operator () (const GenRegIntervalKey &lhs, const GenRegIntervalKey &rhs) const
+    { return lhs.key > rhs.key; }
+  };
+  // Set of interval keys kept in spillCmp order.
+  typedef set <GenRegIntervalKey, spillCmp> SpillSet;
+
+  class SpillCandidateSet : public SpillSet // SpillSet addressed by interval instead of by raw key
+  {
+  public:
+    std::set<GenRegIntervalKey, spillCmp>::iterator find(GenRegInterval interval) { // NOTE(review): presumably the same type as SpillSet::iterator - confirm
+      GenRegIntervalKey key(interval.reg, interval.maxID); // key is rebuilt from (reg, maxID) on every call
+      return SpillSet::find(key);
+    }
+    void insert(GenRegInterval interval) { // adds the key built from the interval's (reg, maxID)
+      GenRegIntervalKey key(interval.reg, interval.maxID);
+      SpillSet::insert(key);
+    }
+    void erase(GenRegInterval interval) { // removes the key built from the interval's (reg, maxID)
+      GenRegIntervalKey key(interval.reg, interval.maxID);
+      SpillSet::erase(key);
+    }
+  };
 
   /*! Implements the register allocation */
   class GenRegAllocator::Opaque
@@ -56,6 +107,20 @@ namespace gbe
     GenRegister genReg(const GenRegister &reg);
     /*! Output the register allocation */
     void outputAllocation(void);
+    INLINE void getRegAttrib(ir::Register reg, uint32_t &regSize, ir::RegisterFamily *regFamily = NULL) const {
+      // Writes reg's size to regSize and, if regFamily is non-NULL, its family; both tables are indexed by family.
+      // Note that byte vector registers use two bytes per byte (and can be interleaved)
+      static const size_t familyVectorSize[] = {2,2,2,4,8};
+      static const size_t familyScalarSize[] = {2,1,2,4,8};
+      using namespace ir;
+      const bool isScalar = ctx.sel->isScalarOrBool(reg);
+      const RegisterData regData = ctx.sel->getRegisterData(reg);
+      const RegisterFamily family = regData.family;
+      const uint32_t typeSize = isScalar ? familyScalarSize[family] : familyVectorSize[family];
+      regSize = isScalar ? typeSize : ctx.getSimdWidth() * typeSize; // non-scalars take one element per SIMD lane
+      if (regFamily != NULL)
+        *regFamily = family;
+    }
   private:
     /*! Expire one GRF interval. Return true if one was successfully expired */
     bool expireGRF(const GenRegInterval &limit);
@@ -84,6 +149,8 @@ namespace gbe
     GenContext &ctx;
     /*! Map virtual registers to offset in the (physical) register file */
     map<ir::Register, uint32_t> RA;
+    /*! Map offset to virtual registers. */
+    map<uint32_t, ir::Register> offsetReg;
     /*! Provides the position of each register in a vector */
     map<ir::Register, VectorLocation> vectorMap;
     /*! All vectors used in the selection */
@@ -97,29 +164,25 @@ namespace gbe
     /*! Intervals sorting based on ending point positions */
     vector<GenRegInterval*> ending;
     /*! registers that are spilled */
-    set<ir::Register> spilled;
+    SpilledRegs spilledRegs;
+    /*! register which could be spilled.*/
+    SpillCandidateSet spillCandidate;
     /* reserved registers for register spill/reload */
     uint32_t reservedReg;
     /*! Current vector to expire */
     uint32_t expiringID;
+    INLINE void insertNewReg(ir::Register reg, uint32_t grfOffset, bool isVector = false);
+    INLINE bool expireReg(ir::Register reg);
+    INLINE bool spillAtInterval(GenRegInterval interval, int size, uint32_t alignment);
+    INLINE uint32_t allocateReg(GenRegInterval interval, uint32_t size, uint32_t alignment);
+    INLINE bool spillReg(GenRegInterval interval, bool isAllocated = false);
+    INLINE bool spillReg(ir::Register reg, bool isAllocated = false);
+    INLINE bool vectorCanSpill(SelectionVector *vector);
+
     /*! Use custom allocator */
     GBE_CLASS(Opaque);
   };
 
-  // Note that byte vector registers use two bytes per byte (and can be
-  // interleaved)
-  static const size_t familyVectorSize[] = {2,2,2,4,8};
-  static const size_t familyScalarSize[] = {2,1,2,4,8};
-
-  /*! Interval as used in linear scan allocator. Basically, stores the first and
-   *  the last instruction where the register is alive
-   */
-  struct GenRegInterval {
-    INLINE GenRegInterval(ir::Register reg) :
-      reg(reg), minID(INT_MAX), maxID(-INT_MAX) {}
-    ir::Register reg;     //!< (virtual) register of the interval
-    int32_t minID, maxID; //!< Starting and ending points
-  };
 
   GenRegAllocator::Opaque::Opaque(GenContext &ctx) : ctx(ctx) {}
   GenRegAllocator::Opaque::~Opaque(void) {}
@@ -140,8 +203,7 @@ namespace gbe
   INLINE void GenRegAllocator::Opaque::allocatePayloadRegs(void) {
     using namespace ir;
     for(auto &it : this->ctx.curbeRegs)
-      if (it.first.value() < 0x8000)
-        allocatePayloadReg(it.first, it.second);
+      allocatePayloadReg(it.first, it.second);
 
     // Allocate all pushed registers (i.e. structure kernel arguments)
     const Function &fn = ctx.getFunction();
@@ -163,22 +225,19 @@ namespace gbe
   bool GenRegAllocator::Opaque::createGenReg(const GenRegInterval &interval) {
     using namespace ir;
     const ir::Register reg = interval.reg;
-    const uint32_t simdWidth = ctx.getSimdWidth();
     if (RA.contains(reg) == true)
       return true; // already allocated
     GBE_ASSERT(ctx.isScalarReg(reg) == false);
-    const bool isScalar = ctx.sel->isScalarOrBool(reg);
-    const RegisterData regData = ctx.sel->getRegisterData(reg);
-    const RegisterFamily family = regData.family;
-    const uint32_t typeSize = isScalar ? familyScalarSize[family] : familyVectorSize[family];
-    const uint32_t regSize = isScalar ? typeSize : simdWidth*typeSize;
-    uint32_t grfOffset;
-    while ((grfOffset = ctx.allocate(regSize, regSize)) == 0) {
-      const bool success = this->expireGRF(interval);
-      if (UNLIKELY(success == false)) return false;
+    uint32_t regSize;
+    ir::RegisterFamily family;
+    getRegAttrib(reg, regSize, &family);
+    uint32_t grfOffset = allocateReg(interval, regSize, regSize);
+    if (grfOffset == 0) {
+      /* this register is going to be spilled. */
+      GBE_ASSERT(!(reservedReg && family != ir::FAMILY_DWORD));
+      return false;
     }
-    GBE_ASSERTM(grfOffset != 0, "Unable to register allocate");
-    RA.insert(std::make_pair(reg, grfOffset));
+    insertNewReg(reg, grfOffset);
     return true;
   }
 
@@ -276,6 +335,7 @@ namespace gbe
   }
 
   bool GenRegAllocator::Opaque::expireGRF(const GenRegInterval &limit) {
+    bool ret = false;
     while (this->expiringID != ending.size()) {
       const GenRegInterval *toExpire = this->ending[this->expiringID];
       const ir::Register reg = toExpire->reg;
@@ -287,34 +347,26 @@ namespace gbe
       }
 
       //ignore register that already spilled
-      if(spilled.contains(reg)) {
+      if(spilledRegs.find(reg) != spilledRegs.end()) {
         this->expiringID++;
         continue;
       }
       // Ignore booleans that were allocated with flags
-      // if (ctx.getRegisterFamily(reg) == ir::FAMILY_BOOL && !grfBooleans.contains(reg)) {
-      if (ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL) {
+      if (ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL && !grfBooleans.contains(reg)) {
         this->expiringID++;
         continue;
       }
 
       if (toExpire->maxID >= limit.minID)
-        return false;
-      auto it = RA.find(reg);
-      GBE_ASSERT(it != RA.end());
-      // offset less than 32 means it is not managed by our reg allocator.
-      if (it->second < 32) {
-        this->expiringID++;
-        continue;
-      }
-      // Case 1 - it does not belong to a vector. Just remove it
-        ctx.deallocate(it->second);
-        this->expiringID++;
-        return true;
+        break;
+
+      if (expireReg(reg))
+        ret = true;
+      this->expiringID++;
     }
 
     // We were not able to expire anything
-    return false;
+    return ret;
   }
 
   void GenRegAllocator::Opaque::allocateFlags(Selection &selection) {
@@ -384,9 +436,10 @@ namespace gbe
             grfBooleans.insert(spill.reg);
             spill = interval;
           }
-          // We will a grf for the current register
-          else
+          // We will use a grf for the current register
+          else {
             grfBooleans.insert(reg);
+          }
         }
         else
           allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum]));
@@ -481,42 +534,30 @@ namespace gbe
       if (RA.contains(reg))
         continue; // already allocated
 
+      if (ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL && !grfBooleans.contains(reg))
+        continue;
+
       // Case 1: the register belongs to a vector, allocate all the registers in
       // one piece
       auto it = vectorMap.find(reg);
       if (it != vectorMap.end()) {
         const SelectionVector *vector = it->second.first;
         // all the reg in the SelectionVector are spilled
-        if(spilled.contains(vector->reg[0].reg()))
+        if(spilledRegs.find(vector->reg[0].reg())
+           != spilledRegs.end())
           continue;
-        const uint32_t simdWidth = ctx.getSimdWidth();
-
-        const ir::RegisterData regData = ctx.sel->getRegisterData(reg);
-        const ir::RegisterFamily family = regData.family;
-        const uint32_t typeSize = familyVectorSize[family];
-        const uint32_t alignment = simdWidth*typeSize;
 
+        uint32_t alignment;
+        ir::RegisterFamily family;
+        getRegAttrib(reg, alignment, &family);
         const uint32_t size = vector->regNum * alignment;
-
-        uint32_t grfOffset;
-        while ((grfOffset = ctx.allocate(size, alignment)) == 0) {
-          const bool success = this->expireGRF(interval);
-          if (success == false) {
-            // if no spill support, just return false, else simply spill the register
-            if(reservedReg == 0) return false;
-            break;
-          }
-        }
+        const uint32_t grfOffset = allocateReg(interval, size, alignment);
         if(grfOffset == 0) {
-          // spill all the registers in the SelectionVector
-          // the tricky here is I need to use reservedReg+1 as scratch write payload.
-          // so, i need to write the first register to scratch memory first.
-          // the spillReg() will just append scratch write insn after the def. To spill
-          // the first register, need to call spillReg() last for the vector->reg[0]
+          GBE_ASSERT(!(reservedReg && family != ir::FAMILY_DWORD));
           GBE_ASSERT(vector->regNum < RESERVED_REG_NUM_FOR_SPILL);
           for(int i = vector->regNum-1; i >= 0; i--) {
-            spilled.insert(vector->reg[i].reg());
-            selection.spillReg(vector->reg[i].reg(), reservedReg);
+            if (!spillReg(vector->reg[i].reg()))
+              return false;
           }
           continue;
         }
@@ -524,20 +565,181 @@ namespace gbe
           const ir::Register reg = vector->reg[regID].reg();
           GBE_ASSERT(RA.contains(reg) == false
                      && ctx.sel->getRegisterData(reg).family == family);
-          RA.insert(std::make_pair(reg, grfOffset + alignment * regID));
+          insertNewReg(reg, grfOffset + alignment * regID, true);
           ctx.splitBlock(grfOffset, alignment * regID);  //splitBlock will not split if regID == 0
         }
       }
       // Case 2: This is a regular scalar register, allocate it alone
       else if (this->createGenReg(interval) == false) {
-        if(reservedReg == 0) return false;
-        spilled.insert(reg);
-        selection.spillReg(reg, reservedReg);
+        if (!spillReg(interval))
+          return false;
+      }
+    }
+    if (!spilledRegs.empty()) {
+      GBE_ASSERT(reservedReg != 0);
+      bool success = selection.spillRegs(spilledRegs, reservedReg);
+      if (!success) {
+        std::cerr << "Fail to spill registers." << std::endl;
+        return false;
       }
     }
     return true;
   }
 
+  // Free the GRF range of `reg` and drop its spill bookkeeping; false if unmanaged.
+  INLINE bool GenRegAllocator::Opaque::expireReg(ir::Register reg)
+  {
+    const auto allocated = RA.find(reg);
+    GBE_ASSERT(allocated != RA.end());
+    const auto grfOffset = allocated->second;
+    // Offsets below 32 are not managed by our register allocator.
+    if (grfOffset < 32)
+      return false;
+
+    ctx.deallocate(grfOffset);
+    // Keep the spill candidate set and the offset --> reg map up to date.
+    if (reservedReg != 0 && spillCandidate.find(intervals[reg]) != spillCandidate.end()) {
+      spillCandidate.erase(intervals[reg]);
+      offsetReg.erase(grfOffset);
+    }
+    return true;
+  }
+
+  // Record `reg` at `grfOffset` in the RA map; when reservedReg != 0, also
+  // track it as a spill candidate if it is a GEN_REG_SIZE-sized DWORD register.
+  INLINE void GenRegAllocator::Opaque::insertNewReg(ir::Register reg, uint32_t grfOffset, bool isVector)
+  {
+    RA.insert(std::make_pair(reg, grfOffset));
+    if (reservedReg == 0)
+      return;
+
+    uint32_t bytes;
+    ir::RegisterFamily regFamily;
+    getRegAttrib(reg, bytes, &regFamily);
+    if (bytes != GEN_REG_SIZE || regFamily != ir::FAMILY_DWORD /*|| isVector*/)
+      return;
+
+    // Each GRF offset may map to at most one register.
+    GBE_ASSERT(offsetReg.find(grfOffset) == offsetReg.end());
+    offsetReg.insert(std::make_pair(grfOffset, reg));
+    spillCandidate.insert(intervals[reg]);
+  }
+
+  INLINE bool GenRegAllocator::Opaque::spillReg(ir::Register reg,
+                                                bool isAllocated) {
+    return spillReg(intervals[reg], isAllocated); // delegate to the interval overload
+  }
+
+  // Mark `interval.reg` as spilled; returns false only when reservedReg == 0.
+  INLINE bool GenRegAllocator::Opaque::spillReg(GenRegInterval interval,
+                                                bool isAllocated) {
+    if (reservedReg == 0)
+      return false;
+    SpillRegTag tag;
+    tag.isTmpReg = (interval.minID == interval.maxID);
+    if (tag.isTmpReg)
+      tag.addr = -1;
+    else {
+      // FIXME, we can optimize scratch allocation according to
+      // the interval information.
+      tag.addr = ctx.allocateScratchMem(typeSize(GEN_TYPE_D) * ctx.getSimdWidth());
+    }
+    if (isAllocated) {
+      // An allocated register must be expired and erased from the RA map.
+      const bool expired = expireReg(interval.reg);
+      GBE_ASSERT(expired);
+      RA.erase(interval.reg);
+    }
+    spilledRegs.insert(std::make_pair(interval.reg, tag));
+    return true;
+  }
+
+  // True only if every register of `vector` is currently a spill candidate.
+  INLINE bool GenRegAllocator::Opaque::vectorCanSpill(SelectionVector *vector) {
+    uint32_t idx = 0;
+    for (; idx < vector->regNum; ++idx)
+      if (spillCandidate.find(intervals[(ir::Register)(vector->reg[idx]).value.reg]) == spillCandidate.end()) break;
+    return idx == vector->regNum;
+  }
+  // Spill a run of allocated candidates covering `size` bytes; returns false if nothing was spilled.
+  INLINE bool GenRegAllocator::Opaque::spillAtInterval(GenRegInterval interval,
+                                                       int size,
+                                                       uint32_t alignment) {
+    if (reservedReg == 0)
+      return false;
+    auto it = spillCandidate.begin();
+    // Give up when there is no spill candidate, or when the first candidate's
+    // maxID does not exceed interval.maxID and alignment == GEN_REG_SIZE (the
+    // current register is itself spillable): the caller then spills that register.
+    if (it == spillCandidate.end()
+        || (it->getMaxID() <= interval.maxID && alignment == GEN_REG_SIZE))
+      return false;
+
+    ir::Register reg = it->getReg();
+    set<ir::Register> spillSet;
+    int32_t savedSize = size; // restored whenever the search restarts
+    while(size > 0) {
+      auto vectorIt = vectorMap.find(reg);
+      bool isVector = vectorIt != vectorMap.end();
+      bool needRestart = false;
+      if (isVector
+          && (vectorCanSpill(vectorIt->second.first))) {
+        const SelectionVector *vector = vectorIt->second.first;
+        for (uint32_t id = 0; id < vector->regNum; id++) {
+          GBE_ASSERT(spilledRegs.find(vector->reg[id].reg())
+                     == spilledRegs.end());
+          spillSet.insert(vector->reg[id].reg());
+          reg = vector->reg[id].reg();
+          size -= GEN_REG_SIZE;
+        }
+      } else if (!isVector) {
+        spillSet.insert(reg);
+        size -= GEN_REG_SIZE;
+      } else
+        needRestart = true; // a vector that cannot be spilled as a whole.
+
+      if (size <= 0)
+        break;
+      if (!needRestart) {
+        uint32_t offset = RA.find(reg)->second; // NOTE(review): assumes reg is present in RA
+        auto nextRegIt = offsetReg.find(offset + GEN_REG_SIZE);
+        if (nextRegIt != offsetReg.end())
+          reg = nextRegIt->second;
+        else
+          needRestart = true;
+      }
+
+      if (needRestart) {
+        // The run cannot be extended from here (no entry in offsetReg at the next
+        // offset, or an unspillable vector): move to the next candidate and start over.
+        it++;
+        if (it == spillCandidate.end())
+          return false;
+        reg = it->getReg();
+        size = savedSize;
+        spillSet.clear();
+      }
+    }
+
+    for(auto spillreg : spillSet)
+      spillReg(spillreg, true); // true: these registers are allocated and must be expired
+    return true;
+  }
+
+  // Allocate `size` bytes of GRF, expiring or spilling registers on failure.
+  // Returns the offset, or 0 when nothing more can be freed.
+  INLINE uint32_t GenRegAllocator::Opaque::allocateReg(GenRegInterval interval,
+                                                       uint32_t size,
+                                                       uint32_t alignment) {
+    for (;;) {
+      const uint32_t grfOffset = ctx.allocate(size, alignment);
+      if (grfOffset != 0)
+        return grfOffset;
+      if (!this->expireGRF(interval) && !spillAtInterval(interval, size, alignment))
+        return 0;
+    }
+  }
+
   INLINE bool GenRegAllocator::Opaque::allocate(Selection &selection) {
     using namespace ir;
     if (ctx.getSimdWidth() == 8) {
@@ -569,6 +771,7 @@ namespace gbe
     int32_t insnID = 0;
     for (auto &block : *selection.blockList) {
       int32_t lastID = insnID;
+      int32_t firstID = insnID;
       // Update the intervals of each used register. Note that we do not
       // register allocate R0, so we skip all sub-registers in r0
       for (auto &insn : block.insnList) {
@@ -619,16 +822,29 @@ namespace gbe
         insnID++;
       }
 
+      // All registers alive at the beginning of the block must update their intervals.
+      const ir::BasicBlock *bb = block.bb;
+      for (auto reg : ctx.getLiveIn(bb))
+          this->intervals[reg].minID = std::min(this->intervals[reg].minID, firstID);
+
+      for (auto reg : ctx.getExtraLiveIn(bb))
+          this->intervals[reg].minID = std::min(this->intervals[reg].minID, firstID);
       // All registers alive at the end of the block must have their intervals
       // updated as well
-      const ir::BasicBlock *bb = block.bb;
-      const ir::Liveness::LiveOut &liveOut = ctx.getLiveOut(bb);
-      for (auto reg : liveOut) {
-        this->intervals[reg].minID = std::min(this->intervals[reg].minID, lastID);
+      for (auto reg : ctx.getLiveOut(bb))
+        this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, lastID);
+
+      for (auto reg : ctx.getExtraLiveOut(bb))
         this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, lastID);
-      }
     }
 
+    this->intervals[ocl::emask].minID = 0;
+    this->intervals[ocl::emask].maxID = INT_MAX;
+    this->intervals[ocl::notemask].minID = 0;
+    this->intervals[ocl::notemask].maxID = INT_MAX;
+    this->intervals[ocl::retVal].minID = INT_MAX;
+    this->intervals[ocl::retVal].maxID = -INT_MAX;
+
     // Sort both intervals in starting point and ending point increasing orders
     const uint32_t regNum = ctx.sel->getRegNum();
     this->starting.resize(regNum);
@@ -657,19 +873,40 @@ namespace gbe
   }
 
   INLINE void GenRegAllocator::Opaque::outputAllocation(void) {
-    std::cout << "## register allocation ##" << std::endl;
+    using namespace std;
+    cout << "## register allocation ##" << endl;
     for(auto &i : RA) {
-        int vReg = (int)i.first;
-        int offst = (int)i.second / sizeof(float);
-        int reg = offst / 8;
-        int subreg = offst % 8;
-        std::cout << "%" << vReg << " g" << reg << "." << subreg << "D" << std::endl;
+        ir::Register vReg = (ir::Register)i.first;
+        ir::RegisterFamily family;
+        uint32_t regSize;
+        getRegAttrib(vReg, regSize, &family);
+        int offst = (int)i.second;// / sizeof(float);
+        int reg = offst / 32;
+        int subreg = (offst % 32) / regSize;
+        cout << "%" << setiosflags(ios::left) << setw(8) << vReg
+             << "g" << setiosflags(ios::left) << setw(3) << reg << "."
+             << setiosflags(ios::left) << setw(3) << subreg << ir::getFamilyName(family)
+             << "  " << setw(-3) << regSize  << "B\t"
+             << "[  " << setw(8) << this->intervals[(uint)vReg].minID
+             << " -> " << setw(8) << this->intervals[(uint)vReg].maxID
+             << "]" << endl;
+    }
+    if (!spilledRegs.empty())
+      cout << "## spilled registers: " << spilledRegs.size() << endl;
+    for(auto it = spilledRegs.begin(); it != spilledRegs.end(); it++) {
+      ir::Register vReg = it->first;
+      ir::RegisterFamily family;
+      uint32_t regSize;
+      getRegAttrib(vReg, regSize, &family);
+      cout << "%" << setiosflags(ios::left) << setw(8) << vReg
+           << "@" << setw(8) << it->second.addr
+           << "  " << ir::getFamilyName(family)
+           <<  "  " << setw(-3) << regSize << "B\t"
+           << "[  " << setw(8) << this->intervals[(uint)vReg].minID
+           << " -> " << setw(8) << this->intervals[(uint)vReg].maxID
+           << "]" << endl;
     }
-    std::set<ir::Register>::iterator is;
-    std::cout << "## spilled registers:" << std::endl;
-    for(is = spilled.begin(); is != spilled.end(); is++)
-      std::cout << (int)*is << std::endl;
-    std::cout << std::endl;
+    cout << endl;
   }
 
   INLINE GenRegister setGenReg(const GenRegister &src, uint32_t grfOffset) {
diff --git a/backend/src/backend/gen_reg_allocation.hpp b/backend/src/backend/gen_reg_allocation.hpp
index 5541304..84b0f9c 100644
--- a/backend/src/backend/gen_reg_allocation.hpp
+++ b/backend/src/backend/gen_reg_allocation.hpp
@@ -35,6 +35,13 @@ namespace gbe
   struct GenRegInterval; // Liveness interval for each register
   class GenContext;     // Gen specific context
 
+  typedef struct SpillRegTag { // per-register information recorded when a register is spilled
+    bool isTmpReg; // set when the register's live interval has maxID == minID
+    int32_t addr;  // scratch memory address of the spilled register, or -1 when isTmpReg
+  } SpillRegTag;
+
+  typedef map<ir::Register, SpillRegTag> SpilledRegs;
+
   /*! Register allocate (i.e. virtual to physical register mapping) */
   class GenRegAllocator
   {
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
index 538f16c..8794318 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -185,6 +185,7 @@ namespace gbe
       this->type = type;
       this->file = file;
       this->physical = 0;
+      this->subphysical = 0;
       this->value.reg = reg;
       this->negation = 0;
       this->absolute = 0;
@@ -209,6 +210,7 @@ namespace gbe
       this->file = file;
       this->nr = nr;
       this->physical = 1;
+      this->subphysical = 1;
       this->subnr = subnr * typeSize(type);
       this->negation = 0;
       this->absolute = 0;
@@ -268,15 +270,17 @@ namespace gbe
     INLINE GenRegister top_half(void) const {
       GenRegister r = bottom_half();
       r.subnr += 4;
+      r.nr += r.subnr / 32;
+      r.subnr %= 32;
       return r;
     }
 
     INLINE GenRegister bottom_half(void) const {
       GBE_ASSERT(isint64());
-      GenRegister r = *this;
+      GenRegister r = h2(*this);
       r.type = type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D;
-      r.hstride = GEN_HORIZONTAL_STRIDE_2;
-      r.vstride = GEN_VERTICAL_STRIDE_16;
+      if(r.vstride != GEN_VERTICAL_STRIDE_0)
+       r.vstride = GEN_VERTICAL_STRIDE_16;
       return r;
     }
 
@@ -293,6 +297,8 @@ namespace gbe
     }
 
     INLINE int flag_nr(void) const {
+      assert(file == GEN_ARCHITECTURE_REGISTER_FILE);
+      assert(nr >= GEN_ARF_FLAG && nr < GEN_ARF_FLAG + 2);
       return nr & 15;
     }
 
@@ -302,7 +308,8 @@ namespace gbe
 
     static INLINE GenRegister h2(GenRegister reg) {
       GenRegister r = reg;
-      r.hstride = GEN_HORIZONTAL_STRIDE_2;
+      if(r.hstride != GEN_HORIZONTAL_STRIDE_0)
+        r.hstride = GEN_HORIZONTAL_STRIDE_2;
       return r;
     }
 
@@ -756,7 +763,7 @@ namespace gbe
 
     static INLINE GenRegister suboffset(GenRegister reg, uint32_t delta) {
       if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
-        reg.subnr += delta * typeSize(reg.type);
+        reg.subnr += delta * typeSize(reg.type) * hstride_size(reg);
         reg.nr += reg.subnr / 32;
         reg.subnr %= 32;
       }
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 937f95b..2492a8b 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -32,6 +32,8 @@
 #include "ir/unit.hpp"
 #include "llvm/llvm_to_gen.hpp"
 #include "llvm/Config/config.h"
+#include "llvm/Support/Threading.h"
+#include "llvm/Support/ManagedStatic.h"
 #include <cstring>
 #include <algorithm>
 #include <fstream>
@@ -39,6 +41,7 @@
 #include <sstream>
 #include <iostream>
 #include <unistd.h>
+#include <mutex>
 
 /* Not defined for LLVM 3.0 */
 #if !defined(LLVM_VERSION_MAJOR)
@@ -99,9 +102,9 @@ namespace gbe {
 
   BVAR(OCL_OUTPUT_GEN_IR, false);
 
-  bool Program::buildFromLLVMFile(const char *fileName, std::string &error) {
+  bool Program::buildFromLLVMFile(const char *fileName, std::string &error, int optLevel) {
     ir::Unit unit;
-    if (llvmToGen(unit, fileName) == false) {
+    if (llvmToGen(unit, fileName, optLevel) == false) {
       error = std::string(fileName) + " not found";
       return false;
     }
@@ -120,6 +123,7 @@ namespace gbe {
       Kernel *kernel = this->compileKernel(unit, name);
       kernel->setSamplerSet(pair.second->getSamplerSet());
       kernel->setImageSet(pair.second->getImageSet());
+      kernel->setCompileWorkGroupSize(pair.second->getCompileWorkGroupSize());
       kernels.insert(std::make_pair(name, kernel));
     }
     return true;
@@ -227,6 +231,7 @@ namespace gbe {
       KernelArgument& arg = args[i];
       OUT_UPDATE_SZ(arg.type);
       OUT_UPDATE_SZ(arg.size);
+      OUT_UPDATE_SZ(arg.align);
       OUT_UPDATE_SZ(arg.bufSize);
     }
 
@@ -247,7 +252,9 @@ namespace gbe {
     OUT_UPDATE_SZ(scratchSize);
     OUT_UPDATE_SZ(useSLM);
     OUT_UPDATE_SZ(slmSize);
-
+    OUT_UPDATE_SZ(compileWgSize[0]);
+    OUT_UPDATE_SZ(compileWgSize[1]);
+    OUT_UPDATE_SZ(compileWgSize[2]);
     /* samplers. */
     if (samplerSet) {
       has_samplerset = 1;
@@ -313,6 +320,7 @@ namespace gbe {
       KernelArgument& arg = args[i];
       IN_UPDATE_SZ(arg.type);
       IN_UPDATE_SZ(arg.size);
+      IN_UPDATE_SZ(arg.align);
       IN_UPDATE_SZ(arg.bufSize);
     }
 
@@ -336,6 +344,9 @@ namespace gbe {
     IN_UPDATE_SZ(scratchSize);
     IN_UPDATE_SZ(useSLM);
     IN_UPDATE_SZ(slmSize);
+    IN_UPDATE_SZ(compileWgSize[0]);
+    IN_UPDATE_SZ(compileWgSize[1]);
+    IN_UPDATE_SZ(compileWgSize[2]);
 
     IN_UPDATE_SZ(has_samplerset);
     if (has_samplerset) {
@@ -413,6 +424,7 @@ namespace gbe {
     outs << spaces_nl << "  scratchSize: " << scratchSize << "\n";
     outs << spaces_nl << "  useSLM: " << useSLM << "\n";
     outs << spaces_nl << "  slmSize: " << slmSize << "\n";
+    outs << spaces_nl << "  compileWgSize: " << compileWgSize[0] << " " << compileWgSize[1] << " " << compileWgSize[2] << "\n";
 
     outs << spaces_nl << "  Argument Number is " << argNum << "\n";
     for (uint32_t i = 0; i < argNum; i++) {
@@ -420,6 +432,7 @@ namespace gbe {
       outs << spaces_nl << "  Arg " << i << ":\n";
       outs << spaces_nl << "      type value: "<< arg.type << "\n";
       outs << spaces_nl << "      size: "<< arg.size << "\n";
+      outs << spaces_nl << "      align: "<< arg.align << "\n";
       outs << spaces_nl << "      bufSize: "<< arg.bufSize << "\n";
     }
 
@@ -451,28 +464,36 @@ namespace gbe {
     GBE_SAFE_DELETE(program);
   }
 
-  static void buildModuleFromSource(const char* input, const char* output, std::string options) {
+  BVAR(OCL_OUTPUT_BUILD_LOG, false);
+  SVAR(OCL_PCH_PATH, PCH_OBJECT_DIR);
+  SVAR(OCL_PCM_PATH, PCM_OBJECT_DIR);
+
+  static bool buildModuleFromSource(const char* input, const char* output, std::string options,
+                                    size_t stringSize, char *err, size_t *errSize) {
     // Arguments to pass to the clang frontend
     vector<const char *> args;
-    bool bOpt = true;
     bool bFastMath = false;
 
     vector<std::string> useless; //hold substrings to avoid c_str free
     size_t start = 0, end = 0;
-    /* clang unsupport options:
+    /* FIXME
+       clang unsupport options:
        -cl-denorms-are-zero, -cl-strict-aliasing
        -cl-no-signed-zeros, -cl-fp32-correctly-rounded-divide-sqrt
        all support options, refer to clang/include/clang/Driver/Options.inc
-       Maybe can filter these options to avoid warning
     */
+    //Handle -cl-opt-disable in llvmToGen, skip here
+    const std::string unsupportedOptions("-cl-denorms-are-zero, -cl-strict-aliasing, -cl-opt-disable,"
+                                         "-cl-no-signed-zeros, -cl-fp32-correctly-rounded-divide-sqrt");
     while (end != std::string::npos) {
       end = options.find(' ', start);
       std::string str = options.substr(start, end - start);
       start = end + 1;
       if(str.size() == 0)
         continue;
-      if(str == "-cl-opt-disable") bOpt = false;
       if(str == "-cl-fast-relaxed-math") bFastMath = true;
+      if(unsupportedOptions.find(str) != std::string::npos)
+        continue;
       useless.push_back(str);
       args.push_back(str.c_str());
     }
@@ -482,11 +503,10 @@ namespace gbe {
     args.push_back("-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND");
 #endif
     args.push_back("-emit-llvm");
-    // XXX we haven't implement those builtin functions,
+    // FIXME we haven't implement those builtin functions,
     // so disable it currently.
     args.push_back("-fno-builtin");
-    if(bOpt)
-      args.push_back("-O2");
+    args.push_back("-disable-llvm-optzns");
     if(bFastMath)
       args.push_back("-D __FAST_RELAXED_MATH__=1");
 #if LLVM_VERSION_MINOR <= 2
@@ -501,24 +521,26 @@ namespace gbe {
     args.push_back(input);
 
     // The compiler invocation needs a DiagnosticsEngine so it can report problems
+    std::string ErrorString;
+    llvm::raw_string_ostream ErrorInfo(ErrorString);
+    llvm::IntrusiveRefCntPtr<clang::DiagnosticOptions> DiagOpts = new clang::DiagnosticOptions();
+    DiagOpts->ShowCarets = false;
 #if LLVM_VERSION_MINOR <= 1
     args.push_back("-triple");
     args.push_back("ptx32");
 
     clang::TextDiagnosticPrinter *DiagClient =
-                             new clang::TextDiagnosticPrinter(llvm::errs(), clang::DiagnosticOptions());
+                             new clang::TextDiagnosticPrinter(ErrorInfo, *DiagOpts);
     llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> DiagID(new clang::DiagnosticIDs());
     clang::DiagnosticsEngine Diags(DiagID, DiagClient);
 #else
     args.push_back("-ffp-contract=off");
 
-    llvm::IntrusiveRefCntPtr<clang::DiagnosticOptions> DiagOpts = new clang::DiagnosticOptions();
     clang::TextDiagnosticPrinter *DiagClient =
-                             new clang::TextDiagnosticPrinter(llvm::errs(), &*DiagOpts);
+                             new clang::TextDiagnosticPrinter(ErrorInfo, &*DiagOpts);
     llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> DiagID(new clang::DiagnosticIDs());
     clang::DiagnosticsEngine Diags(DiagID, &*DiagOpts, DiagClient);
 #endif /* LLVM_VERSION_MINOR <= 1 */
-
     // Create the compiler invocation
     llvm::OwningPtr<clang::CompilerInvocation> CI(new clang::CompilerInvocation);
     clang::CompilerInvocation::CreateFromArgs(*CI,
@@ -533,15 +555,20 @@ namespace gbe {
 #if LLVM_VERSION_MINOR <= 2
     Clang.createDiagnostics(args.size(), &args[0]);
 #else
-    Clang.createDiagnostics();
+    Clang.createDiagnostics(DiagClient, false);
 #endif /* LLVM_VERSION_MINOR <= 2 */
+
+    Clang.getDiagnosticOpts().ShowCarets = false;
     if (!Clang.hasDiagnostics())
-      return;
+      return false;
 
     // Set Language
     clang::LangOptions & lang_opts = Clang.getLangOpts();
     lang_opts.OpenCL = 1;
 
+    clang::PreprocessorOptions& prep_opt = Clang.getPreprocessorOpts();
+    prep_opt.DisablePCHValidation = 1;
+
     //llvm flags need command line parsing to take effect
     if (!Clang.getFrontendOpts().LLVMArgs.empty()) {
       unsigned NumArgs = Clang.getFrontendOpts().LLVMArgs.size();
@@ -557,26 +584,63 @@ namespace gbe {
 
     // Create an action and make the compiler instance carry it out
     llvm::OwningPtr<clang::CodeGenAction> Act(new clang::EmitLLVMOnlyAction());
-    sem_wait(&llvm_semaphore);
+
+    std::string dirs = OCL_PCM_PATH;
+    std::string pcmFileName;
+    std::istringstream idirs(dirs);
+    bool findPcm = false;
+
+    while (getline(idirs, pcmFileName, ':')) {
+      if(access(pcmFileName.c_str(), R_OK) == 0) {
+        findPcm = true;
+        break;
+      }
+    }
+
+    GBE_ASSERT(findPcm && "Could not find pre compiled module library.\n");
+
+    Clang.getCodeGenOpts().LinkBitcodeFile = pcmFileName;
     auto retVal = Clang.ExecuteAction(*Act);
-    sem_post(&llvm_semaphore);
+
+    if (err != NULL) {
+      GBE_ASSERT(errSize != NULL);
+      *errSize = ErrorString.copy(err, stringSize - 1, 0);
+    }
+
+    if (err == NULL || OCL_OUTPUT_BUILD_LOG) {
+      // flush the error messages to the errs() if there is no
+      // error string buffer.
+      llvm::errs() << ErrorString;
+    }
+    ErrorString.clear();
     if (!retVal)
-      return;
+      return false;
 
     llvm::Module *module = Act->takeModule();
 
-    std::string ErrorInfo;
 #if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR > 3)
     auto mode = llvm::sys::fs::F_Binary;
 #else
     auto mode = llvm::raw_fd_ostream::F_Binary;
 #endif
-    llvm::raw_fd_ostream OS(output, ErrorInfo, mode);
+    llvm::raw_fd_ostream OS(output, ErrorString, mode);
     //still write to temp file for code simply, otherwise need add another function.
     //because gbe_program_new_from_llvm also be used by cl_program_create_from_llvm, can't be removed
     //TODO: Pass module to llvmToGen, if use module, should return Act and use OwningPtr out of this funciton
     llvm::WriteBitcodeToFile(module, OS);
+    if (err != NULL && *errSize < stringSize - 1 && ErrorString.size() > 0) {
+      size_t errLen;
+      errLen = ErrorString.copy(err + *errSize, stringSize - *errSize - 1, 0);
+      *errSize += errLen;
+    }
+
+    if (err == NULL || OCL_OUTPUT_BUILD_LOG) {
+      // flush the error messages to the errs() if there is no
+      // error string buffer.
+      llvm::errs() << ErrorString;
+    }
     OS.close();
+    return true;
   }
 
   extern std::string ocl_stdlib_str;
@@ -588,51 +652,142 @@ namespace gbe {
                                           char *err,
                                           size_t *errSize)
   {
-    char clStr[L_tmpnam+1], llStr[L_tmpnam+1];
-    const std::string clName = std::string(tmpnam_r(clStr)) + ".cl"; /* unsafe! */
-    const std::string llName = std::string(tmpnam_r(llStr)) + ".ll"; /* unsafe! */
-    std::string pchHeaderName;
+    char clStr[] = "/tmp/XXXXXX.cl";
+    char llStr[] = "/tmp/XXXXXX.ll";
+    int clFd = mkstemps(clStr, 3);
+    int llFd = mkstemps(llStr, 3);
+    close(llFd);
+    const std::string clName = std::string(clStr);
+    const std::string llName = std::string(llStr);
     std::string clOpt;
+    int optLevel = 1;
 
-    FILE *clFile = fopen(clName.c_str(), "w");
+    FILE *clFile = fdopen(clFd, "w");
     FATAL_IF(clFile == NULL, "Failed to open temporary file");
 
-    bool usePCH = false;
+    bool usePCH = OCL_USE_PCH;
+    bool findPCH = false;
+
+    /* Because our header file is so big, we want to avoid recompiling it from
+       scratch, so we use Clang's PCH support to save the huge compile time.
+       The PCH is built with the most general build options only, so if the user
+       passes new build options here, the PCH may fail Clang's compatibility
+       validation. Clang performs three kinds of compatibility checks: Language
+       Options, Target Options and Preprocessing Options. Other options, such as
+       CodeGen options, do not affect the AST, so they need no check.
+
+       According to OpenCL 1.1's spec, the CL build options:
+       -D name=definition
+       If the definition is not used in our header, it is compatible.
+
+       -cl-single-precision-constant
+       -cl-denorms-are-zero
+       -cl-std=
+       Language options, really affect.
+
+       -cl-opt-disable
+       -cl-mad-enable
+       -cl-no-signed-zeros
+       -cl-unsafe-math-optimizations
+       -cl-finite-math-only
+       -cl-fast-relaxed-math
+       CodeGen options, not affect
+
+       -Werror
+       -w
+       Our header should not block the compiling because of warning.
+
+       So we just disable the PCH validation of Clang and make the judgement ourselves. */
+
+    if(options) {
+      char *p;
+      /* FIXME: Although we can disable the PCH validation check and load the PCH
+         successfully, these language options and pre-defined macros still generate
+         diagnostics in Clang's diag engine and cause Clang to report an error.
+         We filter them all here to avoid that. */
+      const char * incompatible_opts[] = {
+          "-cl-single-precision-constant",
+//        "-cl-denorms-are-zero",
+          "-cl-fast-relaxed-math",
+          "-cl-std=",
+      };
+      const char * incompatible_defs[] = {
+          "GET_FLOAT_WORD",
+          "__NV_CL_C_VERSION",
+          "GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND"
+      };
+
+      for (unsigned int i = 0; i < sizeof(incompatible_opts)/sizeof(char *); i++ ) {
+        p = strstr(const_cast<char *>(options), incompatible_opts[i]);
+        if (p) {
+          usePCH = false;
+          break;
+        }
+      }
+
+      if (usePCH) {
+        for (unsigned int i = 0; i < sizeof(incompatible_defs)/sizeof(char *); i++ ) {
+          p = strstr(const_cast<char *>(options), incompatible_defs[i]);
+          if (p) {
+            usePCH = false;
+            break;
+          }
+        }
+      }
+
+      p = strstr(const_cast<char *>(options), "-cl-opt-disable");
+      if (p)
+        optLevel = 0;
 
-    if(options)
       clOpt += options;
+    }
 
-    if (options || !OCL_USE_PCH) {
-      /* Some building option may cause the prebuild pch header file
-         not compatible with the XXX.cl source. We need rebuild all here.*/
-      usePCH = false;
-    } else {
-      std::string dirs = PCH_OBJECT_DIR;
-      std::istringstream idirs(dirs);
+    std::string dirs = OCL_PCH_PATH;
+    std::istringstream idirs(dirs);
+    std::string pchFileName;
 
-      while (getline(idirs, pchHeaderName, ';')) {
-        if(access(pchHeaderName.c_str(), R_OK) == 0) {
-          usePCH = true;
-          break;
-        }
+    while (getline(idirs, pchFileName, ':')) {
+      if(access(pchFileName.c_str(), R_OK) == 0) {
+        findPCH = true;
+        break;
       }
     }
-    if (usePCH) {
+
+    if (usePCH && findPCH) {
       clOpt += " -include-pch ";
-      clOpt += pchHeaderName;
+      clOpt += pchFileName;
       clOpt += " ";
     } else
       fwrite(ocl_stdlib_str.c_str(), strlen(ocl_stdlib_str.c_str()), 1, clFile);
+
     // Write the source to the cl file
     fwrite(source, strlen(source), 1, clFile);
     fclose(clFile);
 
-    buildModuleFromSource(clName.c_str(), llName.c_str(), clOpt.c_str());
-    remove(clName.c_str());
-
+    gbe_program p;
+    if (buildModuleFromSource(clName.c_str(), llName.c_str(), clOpt.c_str(),
+                              stringSize, err, errSize)) {
     // Now build the program from llvm
-    gbe_program p = gbe_program_new_from_llvm(llName.c_str(), stringSize, err, errSize);
-    remove(llName.c_str());
+      static std::mutex gbe_mutex;
+      gbe_mutex.lock();
+      size_t clangErrSize = 0;
+      if (err != NULL) {
+        GBE_ASSERT(errSize != NULL);
+        stringSize -= *errSize;
+        err += *errSize;
+        clangErrSize = *errSize;
+      }
+      p = gbe_program_new_from_llvm(llName.c_str(), stringSize,
+                                    err, errSize, optLevel);
+      if (err != NULL)
+        *errSize += clangErrSize;
+      gbe_mutex.unlock();
+      if (OCL_OUTPUT_BUILD_LOG && options)
+        llvm::errs() << options;
+      remove(llName.c_str());
+    } else
+      p = NULL;
+    remove(clName.c_str());
     return p;
   }
 
@@ -696,6 +851,11 @@ namespace gbe {
     return kernel->getArgSize(argID);
   }
 
+  // Result of Kernel::getArgAlign(argID), or 0 when the kernel handle is NULL.
+  static uint32_t kernelGetArgAlign(gbe_kernel genKernel, uint32_t argID) {
+    const gbe::Kernel *impl = (const gbe::Kernel*) genKernel;
+    return impl == NULL ? 0u : impl->getArgAlign(argID);
+  }
   static gbe_arg_type kernelGetArgType(gbe_kernel genKernel, uint32_t argID) {
     if (genKernel == NULL) return GBE_ARG_INVALID;
     const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
@@ -762,6 +922,12 @@ namespace gbe {
     kernel->getSamplerData(samplers);
   }
 
+  // Forwards to Kernel::getCompileWorkGroupSize(wg_size); does nothing for a NULL handle.
+  static void kernelGetCompileWorkGroupSize(gbe_kernel gbeKernel, size_t wg_size[3]) {
+    const gbe::Kernel *impl = (const gbe::Kernel*) gbeKernel;
+    if (impl != NULL) impl->getCompileWorkGroupSize(wg_size);
+  }
+
   static size_t kernelGetImageSize(gbe_kernel gbeKernel) {
     if (gbeKernel == NULL) return 0;
     const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
@@ -790,6 +956,7 @@ namespace gbe {
 
 GBE_EXPORT_SYMBOL gbe_program_new_from_source_cb *gbe_program_new_from_source = NULL;
 GBE_EXPORT_SYMBOL gbe_program_new_from_binary_cb *gbe_program_new_from_binary = NULL;
+GBE_EXPORT_SYMBOL gbe_program_serialize_to_binary_cb *gbe_program_serialize_to_binary = NULL;
 GBE_EXPORT_SYMBOL gbe_program_new_from_llvm_cb *gbe_program_new_from_llvm = NULL;
 GBE_EXPORT_SYMBOL gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_size = NULL;
 GBE_EXPORT_SYMBOL gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data = NULL;
@@ -803,6 +970,7 @@ GBE_EXPORT_SYMBOL gbe_kernel_get_code_size_cb *gbe_kernel_get_code_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_arg_num_cb *gbe_kernel_get_arg_num = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_arg_size_cb *gbe_kernel_get_arg_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_arg_type_cb *gbe_kernel_get_arg_type = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_arg_align_cb *gbe_kernel_get_arg_align = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size = NULL;
@@ -814,6 +982,7 @@ GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_slm_size_cb *gbe_kernel_get_slm_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_compile_wg_size_cb *gbe_kernel_get_compile_wg_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data = NULL;
 GBE_EXPORT_SYMBOL gbe_set_image_base_index_cb *gbe_set_image_base_index = NULL;
@@ -838,6 +1007,7 @@ namespace gbe
       gbe_kernel_get_arg_num = gbe::kernelGetArgNum;
       gbe_kernel_get_arg_size = gbe::kernelGetArgSize;
       gbe_kernel_get_arg_type = gbe::kernelGetArgType;
+      gbe_kernel_get_arg_align = gbe::kernelGetArgAlign;
       gbe_kernel_get_simd_width = gbe::kernelGetSIMDWidth;
       gbe_kernel_get_curbe_offset = gbe::kernelGetCurbeOffset;
       gbe_kernel_get_curbe_size = gbe::kernelGetCurbeSize;
@@ -849,12 +1019,20 @@ namespace gbe
       gbe_kernel_get_slm_size = gbe::kernelGetSLMSize;
       gbe_kernel_get_sampler_size = gbe::kernelGetSamplerSize;
       gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
+      gbe_kernel_get_compile_wg_size = gbe::kernelGetCompileWorkGroupSize;
       gbe_kernel_get_image_size = gbe::kernelGetImageSize;
       gbe_kernel_get_image_data = gbe::kernelGetImageData;
       gbe_get_image_base_index = gbe::getImageBaseIndex;
       gbe_set_image_base_index = gbe::setImageBaseIndex;
       genSetupCallBacks();
-      genSetupLLVMSemaphore();
+      llvm::llvm_start_multithreaded();
+    }
+
+    ~CallBackInitializer() {
+      llvm::llvm_stop_multithreaded();
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR > 3)
+      llvm::llvm_shutdown();
+#endif
     }
   };
 
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 10fcc49..e6436c3 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -76,7 +76,10 @@ enum gbe_curbe_type {
   GBE_CURBE_KERNEL_ARGUMENT,
   GBE_CURBE_EXTRA_ARGUMENT,
   GBE_CURBE_BLOCK_IP,
-  GBE_CURBE_THREAD_NUM
+  GBE_CURBE_THREAD_NUM,
+  GBE_CURBE_EMASK,
+  GBE_CURBE_NOT_EMASK,
+  GBE_CURBE_BARRIER_MASK,
 };
 
 /*! Extra arguments use the negative range of sub-values */
@@ -122,11 +125,16 @@ extern gbe_program_new_from_source_cb *gbe_program_new_from_source;
 typedef gbe_program (gbe_program_new_from_binary_cb)(const char *binary, size_t size);
 extern gbe_program_new_from_binary_cb *gbe_program_new_from_binary;
 
+/*! Serialize a program to a binary */
+typedef size_t (gbe_program_serialize_to_binary_cb)(gbe_program program, char **binary);
+extern gbe_program_serialize_to_binary_cb *gbe_program_serialize_to_binary;
+
 /*! Create a new program from the given LLVM file */
 typedef gbe_program (gbe_program_new_from_llvm_cb)(const char *fileName,
                                                    size_t string_size,
                                                    char *err,
-                                                   size_t *err_size);
+                                                   size_t *err_size,
+                                                   int optLevel);
 extern gbe_program_new_from_llvm_cb *gbe_program_new_from_llvm;
 
 /*! Get the size of global constants */
@@ -145,6 +153,10 @@ extern gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size;
 typedef void (gbe_kernel_get_sampler_data_cb)(gbe_kernel gbeKernel, uint32_t *samplers);
 extern gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data;
 
+/*! Get the work group size required by the kernel attribute */
+typedef void (gbe_kernel_get_compile_wg_size_cb)(gbe_kernel gbeKernel, size_t wg_sz[3]);
+extern gbe_kernel_get_compile_wg_size_cb *gbe_kernel_get_compile_wg_size;
+
 /*! Destroy and deallocate the given program */
 typedef void (gbe_program_delete_cb)(gbe_program);
 extern gbe_program_delete_cb *gbe_program_delete;
@@ -185,6 +197,10 @@ extern gbe_kernel_get_arg_size_cb *gbe_kernel_get_arg_size;
 typedef enum gbe_arg_type (gbe_kernel_get_arg_type_cb)(gbe_kernel, uint32_t argID);
 extern gbe_kernel_get_arg_type_cb *gbe_kernel_get_arg_type;
 
+/*! Get the address alignment of the given argument */
+typedef uint32_t (gbe_kernel_get_arg_align_cb)(gbe_kernel, uint32_t argID);
+extern gbe_kernel_get_arg_align_cb *gbe_kernel_get_arg_align;
+
 /*! Get the simd width for the kernel */
 typedef uint32_t (gbe_kernel_get_simd_width_cb)(gbe_kernel);
 extern gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width;
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index 9b33b7c..e6fc411 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -47,6 +47,7 @@ namespace gbe {
   struct KernelArgument {
     gbe_arg_type type; //!< Pointer, structure, image, regular value?
     uint32_t size;     //!< Size of the argument
+    uint32_t align;    //!< addr alignment of the argument
     uint32_t bufSize;  //!< Contant buffer size
   };
 
@@ -88,6 +89,9 @@ namespace gbe {
     INLINE uint32_t getArgSize(uint32_t argID) const {
       return argID >= argNum ? 0u : args[argID].size;
     }
+    INLINE uint32_t getArgAlign(uint32_t argID) const { // Return the address alignment of the given argument
+      return argID >= argNum ? 0u : args[argID].align; // 0u when argID is out of range
+    }
     /*! Return the type of the given argument */
     INLINE gbe_arg_type getArgType(uint32_t argID) const {
       return argID >= argNum ? GBE_ARG_INVALID : args[argID].type;
@@ -128,6 +132,18 @@ namespace gbe {
     void setImageSet(ir::ImageSet * from) {
       imageSet = from;
     }
+    /*! Set compile work group size */
+    void setCompileWorkGroupSize(const size_t wg_sz[3]) {
+       compileWgSize[0] = wg_sz[0];
+       compileWgSize[1] = wg_sz[1];
+       compileWgSize[2] = wg_sz[2];
+    }
+    /*! Get compile work group size */
+    void getCompileWorkGroupSize (size_t wg_sz[3]) const {
+       wg_sz[0] = compileWgSize[0];
+       wg_sz[1] = compileWgSize[1];
+       wg_sz[2] = compileWgSize[2];
+    }
     /*! Get defined image size */
     size_t getImageSize(void) const { return imageSet->getDataSize(); }
     /*! Get defined image value array */
@@ -177,6 +193,7 @@ namespace gbe {
     Context *ctx;              //!< Save context after compiler to alloc constant buffer curbe
     ir::SamplerSet *samplerSet;//!< Copy from the corresponding function.
     ir::ImageSet *imageSet;    //!< Copy from the corresponding function.
+    size_t compileWgSize[3];   //!< required work group size by kernel attribute.
     GBE_CLASS(Kernel);         //!< Use custom allocators
   };
 
@@ -214,7 +231,7 @@ namespace gbe {
     /*! Build a program from a ir::Unit */
     bool buildFromUnit(const ir::Unit &unit, std::string &error);
     /*! Buils a program from a LLVM source code */
-    bool buildFromLLVMFile(const char *fileName, std::string &error);
+    bool buildFromLLVMFile(const char *fileName, std::string &error, int optLevel);
     /*! Buils a program from a OCL string */
     bool buildFromSource(const char *source, std::string &error);
     /*! Get size of the global constant arrays */
diff --git a/backend/src/builtin_vector_proto.def b/backend/src/builtin_vector_proto.def
index 2b8f913..4393ad5 100644
--- a/backend/src/builtin_vector_proto.def
+++ b/backend/src/builtin_vector_proto.def
@@ -147,20 +147,20 @@ gentype trunc (gentype)
 
 # XXX we already defined all native and non-native
 # functions to the same one.
-#gentype native_cos (gentype x)
-#gentype native_divide (gentype x, gentype y)
-#gentype native_exp (gentype x)
+gentype native_cos (gentype x)
+gentype native_divide (gentype x, gentype y)
+gentype native_exp (gentype x)
 #gentype native_exp2 (gentype x)
-#gentype native_exp10 (gentype x)
-#gentype native_log (gentype x)
-#gentype native_log2 (gentype x)
-#gentype native_log10 (gentype x)
-#gentype native_powr (gentype x, gentype y)
+gentype native_exp10 (gentype x)
+gentype native_log (gentype x)
+gentype native_log2 (gentype x)
+gentype native_log10 (gentype x)
+gentype native_powr (gentype x, gentype y)
 gentype native_recip (gentype x)
-#gentype native_rsqrt (gentype x)
-#gentype native_sin (gentype x)
+gentype native_rsqrt (gentype x)
+gentype native_sin (gentype x)
 #gentype native_sqrt (gentype x)
-#gentype native_tan (gentype x)
+gentype native_tan (gentype x)
 
 ##integer
 ugentype abs (gentype x)
@@ -229,7 +229,6 @@ intn isless (floatn x, floatn y)
 longn isless (doublen x, doublen y)
 intn islessequal (floatn x, floatn y)
 longn islessequal (doublen x, doublen y)
-# XXX not implemented
 intn islessgreater (floatn x, floatn y)
 longn islessgreater (doublen x, doublen y)
 intn isfinite (floatn
@@ -240,18 +239,15 @@ intn isnan (floatn)
 longn isnan (doublen)
 intn isnormal (floatn)
 longn isnormal (doublen)
-# XXX not implemented
 intn isordered (floatn x, floatn y)
 longn isordered (doublen x, doublen y)
-# XXX not implemented
 intn isunordered (floatn x, floatn y)
 longn isunordered (doublen x, doublen y)
 intn signbit (floatn)
 longn signbit (doublen)
 int any (igentype x)
 int all (igentype x)
-# XXX need to revisit select latter
-#gentype bitselect (gentype a, gentype b, gentype c)
+gentype bitselect (gentype a, gentype b, gentype c)
 gentype select (gentype a, gentype b, igentype c)
 gentype select (gentype a, gentype b, ugentype c)
 
diff --git a/backend/src/gbe_bin_generater.cpp b/backend/src/gbe_bin_generater.cpp
index 37e61e2..f813775 100644
--- a/backend/src/gbe_bin_generater.cpp
+++ b/backend/src/gbe_bin_generater.cpp
@@ -152,7 +152,7 @@ void program_build_instance::serialize_program(void) throw(int)
     ofstream ofs;
     ostringstream oss;
     size_t sz;
-    ofs.open(bin_path, ofstream::out | ofstream::app | ofstream::binary);
+    ofs.open(bin_path, ofstream::out | ofstream::trunc | ofstream::binary);
 
     if (str_fmt_out) {
         string array_name = "Unkown_name_array";
diff --git a/backend/src/gen_as.sh b/backend/src/gen_as.sh
index 626e6ec..7dea15d 100755
--- a/backend/src/gen_as.sh
+++ b/backend/src/gen_as.sh
@@ -6,6 +6,9 @@
 for type in $TYPES; do
         size=`IFS=:; set -- dummy $type; echo $3`
         for vector_length in $VECTOR_LENGTHS; do
+                if test $vector_length -eq 3; then
+                      continue;
+                fi
                 union_sizes="$union_sizes `expr $vector_length \* $size`"
         done
 done
@@ -21,7 +24,12 @@ for union_size in $union_sizes; do
                 basetype=`IFS=:; set -- dummy $type; echo $2`
                 basesize=`IFS=:; set -- dummy $type; echo $3`
                 for vector_length in $VECTOR_LENGTHS; do
-                        vector_size_in_union="`expr $vector_length \* $basesize`"
+                        if test $vector_length -eq 3; then
+                                vector_size_length="4"
+                        else
+                                vector_size_length=$vector_length;
+                        fi
+                        vector_size_in_union="`expr $vector_size_length \* $basesize`"
                         if test $union_size -ne $vector_size_in_union; then
                                 continue
                         fi
@@ -43,7 +51,12 @@ for union_size in $union_sizes; do
                 fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
                 fbasesize=`IFS=:; set -- dummy $ftype; echo $3`
                 for fvector_length in $VECTOR_LENGTHS; do
-                        fvector_size_in_union="`expr $fvector_length \* $fbasesize`"
+                        if test $fvector_length -eq 3; then
+                                fvector_size_length="4"
+                        else
+                                fvector_size_length=$fvector_length;
+                        fi
+                        fvector_size_in_union="`expr $fvector_size_length \* $fbasesize`"
                         if test $union_size -ne $fvector_size_in_union; then
                                 continue
                         fi
@@ -59,7 +72,12 @@ for union_size in $union_sizes; do
                                         continue
                                 fi
                                 for tvector_length in $VECTOR_LENGTHS; do
-                                        tvector_size_in_union="`expr $tvector_length \* $tbasesize`"
+                                        if test $tvector_length -eq 3; then
+                                               tvector_size_length="4"
+                                        else
+                                               tvector_size_length=$tvector_length;
+                                        fi
+                                        tvector_size_in_union="`expr $tvector_size_length \* $tbasesize`"
                                         if test $union_size -ne $tvector_size_in_union; then
                                                 continue
                                         fi
diff --git a/backend/src/gen_builtin_vector.py b/backend/src/gen_builtin_vector.py
index 0a30738..b100bbf 100755
--- a/backend/src/gen_builtin_vector.py
+++ b/backend/src/gen_builtin_vector.py
@@ -25,8 +25,8 @@ import sys
 import os
 
 if len(sys.argv) != 3:
-    print "Invalid argument {}".format(sys.argv)
-    print "use {} spec_file_name output_file_name".format(sys.argv[0])
+    print "Invalid argument {0}".format(sys.argv)
+    print "use {0} spec_file_name output_file_name".format(sys.argv[0])
     raise
 
 all_vector = 1,2,3,4,8,16
@@ -223,7 +223,7 @@ class builtinProto():
         return ret
 
     def init_from_line(self, t):
-        self.append('//{}'.format(t))
+        self.append('//{0}'.format(t))
         line = filter(None, re.split(',| |\(', t.rstrip(')\n')))
         self.paramCount = 0
         stripped = 0
@@ -264,7 +264,7 @@ class builtinProto():
                 return
 
             if (n == 0):
-                formatStr = 'INLINE_OVERLOADABLE {}{} {} ('.format(vtype[0], vtype[1], self.functionName)
+                formatStr = 'INLINE_OVERLOADABLE {0}{1} {2} ('.format(vtype[0], vtype[1], self.functionName)
             else:
                 formatStr += ', '
 
@@ -278,12 +278,12 @@ class builtinProto():
                 pointerStr = ''
 
             if ptype[1] != 1:
-                formatStr += '{}{} {}param{}'.format(ptype[0], ptype[1], pointerStr, n)
+                formatStr += '{0}{1} {2}param{3}'.format(ptype[0], ptype[1], pointerStr, n)
             else:
-                formatStr += '{} {}param{}'.format(ptype[0], pointerStr, n)
+                formatStr += '{0} {1}param{2}'.format(ptype[0], pointerStr, n)
 
         formatStr += ')'
-        formatStr = self.append(formatStr, '{{return ({}{})('.format(vtype[0], vtype[1]))
+        formatStr = self.append(formatStr, '{{return ({0}{1})('.format(vtype[0], vtype[1]))
         self.indent = len(formatStr)
         for j in range(0, vtype[1]):
             if (j != 0):
@@ -293,7 +293,9 @@ class builtinProto():
                 if j % 2 == 0:
                     formatStr = self.append(formatStr, self.indentSpace())
 
-            formatStr += '{}('.format(self.functionName)
+            if self.prefix == 'relational' and self.functionName != 'bitselect' and self.functionName != 'select':
+                formatStr += '-'
+            formatStr += '{0}('.format(self.functionName)
             for n in range(0, self.paramCount):
                 if n != 0:
                     formatStr += ', '
@@ -305,16 +307,16 @@ class builtinProto():
                         raise "parameter is not a scalar but has different width with result value."
                     if isPointer(ptype):
                         formatStr += '&'
-                    formatStr += 'param{}'.format(n)
+                    formatStr += 'param{0}'.format(n)
                     continue
 
                 if (isPointer(ptype)):
-                    formatStr += '({} {} *)param{} + {:2d}'.format(ptype[2], ptype[0], n, j)
+                    formatStr += '({0} {1} *)param{2} + {3:2d}'.format(ptype[2], ptype[0], n, j)
                 else:
                     if (self.functionName == 'select' and n == 2):
                         formatStr += '({0})(param{1}.s{2:x} & (({0})1 << (sizeof({0})*8 - 1)))'.format(ptype[0], n, j)
                     else:
-                        formatStr += 'param{}.s{:x}'.format(n, j)
+                        formatStr += 'param{0}.s{1:x}'.format(n, j)
 
             formatStr += ')'
 
@@ -329,7 +331,7 @@ class builtinProto():
 
     def output(self, outFile):
         for line in self.outputStr:
-            outFile.write('{}\n'.format(line))
+            outFile.write('{0}\n'.format(line))
 
     def gen_proto_str(self):
         check_type([self.valueTypeStr] + self.paramTypeStrs)
@@ -359,7 +361,7 @@ tempHeaderFileName = sys.argv[2] + '.tmp'
 safeUnlink(headerFileName)
 tempHeader = open(tempHeaderFileName, 'w')
 
-tempHeader.write("//This file is autogenerated by {}.\n".format(sys.argv[0]))
+tempHeader.write("//This file is autogenerated by {0}.\n".format(sys.argv[0]))
 tempHeader.write("//Don't modify it manually.\n")
 
 functionProto = builtinProto()
@@ -368,7 +370,7 @@ for line in specFile:
         continue
     if line[0] == '#':
         if line[1] == '#':
-            sectionHeader = "//{} builtin functions".format(line[2:].rstrip())
+            sectionHeader = "//{0} builtin functions".format(line[2:].rstrip())
             sectionPrefix=(line[2:].split())[0]
         continue
     functionProto.init(sectionHeader, sectionPrefix)
diff --git a/backend/src/gen_convert.sh b/backend/src/gen_convert.sh
index 047cc19..f0562a7 100755
--- a/backend/src/gen_convert.sh
+++ b/backend/src/gen_convert.sh
@@ -9,9 +9,6 @@ for vector_length in $VECTOR_LENGTHS; do
             fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
             for ttype in $TYPES; do
               tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
-              if test $fbasetype = $tbasetype; then
-                continue
-              fi
               echo "INLINE OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v) {"
               echo "  return ($tbasetype)v;"
               echo "}"
@@ -131,9 +128,18 @@ INLINE_OVERLOADABLE long convert_long_sat(ulong x) {
   return x > MAX ? MAX : x;
 }
 
-INLINE_OVERLOADABLE ulong convert_ulong_sat(long x) {
-  return x < 0 ? 0 : x;
-}
+#define DEF(DSTTYPE, SRCTYPE) \
+  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+    return x < 0 ? 0 : x; \
+  }
+DEF(ushort, char);
+DEF(uint, char);
+DEF(uint, short);
+DEF(ulong, char);
+DEF(ulong, short);
+DEF(ulong, int);
+DEF(ulong, long);
+#undef DEF
 
 #define DEF(DSTTYPE, SRCTYPE) \
   INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
@@ -144,7 +150,6 @@ DEF(uchar, uchar);
 DEF(short, char);
 DEF(short, uchar);
 DEF(short, short);
-DEF(ushort, char);
 DEF(ushort, uchar);
 DEF(ushort, ushort);
 DEF(int, char);
@@ -152,9 +157,7 @@ DEF(int, uchar);
 DEF(int, short);
 DEF(int, ushort);
 DEF(int, int);
-DEF(uint, char);
 DEF(uint, uchar);
-DEF(uint, short);
 DEF(uint, ushort);
 DEF(uint, uint);
 DEF(long, char);
@@ -164,11 +167,8 @@ DEF(long, ushort);
 DEF(long, int);
 DEF(long, uint);
 DEF(long, long);
-DEF(ulong, char);
 DEF(ulong, uchar);
-DEF(ulong, short);
 DEF(ulong, ushort);
-DEF(ulong, int);
 DEF(ulong, uint);
 DEF(ulong, ulong);
 #undef DEF
@@ -224,3 +224,330 @@ for vector_length in $VECTOR_LENGTHS; do
     done
   done
 done
+
+echo '
+float __gen_ocl_rndz(float x);
+float __gen_ocl_rnde(float x);
+float __gen_ocl_rndu(float x);
+float __gen_ocl_rndd(float x);
+INLINE_OVERLOADABLE float __convert_float_rtz(long x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long l = u.f;
+  if((l > x && x > 0) || x >= 0x7fffffc000000000 ||
+     (l < x && x < 0)) {
+      u.u -= 1;
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(long x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long l = u.f;  //can not use u.f < x
+  if(l < x && x < 0x7fffffc000000000) {
+    if(x > 0)
+      u.u = u.u + 1;
+    else
+      u.u = u.u - 1;
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(long x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long l = u.f;  //avoid overflow
+  if(l > x || x >= 0x7fffffc000000000) {
+    if(x > 0)
+      u.u = u.u - 1;
+    else
+      u.u = u.u + 1;
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtz(ulong x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  ulong l = u.f;
+  if(l > x  || x >= 0xffffff8000000000)
+      u.u -= 1;
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(ulong x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  ulong l = u.f;  //can not use u.f < x
+  if(l < x && x < 0xffffff8000000000)
+    u.u = u.u + 1;
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(ulong x)
+{
+  return __convert_float_rtz(x);
+}
+INLINE_OVERLOADABLE float __convert_float_rtz(int x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long i = u.f;
+  if((i > x && x > 0) ||
+     (i < x && x < 0)) {
+      u.u -= 1;
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(int x)  //int -> float, result is never below x
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long i = u.f;  //avoid overflow: u.f can round up to 2^31, out of int range
+  if(i < x) {  //default conversion landed below x: move one float step up
+    if(x > 0)
+      u.u += 1;  //positive: next bit pattern has larger magnitude
+    else
+      u.u -= 1;  //negative: previous bit pattern has smaller magnitude
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(int x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long i = u.f;  //avoid overflow
+  if(i > x) {
+    if(x > 0)
+      u.u = u.u - 1;
+    else
+      u.u = u.u + 1;
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtz(uint x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  ulong i = u.f;
+  if(i > x)
+    u.u -= 1;
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(uint x)  //uint -> float, result is never below x
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  ulong i = u.f;  //avoid overflow: u.f can round up to 2^32, out of uint range
+  if(i < x)  //default conversion landed below x: move one float step up
+    u.u += 1;
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(uint x)
+{
+  return __convert_float_rtz(x);
+}
+'
+
+# convert_DSTTYPE_ROUNDING function
+for vector_length in $VECTOR_LENGTHS; do
+  for ftype in $TYPES; do
+    fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+    if test $fbasetype = "double"; then continue; fi
+
+    for ttype in $TYPES; do
+      tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+      if test $tbasetype = "double"; then continue; fi
+
+      if test $vector_length -eq 1; then
+        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_rte($fbasetype x)"
+        if test $fbasetype = "float" -a $tbasetype != "float"; then
+          echo "{ return __gen_ocl_rnde(x); }"
+        else
+          echo "{ return x; }"
+        fi
+
+        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_rtz($fbasetype x)"
+        if test $fbasetype = "float" -a $tbasetype != "float"; then
+          echo "{ return __gen_ocl_rndz(x); }"
+        elif [ "$fbasetype" = "int" -o "$fbasetype" = "uint" -o "$fbasetype" = "long" -o "$fbasetype" = "ulong" ] && [ "$tbasetype" = "float" ]; then
+          echo "{ return __convert_${tbasetype}_rtz(x); }"
+        else
+          echo "{ return x; }"
+        fi
+
+        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_rtp($fbasetype x)"
+        if test $fbasetype = "float" -a $tbasetype != "float"; then
+          echo "{ return __gen_ocl_rndu(x); }"
+        elif [ "$fbasetype" = "int" -o "$fbasetype" = "uint" -o "$fbasetype" = "long" -o "$fbasetype" = "ulong" ] && [ "$tbasetype" = "float" ]; then
+          echo "{ return __convert_${tbasetype}_rtp(x); }"
+        else
+          echo "{ return x; }"
+        fi
+
+        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_rtn($fbasetype x)"
+        if test $fbasetype = "float" -a $tbasetype != "float"; then
+          echo "{ return __gen_ocl_rndd(x); }"
+        elif [ "$fbasetype" = "int" -o "$fbasetype" = "uint" -o "$fbasetype" = "long" -o "$fbasetype" = "ulong" ] && [ "$tbasetype" = "float" ]; then
+          echo "{ return __convert_${tbasetype}_rtn(x); }"
+        else
+          echo "{ return x; }"
+        fi
+
+        continue
+      fi
+
+      for rounding in $ROUNDING_MODES; do
+        fvectortype=$fbasetype$vector_length
+        tvectortype=$tbasetype$vector_length
+        conv="convert_${tbasetype}_${rounding}"
+
+        construct="$conv(v.s0)"
+        if test $vector_length -gt 1; then
+          construct="$construct, $conv(v.s1)"
+        fi
+        if test $vector_length -gt 2; then
+          construct="$construct, $conv(v.s2)"
+        fi
+        if test $vector_length -gt 3; then
+          construct="$construct, $conv(v.s3)"
+        fi
+        if test $vector_length -gt 4; then
+          construct="$construct, $conv(v.s4)"
+          construct="$construct, $conv(v.s5)"
+          construct="$construct, $conv(v.s6)"
+          construct="$construct, $conv(v.s7)"
+        fi
+        if test $vector_length -gt 8; then
+          construct="$construct, $conv(v.s8)"
+          construct="$construct, $conv(v.s9)"
+          construct="$construct, $conv(v.sA)"
+          construct="$construct, $conv(v.sB)"
+          construct="$construct, $conv(v.sC)"
+          construct="$construct, $conv(v.sD)"
+          construct="$construct, $conv(v.sE)"
+          construct="$construct, $conv(v.sF)"
+        fi
+
+        echo "INLINE OVERLOADABLE $tvectortype convert_${tvectortype}_${rounding}($fvectortype v) {"
+        echo "  return ($tvectortype)($construct);"
+        echo "}"
+        echo
+      done
+    done
+  done
+done
+
+# convert_DSTTYPE_sat_ROUNDING function
+for vector_length in $VECTOR_LENGTHS; do
+  for ftype in $TYPES; do
+    fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+    if test $fbasetype = "double"; then continue; fi
+
+    for ttype in $TYPES; do
+      tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+      if test $tbasetype = "double" -o $tbasetype = "float"; then continue; fi
+
+      if test $vector_length -eq 1; then
+        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rte($fbasetype x)"
+        if test $fbasetype = "float"; then
+          echo "{ return convert_${tbasetype}_sat(__gen_ocl_rnde(x)); }"
+        else
+          echo "{ return convert_${tbasetype}_sat(x); }"
+        fi
+
+        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtz($fbasetype x)"
+        if test $fbasetype = "float"; then
+          echo "{ return convert_${tbasetype}_sat(__gen_ocl_rndz(x)); }"
+        else
+          echo "{ return convert_${tbasetype}_sat(x); }"
+        fi
+
+        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtp($fbasetype x)"
+        if test $fbasetype = "float"; then
+          echo "{ return convert_${tbasetype}_sat(__gen_ocl_rndu(x)); }"
+        else
+          echo "{ return convert_${tbasetype}_sat(x); }"
+        fi
+
+        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtn($fbasetype x)"
+        if test $fbasetype = "float"; then
+          echo "{ return convert_${tbasetype}_sat(__gen_ocl_rndd(x)); }"
+        else
+          echo "{ return convert_${tbasetype}_sat(x); }"
+        fi
+
+        continue
+      fi
+
+      for rounding in $ROUNDING_MODES; do
+        fvectortype=$fbasetype$vector_length
+        tvectortype=$tbasetype$vector_length
+        conv="convert_${tbasetype}_sat_${rounding}"
+
+        construct="$conv(v.s0)"
+        if test $vector_length -gt 1; then
+          construct="$construct, $conv(v.s1)"
+        fi
+        if test $vector_length -gt 2; then
+          construct="$construct, $conv(v.s2)"
+        fi
+        if test $vector_length -gt 3; then
+          construct="$construct, $conv(v.s3)"
+        fi
+        if test $vector_length -gt 4; then
+          construct="$construct, $conv(v.s4)"
+          construct="$construct, $conv(v.s5)"
+          construct="$construct, $conv(v.s6)"
+          construct="$construct, $conv(v.s7)"
+        fi
+        if test $vector_length -gt 8; then
+          construct="$construct, $conv(v.s8)"
+          construct="$construct, $conv(v.s9)"
+          construct="$construct, $conv(v.sA)"
+          construct="$construct, $conv(v.sB)"
+          construct="$construct, $conv(v.sC)"
+          construct="$construct, $conv(v.sD)"
+          construct="$construct, $conv(v.sE)"
+          construct="$construct, $conv(v.sF)"
+        fi
+
+        echo "INLINE OVERLOADABLE $tvectortype convert_${tvectortype}_sat_${rounding}($fvectortype v) {"
+        echo "  return ($tvectortype)($construct);"
+        echo "}"
+        echo
+      done
+    done
+  done
+done
diff --git a/backend/src/genconfig.sh b/backend/src/genconfig.sh
index f55b670..689499e 100644
--- a/backend/src/genconfig.sh
+++ b/backend/src/genconfig.sh
@@ -7,4 +7,5 @@ TYPES="long:8 ulong:8 int:4 uint:4 short:2 ushort:2 char:1 uchar:1 double:8 floa
 # Supported vector lengths
 VECTOR_LENGTHS="1 2 3 4 8 16"
 
+ROUNDING_MODES="rte rtz rtp rtn"
 ## No user serviceable parts below here
diff --git a/backend/src/ir/context.cpp b/backend/src/ir/context.cpp
index 400a2a0..d6815e1 100644
--- a/backend/src/ir/context.cpp
+++ b/backend/src/ir/context.cpp
@@ -105,10 +105,10 @@ namespace ir {
     return index;
   }
 
-  void Context::input(const std::string &name, FunctionArgument::Type type, Register reg, uint32_t elementSize) {
+  void Context::input(const std::string &name, FunctionArgument::Type type, Register reg, uint32_t elementSize, uint32_t align) {
     GBE_ASSERTM(fn != NULL, "No function currently defined");
     GBE_ASSERTM(reg < fn->file.regNum(), "Out-of-bound register");
-    FunctionArgument *arg = GBE_NEW(FunctionArgument, type, reg, elementSize, name);
+    FunctionArgument *arg = GBE_NEW(FunctionArgument, type, reg, elementSize, name, align);
     fn->args.push_back(arg);
   }
 
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index a7337e6..adeaf6f 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -82,6 +82,12 @@ namespace ir {
       }
       return ImmediateIndex(0);
     }
+    INLINE ImmediateIndex newFloatImmediate(float x) {
+      return this->newImmediate(x);
+    }
+    INLINE ImmediateIndex newDoubleImmediate(double x) {
+      return this->newImmediate(x);
+    }
 
     /*! Set an immediate value */
     template <typename T> INLINE void setImmediate(ImmediateIndex index, T value) {
@@ -101,7 +107,7 @@ namespace ir {
     /*! Create a new label for the current function */
     LabelIndex label(void);
     /*! Append a new input register for the function */
-    void input(const std::string &name, FunctionArgument::Type type, Register reg, uint32_t elemSz = 0u);
+    void input(const std::string &name, FunctionArgument::Type type, Register reg, uint32_t elemSz = 0u, uint32_t align = 0);
     /*! Append a new output register for the function */
     void output(Register reg);
     /*! Get the immediate value */
@@ -143,6 +149,7 @@ namespace ir {
     }
     DECL_THREE_SRC_INSN(SEL);
     DECL_THREE_SRC_INSN(I64MADSAT);
+    DECL_THREE_SRC_INSN(MAD);
 #undef DECL_THREE_SRC_INSN
 
     /*! For all unary functions */
diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
index c15c292..71dcc1f 100644
--- a/backend/src/ir/function.cpp
+++ b/backend/src/ir/function.cpp
@@ -43,7 +43,7 @@ namespace ir {
   ///////////////////////////////////////////////////////////////////////////
 
   Function::Function(const std::string &name, const Unit &unit, Profile profile) :
-    name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false), slmSize(0)
+    name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false), slmSize(0), stackSize(0)
   {
     initProfile(*this);
     samplerSet = GBE_NEW(SamplerSet);
@@ -227,7 +227,7 @@ namespace ir {
         GBE_ASSERT(target != NULL);
         target->predecessors.insert(&bb);
         bb.successors.insert(target);
-        if (insn.isPredicated() == true) jumpToNext = &bb;
+        if ( insn.isPredicated() == true) jumpToNext = &bb;
       }
     });
   }
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 3d4733d..2468e73 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -106,11 +106,12 @@ namespace ir {
       SAMPLER           = 6
     };
     /*! Create a function input argument */
-    INLINE FunctionArgument(Type type, Register reg, uint32_t size, const std::string &name) :
-      type(type), reg(reg), size(size), name(name) {}
+    INLINE FunctionArgument(Type type, Register reg, uint32_t size, const std::string &name, uint32_t align) :
+      type(type), reg(reg), size(size), align(align), name(name) {}
     Type type;     //!< Gives the type of argument we have
     Register reg;  //!< Holds the argument
     uint32_t size; //!< == sizeof(void*) for ptr, sizeof(elem) for the rest
+    uint32_t align; //!< address alignment for the argument
     const std::string name; //!< Holds the function name for IR output
     GBE_STRUCT(FunctionArgument); // Use custom allocator
   };
@@ -309,6 +310,14 @@ namespace ir {
     SamplerSet* getSamplerSet(void) const {return samplerSet; }
     /*! Get image set in this function */
     ImageSet* getImageSet(void) const {return imageSet; }
+    /*! Set required work group size. */
+    void setCompileWorkGroupSize(size_t x, size_t y, size_t z) { compileWgSize[0] = x; compileWgSize[1] = y; compileWgSize[2] = z; }
+    /*! Get required work group size. */
+    const size_t *getCompileWorkGroupSize(void) const {return compileWgSize;}
+    /*! Get stack size. */
+    INLINE const uint32_t getStackSize(void) const { return this->stackSize; }
+    /*! Increase the stack size by step. */
+    INLINE void pushStackSize(uint32_t step) { this->stackSize += step; }
   private:
     friend class Context;           //!< Can freely modify a function
     std::string name;               //!< Function name
@@ -325,8 +334,11 @@ namespace ir {
     uint32_t simdWidth;             //!< 8 or 16 if forced, 0 otherwise
     bool useSLM;                    //!< Is SLM required?
     uint32_t slmSize;               //!< local variable size inside kernel function
-    SamplerSet *samplerSet;          //!< samplers used in this function.
-    ImageSet* imageSet;              //!< Image set in this function's arguments..
+    uint32_t stackSize;             //!< stack size for private memory.
+    SamplerSet *samplerSet;         //!< samplers used in this function.
+    ImageSet* imageSet;             //!< Image set in this function's arguments..
+    size_t compileWgSize[3];        //!< required work group size specified by
+                                    //   __attribute__((reqd_work_group_size(X, Y, Z))).
     GBE_CLASS(Function);            //!< Use custom allocator
   };
 
diff --git a/backend/src/ir/image.cpp b/backend/src/ir/image.cpp
index b901a12..8c34d70 100644
--- a/backend/src/ir/image.cpp
+++ b/backend/src/ir/image.cpp
@@ -64,6 +64,29 @@ namespace ir {
     setInfoOffset4Type(imageInfo, key.type, offset);
   }
 
+  Register ImageSet::appendInfo(ImageInfoKey key, Context *ctx)
+  {
+    auto it = infoRegMap.find(key.data);
+    if (it != infoRegMap.end())
+      return it->second;
+    Register reg = ctx->reg(FAMILY_DWORD);
+    infoRegMap.insert(std::make_pair(key.data, reg));
+    return reg;
+  }
+
+  void ImageSet::clearInfo()
+  {
+    struct ImageInfo *imageInfo;
+    for(auto &it : indexMap) {
+      imageInfo = it.second;
+      imageInfo->wSlot = -1;
+      imageInfo->hSlot = -1;
+      imageInfo->depthSlot = -1;
+      imageInfo->dataTypeSlot = -1;
+      imageInfo->channelOrderSlot = -1;
+    }
+  }
+
   void ImageSet::append(Register imageReg, Context *ctx)
   {
     ir::FunctionArgument *arg =  ctx->getFunction().getArg(imageReg);
diff --git a/backend/src/ir/image.hpp b/backend/src/ir/image.hpp
index c084c7d..cf388d4 100644
--- a/backend/src/ir/image.hpp
+++ b/backend/src/ir/image.hpp
@@ -47,6 +47,10 @@ namespace ir {
     void append(Register imageReg, Context *ctx);
     /*! Append an image info slot. */
     void appendInfo(ImageInfoKey key, uint32_t offset);
+    /*! Append an image info register. */
+    Register appendInfo(ImageInfoKey, Context *ctx);
+    /*! clear image info. */
+    void clearInfo();
     /*! Get the image's index(actual location). */
     const uint32_t getIdx(const Register imageReg) const;
     size_t getDataSize(void) { return regMap.size(); }
@@ -86,6 +90,7 @@ namespace ir {
   private:
     map<Register, struct ImageInfo *> regMap;
     map<uint32_t, struct ImageInfo *> indexMap;
+    map<uint32_t, Register> infoRegMap;
     GBE_CLASS(ImageSet);
   };
 } /* namespace ir */
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 9b3e699..95bcff5 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -243,19 +243,53 @@ namespace ir {
       INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
     };
 
+    class ALIGNED_INSTRUCTION BitCastInstruction :
+      public BasePolicy,
+      public TupleSrcPolicy<BitCastInstruction>,
+      public TupleDstPolicy<BitCastInstruction>
+    {
+    public:
+      BitCastInstruction(Type dstType,
+                         Type srcType,
+                         Tuple dst,
+                         Tuple src,
+                         uint8_t dstNum,
+                         uint8_t srcNum)
+      {
+        this->opcode = OP_BITCAST;
+        this->dst = dst;
+        this->src = src;
+        this->dstFamily = getFamily(dstType);
+        this->srcFamily = getFamily(srcType);
+        GBE_ASSERT(srcNum <= 16 && dstNum <= 16);
+        this->dstNum = dstNum;
+        this->srcNum = srcNum;
+      }
+      INLINE Type getSrcType(void) const { return getType((RegisterFamily)srcFamily); }
+      INLINE Type getDstType(void) const { return getType((RegisterFamily)dstFamily); }
+      INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
+      INLINE void out(std::ostream &out, const Function &fn) const;
+      uint8_t dstFamily:4; //!< family to cast to
+      uint8_t srcFamily:4; //!< family to cast from
+      Tuple dst;
+      Tuple src;
+      uint8_t dstNum;     //!<Dst Number
+      uint8_t srcNum;     //!<Src Number
+    };
+
     class ALIGNED_INSTRUCTION ConvertInstruction :
       public BasePolicy,
       public NDstPolicy<ConvertInstruction, 1>,
       public NSrcPolicy<ConvertInstruction, 1>
     {
     public:
-      ConvertInstruction(Type dstType,
+      ConvertInstruction(Opcode opcode,
+                         Type dstType,
                          Type srcType,
                          Register dst,
-                         Register src,
-                         bool saturated=false)
+                         Register src)
       {
-        this->opcode = saturated ? OP_SAT_CVT : OP_CVT;
+        this->opcode = opcode;
         this->dst[0] = dst;
         this->src[0] = src;
         this->dstType = dstType;
@@ -457,37 +491,48 @@ namespace ir {
       public TupleDstPolicy<SampleInstruction>
     {
     public:
-      SampleInstruction(Tuple dstTuple, Tuple srcTuple, Type dstType, Type srcType) {
+      SampleInstruction(uint8_t imageIdx, Tuple dstTuple, Tuple srcTuple, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset, bool is3D) {
         this->opcode = OP_SAMPLE;
         this->dst = dstTuple;
         this->src = srcTuple;
-        this->dstType = dstType;
-        this->srcType = srcType;
+        this->dstIsFloat = dstIsFloat;
+        this->srcIsFloat = srcIsFloat;
+        this->samplerIdx = sampler;
+        this->imageIdx = imageIdx;
+        this->samplerOffset = samplerOffset;
+        this->is3DRead = is3D;
       }
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const {
         this->outOpcode(out);
         out << "." << this->getDstType()
             << "." << this->getSrcType()
-            << " surface id %" << this->getSrc(fn, 0)
-            << " sampler %" << this->getSrc(fn, 1)
-            << " coord u %" << this->getSrc(fn, 2)
-            << " coord v %" << this->getSrc(fn, 3)
-            << " coord w %" << this->getSrc(fn, 4)
+            << " surface id " << (int)this->getImageIndex()
+            << " coord u %" << this->getSrc(fn, 0)
+            << " coord v %" << this->getSrc(fn, 1)
+            << " coord w %" << this->getSrc(fn, 2)
             << " %" << this->getDst(fn, 0)
             << " %" << this->getDst(fn, 1)
             << " %" << this->getDst(fn, 2)
-            << " %" << this->getDst(fn, 3);
+            << " %" << this->getDst(fn, 3)
+            << " sampler idx " << (int)this->getSamplerIndex();
       }
       Tuple src;
       Tuple dst;
-      Type srcType;
-      Type dstType;
 
-      INLINE Type getSrcType(void) const { return this->srcType; }
-      INLINE Type getDstType(void) const { return this->dstType; }
-
-      static const uint32_t srcNum = 6;
+      INLINE const uint8_t getImageIndex(void) const { return this->imageIdx; }
+      INLINE Type getSrcType(void) const { return this->srcIsFloat ? TYPE_FLOAT : TYPE_S32; }
+      INLINE Type getDstType(void) const { return this->dstIsFloat ? TYPE_FLOAT : TYPE_U32; }
+      INLINE const uint8_t getSamplerIndex(void) const { return this->samplerIdx; }
+      INLINE const uint8_t getSamplerOffset(void) const { return this->samplerOffset; }
+      INLINE const bool is3D(void) const { return !!this->is3DRead; }
+      uint8_t srcIsFloat:1;
+      uint8_t dstIsFloat:1;
+      uint8_t samplerIdx:4;
+      uint8_t samplerOffset:1;
+      uint8_t is3DRead:1;
+      uint8_t imageIdx;
+      static const uint32_t srcNum = 3;
       static const uint32_t dstNum = 4;
     };
 
@@ -498,34 +543,41 @@ namespace ir {
     {
     public:
 
-      INLINE TypedWriteInstruction(Tuple srcTuple, Type srcType, Type coordType) {
+      INLINE TypedWriteInstruction(uint8_t imageIdx, Tuple srcTuple, Type srcType, Type coordType, bool is3D) {
         this->opcode = OP_TYPED_WRITE;
         this->src = srcTuple;
         this->coordType = coordType;
         this->srcType = srcType;
+        this->imageIdx = imageIdx;
+        this->is3DWrite = is3D;
       }
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const {
         this->outOpcode(out);
         out << "." << this->getSrcType()
-            << " surface id %" << this->getSrc(fn, 0)
-            << " coord u %" << this->getSrc(fn, 1)
-            << " coord v %" << this->getSrc(fn, 2)
-            << " coord w %" << this->getSrc(fn, 3)
+            << " surface id " << (int)this->getImageIndex()
+            << " coord u %" << this->getSrc(fn, 0)
+            << " coord v %" << this->getSrc(fn, 1)
+            << " coord w %" << this->getSrc(fn, 2)
+            << " %" << this->getSrc(fn, 3)
             << " %" << this->getSrc(fn, 4)
             << " %" << this->getSrc(fn, 5)
-            << " %" << this->getSrc(fn, 6)
-            << " %" << this->getSrc(fn, 7);
+            << " %" << this->getSrc(fn, 6);
       }
 
       Tuple src;
-      Type srcType;
-      Type coordType;
+      uint8_t srcType;
+      uint8_t coordType;
+      uint8_t imageIdx;
+      uint8_t is3DWrite;
 
-      INLINE Type getSrcType(void) const { return this->srcType; }
-      INLINE Type getCoordType(void) const { return this->coordType; }
+      INLINE const bool is3D(void) const { return !!this->is3DWrite; }
+
+      INLINE const uint8_t getImageIndex(void) const { return this->imageIdx; }
+      INLINE Type getSrcType(void) const { return (Type)this->srcType; }
+      INLINE Type getCoordType(void) const { return (Type)this->coordType; }
       // bti, u, v, w, 4 data elements
-      static const uint32_t srcNum = 8;
+      static const uint32_t srcNum = 7;
       Register dst[0];               //!< No dest register
     };
 
@@ -536,41 +588,48 @@ namespace ir {
     {
     public:
       GetSamplerInfoInstruction( Register dst,
-                                 Register src)
+                                 Register samplerInfo,
+                                 uint8_t samplerIdx)
       {
         this->opcode = OP_GET_SAMPLER_INFO;
         this->dst[0] = dst;
-        this->src[0] = src;
+        this->src[0] = samplerInfo;
+        this->samplerIdx = samplerIdx;
       }
 
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const {
         this->outOpcode(out);
-        out << " sampler id %" << this->getSrc(fn, 0)
-            << " %" << this->getDst(fn, 0);
+        out  << " %" << this->getDst(fn, 0)
+             << " %" << this->getSrc(fn, 0)
+             << " sampler idx " << (int)this->samplerIdx;
+      }
+      INLINE const uint8_t getSamplerIndex() const {
+        return this->samplerIdx;
       }
 
-      Register src[1];                  //!< Surface to get info
+      Register src[1];                  //!< sampler to get info
       Register dst[1];                  //!< return value
+      uint8_t samplerIdx;               //!< sampler slot index.
       static const uint32_t dstNum = 1;
     };
 
     class ALIGNED_INSTRUCTION GetImageInfoInstruction :
       public BasePolicy,
-      public NSrcPolicy<GetImageInfoInstruction, 2>,
+      public NSrcPolicy<GetImageInfoInstruction, 1>,
       public NDstPolicy<GetImageInfoInstruction, 1>
     {
     public:
       GetImageInfoInstruction( int type,
                                Register dst,
-                               Register src,
+                               uint8_t imageIdx,
                                Register infoReg)
       {
         this->opcode = OP_GET_IMAGE_INFO;
         this->infoType = type;
         this->dst[0] = dst;
-        this->src[0] = src;
-        this->src[1] = infoReg;
+        this->src[0] = infoReg;
+        this->imageIdx = imageIdx;
       }
 
       INLINE uint32_t getInfoType(void) const { return infoType; }
@@ -578,13 +637,17 @@ namespace ir {
       INLINE void out(std::ostream &out, const Function &fn) const {
         this->outOpcode(out);
         out << "." << this->getInfoType()
-            << " surface id %" << this->getSrc(fn, 0)
-            << " %" << this->getDst(fn, 0);
+            << " %" << this->getDst(fn, 0)
+            << " surface id " << (int)this->getImageIndex()
+            << " info reg %" << this->getSrc(fn, 0);
       }
 
+      INLINE const uint8_t getImageIndex(void) const { return imageIdx; }
+
       uint8_t infoType;                 //!< Type of the requested information.
-      Register src[2];                  //!< Surface to get info
-      Register dst[1];                        //!< dest register to put the information.
+      uint8_t imageIdx;                //!< surface index.
+      Register src[1];                  //!< surface info register.
+      Register dst[1];                  //!< dest register to put the information.
       static const uint32_t dstNum = 1;
     };
 
@@ -750,6 +813,8 @@ namespace ir {
         default:
           CHECK_TYPE(this->type, allButBool);
           break;
+        case OP_MOV:
+          break;
         case OP_POW:
         case OP_COS:
         case OP_SIN:
@@ -809,6 +874,35 @@ namespace ir {
       return true;
     }
 
+    // The total bit sizes of src and dst must be identical. Bool is not supported for now; bool support needs to be double-checked.
+    INLINE bool BitCastInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+    {
+      for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+        if (UNLIKELY(checkSpecialRegForWrite(getDst(fn, dstID), fn, whyNot) == false))
+          return false;
+        if (UNLIKELY(checkRegisterData((RegisterFamily)dstFamily, getDst(fn, dstID), fn, whyNot) == false))
+          return false;
+      }
+      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+        if (UNLIKELY(checkRegisterData((RegisterFamily)srcFamily, getSrc(fn, srcID), fn, whyNot) == false))
+          return false;
+      }
+
+      CHECK_TYPE(getType((RegisterFamily)dstFamily), allButBool);
+      CHECK_TYPE(getType((RegisterFamily)srcFamily), allButBool);
+
+      uint32_t dstBytes = 0, srcBtyes = 0;
+      dstBytes = dstNum * getFamilySize((RegisterFamily)dstFamily);
+      srcBtyes = srcNum * getFamilySize((RegisterFamily)srcFamily);
+
+      if(dstBytes != srcBtyes){
+        whyNot = " The bit sizes of src and the dst is not identical.";
+        return false;
+      }
+
+      return true;
+    }
+
     // We can convert anything to anything, but types and families must match
     INLINE bool ConvertInstruction::wellFormed(const Function &fn, std::string &whyNot) const
     {
@@ -1020,6 +1114,22 @@ namespace ir {
         out << " %" << this->getSrc(fn, i);
     }
 
+
+    INLINE void BitCastInstruction::out(std::ostream &out, const Function &fn) const {
+      this->outOpcode(out);
+      out << "." << this->getDstType()
+          << "." << this->getSrcType();
+      out << " {";
+      for (uint32_t i = 0; i < dstNum; ++i)
+        out << "%" << this->getDst(fn, i) << (i != (dstNum-1u) ? " " : "");
+      out << "}";
+      out << " {";
+      for (uint32_t i = 0; i < srcNum; ++i)
+        out << "%" << this->getSrc(fn, i) << (i != (srcNum-1u) ? " " : "");
+      out << "}";
+    }
+
+
     INLINE void ConvertInstruction::out(std::ostream &out, const Function &fn) const {
       this->outOpcode(out);
       out << "." << this->getDstType()
@@ -1142,6 +1252,10 @@ START_INTROSPECTION(CompareInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(CompareInstruction)
 
+START_INTROSPECTION(BitCastInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(BitCastInstruction)
+
 START_INTROSPECTION(ConvertInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(ConvertInstruction)
@@ -1328,6 +1442,15 @@ END_FUNCTION(Instruction, Register)
     fn.deleteInstruction(this);
   }
 
+  void Instruction::insert(Instruction *prev, Instruction ** new_ins) {
+    Function &fn = prev->getFunction();
+    Instruction *insn = fn.newInstruction(*this);
+    insn->parent = prev->parent;
+    append(insn, prev);
+    if (new_ins)
+      *new_ins = insn;
+  }
+
   bool Instruction::hasSideEffect(void) const {
     return opcode == OP_STORE ||
            opcode == OP_TYPED_WRITE ||
@@ -1346,6 +1469,8 @@ DECL_MEM_FN(BinaryInstruction, bool, commutes(void), commutes())
 DECL_MEM_FN(SelectInstruction, Type, getType(void), getType())
 DECL_MEM_FN(TernaryInstruction, Type, getType(void), getType())
 DECL_MEM_FN(CompareInstruction, Type, getType(void), getType())
+DECL_MEM_FN(BitCastInstruction, Type, getSrcType(void), getSrcType())
+DECL_MEM_FN(BitCastInstruction, Type, getDstType(void), getDstType())
 DECL_MEM_FN(ConvertInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(ConvertInstruction, Type, getDstType(void), getDstType())
 DECL_MEM_FN(AtomicInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
@@ -1365,9 +1490,17 @@ DECL_MEM_FN(BranchInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
 DECL_MEM_FN(SyncInstruction, uint32_t, getParameters(void), getParameters())
 DECL_MEM_FN(SampleInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(SampleInstruction, Type, getDstType(void), getDstType())
+DECL_MEM_FN(SampleInstruction, const uint8_t, getSamplerIndex(void), getSamplerIndex())
+DECL_MEM_FN(SampleInstruction, const bool, is3D(void), is3D())
+DECL_MEM_FN(SampleInstruction, const uint8_t, getSamplerOffset(void), getSamplerOffset())
+DECL_MEM_FN(SampleInstruction, const uint8_t, getImageIndex(void), getImageIndex())
 DECL_MEM_FN(TypedWriteInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(TypedWriteInstruction, Type, getCoordType(void), getCoordType())
+DECL_MEM_FN(TypedWriteInstruction, const uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(TypedWriteInstruction, const bool, is3D(void), is3D())
 DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
+DECL_MEM_FN(GetImageInfoInstruction, const uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(GetSamplerInfoInstruction, const uint8_t, getSamplerIndex(void), getSamplerIndex())
 
 #undef DECL_MEM_FN
 
@@ -1449,6 +1582,9 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
     return internal::TernaryInstruction(OP_I64MADSAT, type, dst, src).convert();
   }
 
+  Instruction MAD(Type type, Register dst, Tuple src) {
+    return internal::TernaryInstruction(OP_MAD, type, dst, src).convert();
+  }
   // All compare functions
 #define DECL_EMIT_FUNCTION(NAME) \
   Instruction NAME(Type type, Register dst,  Register src0, Register src1) { \
@@ -1462,17 +1598,33 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
   DECL_EMIT_FUNCTION(LT)
   DECL_EMIT_FUNCTION(GE)
   DECL_EMIT_FUNCTION(GT)
+  DECL_EMIT_FUNCTION(ORD)
 
 #undef DECL_EMIT_FUNCTION
 
+  // BITCAST
+  Instruction BITCAST(Type dstType, Type srcType, Tuple dst, Tuple src, uint8_t dstNum, uint8_t srcNum) {
+    return internal::BitCastInstruction(dstType, srcType, dst, src, dstNum, srcNum).convert();
+  }
+
   // CVT
   Instruction CVT(Type dstType, Type srcType, Register dst, Register src) {
-    return internal::ConvertInstruction(dstType, srcType, dst, src).convert();
+    return internal::ConvertInstruction(OP_CVT, dstType, srcType, dst, src).convert();
   }
 
   // saturated convert
   Instruction SAT_CVT(Type dstType, Type srcType, Register dst, Register src) {
-    return internal::ConvertInstruction(dstType, srcType, dst, src, true).convert();
+    return internal::ConvertInstruction(OP_SAT_CVT, dstType, srcType, dst, src).convert();
+  }
+
+  // F16TO32 convert (16-bit to 32-bit float, per the opcode name)
+  Instruction F16TO32(Type dstType, Type srcType, Register dst, Register src) {
+    return internal::ConvertInstruction(OP_F16TO32, dstType, srcType, dst, src).convert();
+  }
+
+  // F32TO16 convert (32-bit to 16-bit float, per the opcode name)
+  Instruction F32TO16(Type dstType, Type srcType, Register dst, Register src) {
+    return internal::ConvertInstruction(OP_F32TO16, dstType, srcType, dst, src).convert();
   }
 
   // For all unary functions with given opcode
@@ -1526,20 +1678,20 @@ DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
   }
 
   // SAMPLE
-  Instruction SAMPLE(Tuple dst, Tuple src, Type dstType, Type srcType) {
-    return internal::SampleInstruction(dst, src, dstType, srcType).convert();
+  Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset, bool is3D) {
+    return internal::SampleInstruction(imageIndex, dst, src, dstIsFloat, srcIsFloat, sampler, samplerOffset, is3D).convert();
   }
 
-  Instruction TYPED_WRITE(Tuple src, Type srcType, Type coordType) {
-    return internal::TypedWriteInstruction(src, srcType, coordType).convert();
+  Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, Type srcType, Type coordType, bool is3D) {
+    return internal::TypedWriteInstruction(imageIndex, src, srcType, coordType, is3D).convert();
   }
 
-  Instruction GET_IMAGE_INFO(int infoType, Register dst, Register src, Register infoReg) {
-    return internal::GetImageInfoInstruction(infoType, dst, src, infoReg).convert();
+  Instruction GET_IMAGE_INFO(int infoType, Register dst, uint8_t imageIndex, Register infoReg) {
+    return internal::GetImageInfoInstruction(infoType, dst, imageIndex, infoReg).convert();
   }
 
-  Instruction GET_SAMPLER_INFO(Register dst, Register src) {
-    return internal::GetSamplerInfoInstruction(dst, src).convert();
+  Instruction GET_SAMPLER_INFO(Register dst, Register samplerInfo, uint8_t samplerIdx) {
+    return internal::GetSamplerInfoInstruction(dst, samplerInfo, samplerIdx).convert();
   }
 
   std::ostream &operator<< (std::ostream &out, const Instruction &insn) {
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 90c819b..8e2cd11 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -170,14 +170,16 @@ namespace ir {
     void replace(Instruction *other) const;
     /*! Remove the instruction from the instruction stream */
     void remove(void);
+    /*! Insert a copy of this instruction after prev; the copy is returned through new_ins when it is non-NULL. */
+    void insert(Instruction *prev, Instruction ** new_ins = NULL);
     /*! Indicates if the instruction belongs to instruction type T. Typically, T
      *  can be BinaryInstruction, UnaryInstruction, LoadInstruction and so on
      */
     template <typename T> INLINE bool isMemberOf(void) const {
       return T::isClassOf(*this);
     }
-    static const uint32_t MAX_SRC_NUM = 8;
-    static const uint32_t MAX_DST_NUM = 8;
+    static const uint32_t MAX_SRC_NUM = 16;
+    static const uint32_t MAX_DST_NUM = 16;
   protected:
     BasicBlock *parent;      //!< The basic block containing the instruction
     GBE_CLASS(Instruction);  //!< Use internal allocators
@@ -241,6 +243,17 @@ namespace ir {
     static bool isClassOf(const Instruction &insn);
   };
 
+  /*! BitCast instruction converts from one type to another */
+  class BitCastInstruction : public Instruction {
+  public:
+    /*! Get the type of the source */
+    Type getSrcType(void) const;
+    /*! Get the type of the destination */
+    Type getDstType(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
   /*! Conversion instruction converts from one type to another */
   class ConvertInstruction : public Instruction {
   public:
@@ -336,11 +349,10 @@ namespace ir {
   /*! Store data in an texture */
   class TypedWriteInstruction : public Instruction {
   public:
-    enum {
-     SURFACE_BTI = 0
-    };
     /*! Return true if the given instruction is an instance of this class */
+    const bool is3D() const;
     static bool isClassOf(const Instruction &insn);
+    const uint8_t getImageIndex() const;
     Type getSrcType(void) const;
     Type getCoordType(void) const;
   };
@@ -348,17 +360,18 @@ namespace ir {
   /*! Load texels from a texture */
   class SampleInstruction : public Instruction {
   public:
-    enum {
-     SURFACE_BTI = 0,
-     SAMPLER_BTI = 1
-    };
-    /*! Return true if the given instruction is an instance of this class */
-    static bool isClassOf(const Instruction &insn);
+    const bool is3D() const;
+    const uint8_t getImageIndex() const;
+    const uint8_t getSamplerIndex(void) const;
+    const uint8_t getSamplerOffset(void) const;
     Type getSrcType(void) const;
     Type getDstType(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
   };
 
-  typedef union {
+  typedef union _ImageInfoKey{
+    _ImageInfoKey(uint8_t i, uint8_t t) : index(i), type(t) {};
     struct {
      uint8_t index; /*! the allocated image index */
      uint8_t  type;  /*! the information type */
@@ -370,9 +383,6 @@ namespace ir {
   class GetImageInfoInstruction : public Instruction {
   public:
     enum {
-     SURFACE_BTI = 0
-    };
-    enum {
      WIDTH = 0,
      HEIGHT = 1,
      DEPTH = 2,
@@ -395,6 +405,7 @@ namespace ir {
      return 0;
    }
 
+    const uint8_t getImageIndex() const;
     uint32_t getInfoType() const;
     /*! Return true if the given instruction is an instance of this class */
     static bool isClassOf(const Instruction &insn);
@@ -404,6 +415,7 @@ namespace ir {
   class GetSamplerInfoInstruction : public Instruction {
   public:
 
+    const uint8_t getSamplerIndex(void) const;
     /*! Return true if the given instruction is an instance of this class */
     static bool isClassOf(const Instruction &insn);
   };
@@ -539,6 +551,8 @@ namespace ir {
   Instruction I64_MUL_HI(Type type, Register dst, Register src0, Register src1);
   /*! i64madsat.type dst src */
   Instruction I64MADSAT(Type type, Register dst, Tuple src);
+  /*! mad.type dst src */
+  Instruction MAD(Type type, Register dst, Tuple src);
   /*! upsample_short.type dst src */
   Instruction UPSAMPLE_SHORT(Type type, Register dst, Register src0, Register src1);
   /*! upsample_int.type dst src */
@@ -563,6 +577,8 @@ namespace ir {
   Instruction ABS(Type type, Register dst, Register src);
   /*! log.type dst src */
   Instruction LOG(Type type, Register dst, Register src);
+  /*! exp.type dst src */
+  Instruction EXP(Type type, Register dst, Register src);
   /*! sqr.type dst src */
   Instruction SQR(Type type, Register dst, Register src);
   /*! rsq.type dst src */
@@ -621,10 +637,18 @@ namespace ir {
   Instruction GE(Type type, Register dst, Register src0, Register src1);
   /*! ge.type dst src0 src1 */
   Instruction GT(Type type, Register dst, Register src0, Register src1);
+  /*! ord.type dst src0 src1 */
+  Instruction ORD(Type type, Register dst, Register src0, Register src1);
+  /*! BITCAST.{dstType <- srcType} dst src */
+  Instruction BITCAST(Type dstType, Type srcType, Tuple dst, Tuple src, uint8_t dstNum, uint8_t srcNum);
   /*! cvt.{dstType <- srcType} dst src */
   Instruction CVT(Type dstType, Type srcType, Register dst, Register src);
   /*! sat_cvt.{dstType <- srcType} dst src */
   Instruction SAT_CVT(Type dstType, Type srcType, Register dst, Register src);
+  /*! F16TO32.{dstType <- srcType} dst src */
+  Instruction F16TO32(Type dstType, Type srcType, Register dst, Register src);
+  /*! F32TO16.{dstType <- srcType} dst src */
+  Instruction F32TO16(Type dstType, Type srcType, Register dst, Register src);
   /*! atomic dst addr.space {src1 {src2}} */
   Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, Tuple src);
   /*! bra labelIndex */
@@ -642,13 +666,13 @@ namespace ir {
   /*! sync.params... (see Sync instruction) */
   Instruction SYNC(uint32_t parameters);
   /*! typed write */
-  Instruction TYPED_WRITE(Tuple src, Type srcType, Type coordType);
+  Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, Type srcType, Type coordType, bool is3D);
   /*! sample textures */
-  Instruction SAMPLE(Tuple dst, Tuple src, Type dstType, Type srcType);
+  Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset, bool is3D);
   /*! get image information , such as width/height/depth/... */
-  Instruction GET_IMAGE_INFO(int infoType, Register dst, Register src, Register infoReg);
+  Instruction GET_IMAGE_INFO(int infoType, Register dst, uint8_t imageIndex, Register infoReg);
   /*! get sampler information  */
-  Instruction GET_SAMPLER_INFO(Register dst, Register src);
+  Instruction GET_SAMPLER_INFO(Register dst, Register samplerInfo, uint8_t index);
   /*! label labelIndex */
   Instruction LABEL(LabelIndex labelIndex);
 
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index cd60349..bb5229a 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -29,6 +29,7 @@ DECL_INSN(MOV, UnaryInstruction)
 DECL_INSN(COS, UnaryInstruction)
 DECL_INSN(SIN, UnaryInstruction)
 DECL_INSN(LOG, UnaryInstruction)
+DECL_INSN(EXP, UnaryInstruction)
 DECL_INSN(SQR, UnaryInstruction)
 DECL_INSN(RSQ, UnaryInstruction)
 DECL_INSN(RCP, UnaryInstruction)
@@ -60,8 +61,12 @@ DECL_INSN(LE, CompareInstruction)
 DECL_INSN(LT, CompareInstruction)
 DECL_INSN(GE, CompareInstruction)
 DECL_INSN(GT, CompareInstruction)
+DECL_INSN(ORD, CompareInstruction)
+DECL_INSN(BITCAST, BitCastInstruction)
 DECL_INSN(CVT, ConvertInstruction)
 DECL_INSN(SAT_CVT, ConvertInstruction)
+DECL_INSN(F16TO32, ConvertInstruction)
+DECL_INSN(F32TO16, ConvertInstruction)
 DECL_INSN(ATOMIC, AtomicInstruction)
 DECL_INSN(BRA, BranchInstruction)
 DECL_INSN(RET, BranchInstruction)
@@ -86,3 +91,4 @@ DECL_INSN(UPSAMPLE_SHORT, BinaryInstruction)
 DECL_INSN(UPSAMPLE_INT, BinaryInstruction)
 DECL_INSN(UPSAMPLE_LONG, BinaryInstruction)
 DECL_INSN(I64MADSAT, TernaryInstruction)
+DECL_INSN(MAD, TernaryInstruction)
diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp
index b0a4314..724d5c3 100644
--- a/backend/src/ir/liveness.cpp
+++ b/backend/src/ir/liveness.cpp
@@ -29,9 +29,29 @@ namespace ir {
 
   Liveness::Liveness(Function &fn) : fn(fn) {
     // Initialize UEVar and VarKill for each block
-    fn.foreachBlock([this](const BasicBlock &bb) { this->initBlock(bb); });
-    // Now with iterative analysis, we compute liveout sets
-    this->computeLiveOut();
+    fn.foreachBlock([this](const BasicBlock &bb) {
+      this->initBlock(bb);
+      // If the bb has ret instruction, add it to the work list set.
+      const Instruction *lastInsn = bb.getLastInstruction();
+      const ir::Opcode op = lastInsn->getOpcode();
+      struct BlockInfo * info = liveness[&bb];
+      if (op == OP_RET) {
+        workSet.insert(info);
+        info->liveOut.insert(ocl::retVal);
+      } else if (op == OP_BRA) {
+        // If this is a backward jump, put it in the extra work list.
+        if (((BranchInstruction*)lastInsn)->getLabelIndex() < bb.getLabelIndex())
+          extraWorkSet.insert(info);
+      }
+    });
+    // Now with iterative analysis, we compute liveout and livein sets
+    this->computeLiveInOut();
+    for (auto it : extraWorkSet) {
+      for (auto reg : it->liveOut) {
+        it->extraLiveIn.insert(reg);
+      }
+    }
+    this->computeExtraLiveInOut();
   }
 
   Liveness::~Liveness(void) {
@@ -65,30 +85,134 @@ namespace ir {
     }
   }
 
-  void Liveness::computeLiveOut(void) {
-    // First insert the UEVar from the successors
-    foreach<DF_SUCC>([](BlockInfo &info, const BlockInfo &succ) {
-      const UEVar &ueVarSet = succ.upwardUsed;
-      // Iterate over all the registers in the UEVar of our successor
-      for (auto ueVar : ueVarSet) info.liveOut.insert(ueVar);
+// Use simple backward data flow analysis to solve the liveness problem.
+  void Liveness::computeLiveInOut(void) {
+    while(!workSet.empty()) {
+      auto currInfo = *workSet.begin();
+      workSet.erase(currInfo);
+      for (auto currOutVar : currInfo->liveOut)
+        if (!currInfo->varKill.contains(currOutVar))
+          currInfo->upwardUsed.insert(currOutVar);
+      bool isChanged = false;
+      for (auto prev : currInfo->bb.getPredecessorSet()) {
+        BlockInfo *prevInfo = liveness[prev];
+        for (auto currInVar : currInfo->upwardUsed) {
+          auto changed = prevInfo->liveOut.insert(currInVar);
+          if (changed.second) isChanged = true;
+        }
+        if (isChanged )
+          workSet.insert(prevInfo);
+      }
+    };
+#if 0
+    fn.foreachBlock([this](const BasicBlock &bb){
+      printf("label %d:\n", bb.getLabelIndex());
+      BlockInfo *info = liveness[&bb];
+      auto &outVarSet = info->liveOut;
+      auto &inVarSet = info->upwardUsed;
+      auto &extraInVarSet = info->extraLiveIn;
+      auto &extraOutVarSet = info->extraLiveOut;
+      printf("\n\tin Lives: ");
+      for (auto inVar : inVarSet) {
+        printf("%d ", inVar);
+      }
+      printf("\n");
+      printf("\tout Lives: ");
+      for (auto outVar : outVarSet) {
+        printf("%d ", outVar);
+      }
+      printf("\n");
+
     });
-    // Now iterate on liveOut
-    bool changed = true;
-    while (changed) {
-      changed = false;
-      foreach<DF_SUCC>([&changed](BlockInfo &info, const BlockInfo &succ) {
-        const UEVar &killSet = succ.varKill;
-        const LiveOut &liveOut = succ.liveOut;
-        // Iterate over all the registers in the UEVar of our successor
-        for (auto living : liveOut) {
-          if (killSet.contains(living)) continue;
-          if (info.liveOut.contains(living)) continue;
-          info.liveOut.insert(living);
-          changed = true;
+#endif
+   }
+
+/*
+  Consider the following scenario, %100's normal liveness will start from Ln-1's
+  position. In normal analysis, the Ln-1 is not Ln's predecessor, thus the liveness
+  of %100 will be passed to Ln and then will not be passed to L0.
+
+  But consider that we are running on a multi-lane vector machine with predication.
+  The unconditional BR in Ln-1 may be removed, and execution will enter Ln with a subset
+  of the complement of Ln-1's predication. For example, if the active lanes are 0-7 when
+  running Ln-1, then at Ln the active lanes are 8-15. Then at the end of Ln, a subset of
+  lanes 8-15 will jump to L0. If a register %10 is allocated the same GRF as %100, given
+  that their normal liveness ranges do not overlap, a subset of lanes 8-15 will be
+  modified. If %10 and %100 are the same vector data type, then we are fine. But if
+  %100 is a float vector and %10 is a bool or short vector, then we hit a bug here.
+
+L0:
+  ...
+  %10 = 5
+  ...
+Ln-1:
+  %100 = 2
+  BR Ln+1
+
+Ln:
+  ...
+  BR(%xxx) L0
+
+Ln+1:
+  %101 = %100 + 2;
+  ...
+
+  The solution is to build an additional set of liveness data. We start with
+  those BBs that end with a backward jump, pass all of their liveOut registers as extra
+  liveIn of the current BB, and then forward this extra liveIn to all the blocks. This is
+  very similar to the normal liveness analysis, just in the reverse direction.
+*/
+  void Liveness::computeExtraLiveInOut(void) {
+    while(!extraWorkSet.empty()) {
+      struct BlockInfo *currInfo = *extraWorkSet.begin();
+      extraWorkSet.erase(currInfo);
+      for (auto currInVar : currInfo->extraLiveIn)
+        currInfo->extraLiveOut.insert(currInVar);
+      bool isChanged = false;
+      for (auto succ : currInfo->bb.getSuccessorSet()) {
+        BlockInfo *succInfo = liveness[succ];
+        for (auto currOutVar : currInfo->extraLiveOut) {
+          bool changed = false;
+          if (!succInfo->upwardUsed.contains(currOutVar)) {
+            auto it  = succInfo->extraLiveIn.insert(currOutVar);
+            changed = it.second;
+          }
+          if (changed) isChanged = true;
         }
-      });
-    }
-  }
+        if (isChanged)
+          extraWorkSet.insert(succInfo);}
+    };
+#if 0
+    fn.foreachBlock([this](const BasicBlock &bb){
+      printf("label %d:\n", bb.getLabelIndex());
+      BlockInfo *info = liveness[&bb];
+      auto &outVarSet = info->liveOut;
+      auto &inVarSet = info->upwardUsed;
+      auto &extraInVarSet = info->extraLiveIn;
+      auto &extraOutVarSet = info->extraLiveOut;
+      printf("\n\tin Lives: ");
+      for (auto inVar : inVarSet) {
+        printf("%d ", inVar);
+      }
+      printf("\n\textra in Lives: ");
+      for (auto inVar : extraInVarSet) {
+        printf("%d ", inVar);
+      }
+      printf("\n");
+      printf("\tout Lives: ");
+      for (auto outVar : outVarSet) {
+        printf("%d ", outVar);
+      }
+      printf("\n\textra out Lives: ");
+      for (auto outVar : extraOutVarSet) {
+        printf("%d ", outVar);
+      }
+      printf("\n");
+
+    });
+#endif
+   }
+
 
   /*! To pretty print the livfeness info */
   static const uint32_t prettyInsnStrSize = 48;
diff --git a/backend/src/ir/liveness.hpp b/backend/src/ir/liveness.hpp
index ea5a157..9198eae 100644
--- a/backend/src/ir/liveness.hpp
+++ b/backend/src/ir/liveness.hpp
@@ -24,6 +24,7 @@
 #ifndef __GBE_IR_LIVENESS_HPP__
 #define __GBE_IR_LIVENESS_HPP__
 
+#include <list>
 #include "sys/map.hpp"
 #include "sys/set.hpp"
 #include "ir/register.hpp"
@@ -68,6 +69,8 @@ namespace ir {
       INLINE bool inVarKill(Register reg) const {
         return varKill.contains(reg);
       }
+      UEVar extraLiveIn;
+      LiveOut extraLiveOut;
       UEVar upwardUsed;
       LiveOut liveOut;
       VarKill varKill;
@@ -87,6 +90,23 @@ namespace ir {
       const BlockInfo &info = this->getBlockInfo(bb);
       return info.liveOut;
     }
+    /*! Get the set of registers alive at the beginning of the block */
+    const UEVar &getLiveIn(const BasicBlock *bb) const {
+      const BlockInfo &info = this->getBlockInfo(bb);
+      return info.upwardUsed;
+    }
+
+    /*! Get the set of extra registers alive at the end of the block */
+    const LiveOut &getExtraLiveOut(const BasicBlock *bb) const {
+      const BlockInfo &info = this->getBlockInfo(bb);
+      return info.extraLiveOut;
+    }
+    /*! Get the set of extra registers alive at the beginning of the block */
+    const UEVar &getExtraLiveIn(const BasicBlock *bb) const {
+      const BlockInfo &info = this->getBlockInfo(bb);
+      return info.extraLiveIn;
+    }
+
     /*! Return the function the liveness was computed on */
     INLINE const Function &getFunction(void) const { return fn; }
     /*! Actually do something for each successor / predecessor of *all* blocks */
@@ -119,9 +139,15 @@ namespace ir {
     /*! Initialize UEVar and VarKill per instruction */
     void initInstruction(BlockInfo &info, const Instruction &insn);
     /*! Now really compute LiveOut based on UEVar and VarKill */
-    void computeLiveOut(void);
+    void computeLiveInOut(void);
+    void computeExtraLiveInOut(void);
+    /*! Work lists: workSet is seeded with blocks ending in a return, extraWorkSet with blocks ending in a backward jump */
+    typedef set <struct BlockInfo*> WorkSet;
+    WorkSet workSet, extraWorkSet;
+
     /*! Use custom allocators */
     GBE_CLASS(Liveness);
+
   };
 
   /*! Output a nice ASCII reprensation of the liveness */
diff --git a/backend/src/ir/lowering.cpp b/backend/src/ir/lowering.cpp
index 6cccaf5..ad1ea32 100644
--- a/backend/src/ir/lowering.cpp
+++ b/backend/src/ir/lowering.cpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -205,7 +205,7 @@ namespace ir {
         break; \
       } \
     } \
-    if (isDead) { \
+    if (isDead && !dead.contains(WHICH)) { \
       dead.insert(WHICH); \
       WHICH->remove(); \
     } \
@@ -225,29 +225,42 @@ namespace ir {
     for (const auto &loadAddImm : seq) {
       LoadInstruction *load = cast<LoadInstruction>(loadAddImm.load);
       const uint32_t valueNum = load->getValueNum();
+      bool replaced = false;
+      Instruction *ins_after = load; // the instruction to insert after.
       for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
         const Type type = load->getValueType();
         const RegisterFamily family = getFamily(type);
         const uint32_t size = getFamilySize(family);
         const uint32_t offset = loadAddImm.offset + valueID * size;
         const PushLocation argLocation(*fn, loadAddImm.argID, offset);
-        if (inserted.contains(argLocation))
-          continue;
+        Register pushed;
         const Register reg = load->getValue(valueID);
-        const Register pushed = fn->newRegister(family);
+        if (offset != 0) {
+          if(inserted.contains(argLocation)) {
+            pushed = argLocation.getRegister();
+          } else {
+            pushed = fn->newRegister(family);
+            this->appendPushedConstant(pushed, argLocation);
+            inserted.insert(argLocation);
+          }
+        } else {
+          pushed = fn->getArg(loadAddImm.argID).reg;
+        }
 
         // TODO the MOV instruction can be most of the time avoided if the
         // register is never written. We must however support the register
         // replacement in the instruction interface to be able to patch all the
         // instruction that uses "reg"
-        const Instruction mov = ir::MOV(type, reg, pushed);
-        mov.replace(load);
-        dead.insert(load);
-        this->appendPushedConstant(pushed, argLocation);
+        Instruction mov = ir::MOV(type, reg, pushed);
+        mov.insert(ins_after, &ins_after);
+        replaced = true;
       }
+
+      if (replaced)
+        dead.insert(load);
     }
 
-    // Remove all unused adds and load immediates
+    REMOVE_INSN(load)
     REMOVE_INSN(add)
     REMOVE_INSN(loadImm)
   }
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 10e0c59..ef3ea28 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -40,7 +40,8 @@ namespace ir {
         "stack_pointer",
         "block_ip",
         "barrier_id", "thread_number",
-        "work_dimension", "sampler_info"
+        "work_dimension", "sampler_info",
+        "emask", "notemask", "barriermask", "retVal"
     };
 
 #if GBE_DEBUG
@@ -77,6 +78,10 @@ namespace ir {
       DECL_NEW_REG(FAMILY_DWORD, threadn);
       DECL_NEW_REG(FAMILY_DWORD, workdim);
       DECL_NEW_REG(FAMILY_WORD, samplerinfo);
+      DECL_NEW_REG(FAMILY_WORD, emask);
+      DECL_NEW_REG(FAMILY_WORD, notemask);
+      DECL_NEW_REG(FAMILY_WORD, barriermask);
+      DECL_NEW_REG(FAMILY_WORD, retVal);
     }
 #undef DECL_NEW_REG
 
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 89dd69f..d84c48a 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -65,7 +65,11 @@ namespace ir {
     static const Register threadn = Register(21);  // number of threads
     static const Register workdim = Register(22);  // work dimention.
     static const Register samplerinfo = Register(23); // store sampler info.
-    static const uint32_t regNum = 24;             // number of special registers
+    static const Register emask = Register(24);    // store the emask bits for the branching fix.
+    static const Register notemask = Register(25); // store the !emask bits for the branching fix.
+    static const Register barriermask = Register(26); // software mask for barrier.
+    static const Register retVal = Register(27);   // helper register to do data flow analysis.
+    static const uint32_t regNum = 28;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
diff --git a/backend/src/ir/register.hpp b/backend/src/ir/register.hpp
index 610acb1..4f36c2e 100644
--- a/backend/src/ir/register.hpp
+++ b/backend/src/ir/register.hpp
@@ -47,6 +47,11 @@ namespace ir {
     FAMILY_QWORD = 4
   };
 
+  INLINE char getFamilyName(RegisterFamily family) {
+    static char registerFamilyName[] = {'b', 'B', 'W', 'D', 'Q'};
+    return registerFamilyName[family];
+  }
+
   INLINE uint32_t getFamilySize(RegisterFamily family) {
     switch (family) {
       case FAMILY_BYTE: return 1;
diff --git a/backend/src/ir/sampler.cpp b/backend/src/ir/sampler.cpp
index cff1012..b67c1b7 100644
--- a/backend/src/ir/sampler.cpp
+++ b/backend/src/ir/sampler.cpp
@@ -27,34 +27,23 @@
 namespace gbe {
 namespace ir {
 
-  const uint32_t SamplerSet::getIdx(const Register reg) const
-  {
-    auto it = regMap.find(reg);
-    GBE_ASSERT(it != regMap.end());
-    return it->second.slot;
-  }
-
-  void SamplerSet::appendReg(const Register reg, uint32_t key, Context *ctx) {
-    struct SamplerRegSlot samplerSlot;
-    samplerSlot.reg = reg;
-    samplerSlot.slot = samplerMap.size();
+  uint8_t SamplerSet::appendReg(uint32_t key, Context *ctx) {
+    uint8_t samplerSlot = samplerMap.size();
     samplerMap.insert(std::make_pair(key, samplerSlot));
-    regMap.insert(std::make_pair(samplerSlot.reg, samplerSlot));
+    return samplerSlot;
   }
 
-  Register SamplerSet::append(uint32_t samplerValue, Context *ctx)
+  uint8_t SamplerSet::append(uint32_t samplerValue, Context *ctx)
   {
     auto it = samplerMap.find(samplerValue);
     if (it != samplerMap.end())
-        return it->second.reg;
+        return it->second;
     // This register is just used as a key.
-    Register reg = ctx->reg(FAMILY_DWORD);
-    appendReg(reg, samplerValue, ctx);
-    return reg;
+    return appendReg(samplerValue, ctx);
   }
 
 #define SAMPLER_ID(id) ((id << __CLK_SAMPLER_ARG_BASE) | __CLK_SAMPLER_ARG_KEY_BIT)
-  void SamplerSet::append(Register samplerReg, Context *ctx)
+  uint8_t SamplerSet::append(Register samplerReg, Context *ctx)
   {
     ir::FunctionArgument *arg =  ctx->getFunction().getArg(samplerReg);
     GBE_ASSERT(arg != NULL);
@@ -68,13 +57,11 @@ namespace ir {
 
     auto it = samplerMap.find(SAMPLER_ID(id));
     if (it != samplerMap.end()) {
-      GBE_ASSERT(it->second.reg == samplerReg);
-      return;
+      return it->second;
     }
-    appendReg(samplerReg, SAMPLER_ID(id), ctx);
+    return appendReg(SAMPLER_ID(id), ctx);
   }
 
-
 #define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
 #define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
 
@@ -87,15 +74,7 @@ namespace ir {
     OUT_UPDATE_SZ(samplerMap.size());
     for (auto iter : samplerMap) {
       OUT_UPDATE_SZ(iter.first);
-      OUT_UPDATE_SZ(iter.second.reg);
-      OUT_UPDATE_SZ(iter.second.slot);
-    }
-
-    OUT_UPDATE_SZ(regMap.size());
-    for (auto iter : regMap) {
-      OUT_UPDATE_SZ(iter.first);
-      OUT_UPDATE_SZ(iter.second.reg);
-      OUT_UPDATE_SZ(iter.second.slot);
+      OUT_UPDATE_SZ(iter.second);
     }
 
     OUT_UPDATE_SZ(magic_end);
@@ -116,23 +95,11 @@ namespace ir {
     IN_UPDATE_SZ(sampler_map_sz);
     for (size_t i = 0; i < sampler_map_sz; i++) {
       uint32_t key;
-      ir::SamplerRegSlot reg_slot;
+      uint32_t slot;
 
       IN_UPDATE_SZ(key);
-      IN_UPDATE_SZ(reg_slot.reg);
-      IN_UPDATE_SZ(reg_slot.slot);
-      samplerMap.insert(std::make_pair(key, reg_slot));
-    }
-
-    IN_UPDATE_SZ(sampler_map_sz);
-    for (size_t i = 0; i < sampler_map_sz; i++) {
-      ir::Register key;
-      ir::SamplerRegSlot reg_slot;
-
-      IN_UPDATE_SZ(key);
-      IN_UPDATE_SZ(reg_slot.reg);
-      IN_UPDATE_SZ(reg_slot.slot);
-      regMap.insert(std::make_pair(key, reg_slot));
+      IN_UPDATE_SZ(slot);
+      samplerMap.insert(std::make_pair(key, slot));
     }
 
     IN_UPDATE_SZ(magic);
@@ -159,14 +126,7 @@ namespace ir {
 
     for (auto iter : samplerMap) {
       outs << spaces_nl <<  "     [" << iter.first << ", "
-           << iter.second.reg << ", " << iter.second.slot << "]\n";
-    }
-
-    outs << spaces_nl << "  SamplerSet Map: [reg, sampler_reg, sampler_slot]\n";
-    outs << spaces_nl << "     regMap size: " << regMap.size() << "\n";
-    for (auto iter : regMap) {
-      outs << spaces_nl << "     [" << iter.first << ", "
-           << iter.second.reg << ", " << iter.second.slot << "]\n";
+           << iter.second << "]\n";
     }
 
     outs << spaces << "------------- End SamplerSet -------------" << "\n";
diff --git a/backend/src/ir/sampler.hpp b/backend/src/ir/sampler.hpp
index 3c72e3e..dd1f3b6 100644
--- a/backend/src/ir/sampler.hpp
+++ b/backend/src/ir/sampler.hpp
@@ -36,31 +36,23 @@ namespace ir {
    */
   class Context;
 
-  struct SamplerRegSlot {
-    Register reg;
-    uint32_t slot;
-  };
-
   class SamplerSet : public Serializable
   {
   public:
     /*! Append the specified sampler and return the allocated offset.
      *  If the speficied sampler is exist, only return the previous offset and
      *  don't append it again. Return -1, if failed.*/
-    Register append(uint32_t clkSamplerValue, Context *ctx);
+    uint8_t append(uint32_t clkSamplerValue, Context *ctx);
     /*! Append a sampler defined in kernel args. */
-    void append(Register samplerArg, Context *ctx);
-    /*! Get the sampler idx (actual location) */
-    const uint32_t getIdx(const Register reg) const;
+    uint8_t append(Register samplerArg, Context *ctx);
     size_t getDataSize(void) { return samplerMap.size(); }
     size_t getDataSize(void) const { return samplerMap.size(); }
     void getData(uint32_t *samplers) const {
       for(auto &it : samplerMap)
-        samplers[it.second.slot] = it.first;
+        samplers[it.second] = it.first;
     }
 
     void operator = (const SamplerSet& other) {
-      regMap.insert(other.regMap.begin(), other.regMap.end());
       samplerMap.insert(other.samplerMap.begin(), other.samplerMap.end());
     }
 
@@ -90,9 +82,8 @@ namespace ir {
     virtual void printStatus(int indent, std::ostream& outs);
 
   private:
-    void appendReg(const Register reg, uint32_t key, Context *ctx);
-    map<uint32_t, SamplerRegSlot> samplerMap;
-    map<Register, SamplerRegSlot> regMap;
+    uint8_t appendReg(uint32_t key, Context *ctx);
+    map<uint32_t, uint32_t> samplerMap;
     GBE_CLASS(SamplerSet);
   };
 } /* namespace ir */
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 62d6eab..2d4fb0a 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -105,7 +105,13 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
+#include "llvm/IR/Mangler.h"
+#else
 #include "llvm/Target/Mangler.h"
+#endif
+
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
@@ -154,8 +160,8 @@
 #define LLVM_VERSION_MINOR 0
 #endif /* !defined(LLVM_VERSION_MINOR) */
 
-#if (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR > 4)
-#error "Only LLVM 3.0 - 3.4 is supported"
+#if (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR < 3)
+#error "Only LLVM 3.3 and newer are supported"
 #endif /* (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR > 4) */
 
 using namespace llvm;
@@ -555,7 +561,7 @@ namespace gbe
     // Emit unary instructions from gen native function
     void emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode);
 
-    ir::Register appendSampler(CallSite::arg_iterator AI);
+    uint8_t appendSampler(CallSite::arg_iterator AI);
 
     // These instructions are not supported at all
     void visitVAArgInst(VAArgInst &I) {NOT_SUPPORTED;}
@@ -681,6 +687,7 @@ namespace gbe
     const Module::GlobalListType &globalList = TheModule->getGlobalList();
     for(auto i = globalList.begin(); i != globalList.end(); i ++) {
       const GlobalVariable &v = *i;
+      if(!v.isConstantUsed()) continue;
       const char *name = v.getName().data();
       unsigned addrSpace = v.getType()->getAddressSpace();
       if(addrSpace == ir::AddressSpace::MEM_CONSTANT) {
@@ -692,7 +699,8 @@ namespace gbe
         void* mem = malloc(size);
         uint32_t offset = 0;
         getConstantData(c, mem, offset);
-        unit.newConstant((char *)mem, name, size, sizeof(unsigned));
+        uint32_t alignment = getAlignmentByte(unit, type);
+        unit.newConstant((char *)mem, name, size, alignment);
         free(mem);
       }
     }
@@ -740,7 +748,7 @@ namespace gbe
         const float f32 = seq->getElementAsFloat(index);
         return doIt(f32);
       } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
-        const float f64 = seq->getElementAsDouble(index);
+        const double f64 = seq->getElementAsDouble(index);
         return doIt(f64);
       }
     } else
@@ -769,7 +777,7 @@ namespace gbe
         const float f32 = 0;
         return doIt(f32);
       } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
-        const float f64 = 0;
+        const double f64 = 0;
         return doIt(f64);
       } else {
         GBE_ASSERTM(false, "Unsupporte aggregate zero type.");
@@ -879,6 +887,22 @@ namespace gbe
     if(isa<GlobalValue>(c)) {
       return regTranslator.getScalar(c, elemID);
     }
+    if(isa<UndefValue>(c)) {
+      Type* llvmType = c->getType();
+      ir::Type dstType = getType(ctx, llvmType);
+      ir::Register reg = ctx.reg(getFamily(dstType));
+
+      ir::ImmediateIndex immIndex;
+      if(llvmType->isIntegerTy())
+        immIndex = ctx.newIntegerImmediate(0, dstType);
+      else if(llvmType->isFloatTy()) {
+        immIndex = ctx.newFloatImmediate((float)0.0);
+      } else {
+        immIndex = ctx.newDoubleImmediate((double)0.0);
+      }
+      ctx.LOADI(dstType, reg, immIndex);
+      return reg;
+    }
 
     if(isa<ConstantExpr>(c)) {
       ConstantExpr * ce = dyn_cast<ConstantExpr>(c);
@@ -905,27 +929,37 @@ namespace gbe
       } else {
         uint32_t TypeIndex;
         uint32_t constantOffset = 0;
-        uint32_t offset = 0;
 
         // currently only GetElementPtr is handled
         GBE_ASSERT(ce->getOpcode() == Instruction::GetElementPtr);
         Value *pointer = ce->getOperand(0);
         CompositeType* CompTy = cast<CompositeType>(pointer->getType());
         for(uint32_t op=1; op<ce->getNumOperands(); ++op) {
+          uint32_t offset = 0;
           ConstantInt* ConstOP = dyn_cast<ConstantInt>(ce->getOperand(op));
           GBE_ASSERT(ConstOP);
           TypeIndex = ConstOP->getZExtValue();
-          for(uint32_t ty_i=0; ty_i<TypeIndex; ty_i++)
-          {
-            Type* elementType = CompTy->getTypeAtIndex(ty_i);
-            uint32_t align = getAlignmentByte(unit, elementType);
+          if (op == 1) {
+            if (TypeIndex != 0) {
+              Type *elementType = (cast<PointerType>(pointer->getType()))->getElementType();
+              uint32_t elementSize = getTypeByteSize(unit, elementType);
+              uint32_t align = getAlignmentByte(unit, elementType);
+              elementSize += getPadding(elementSize, align);
+              offset += elementSize * TypeIndex;
+            }
+          } else {
+            for(uint32_t ty_i=0; ty_i<TypeIndex; ty_i++)
+            {
+              Type* elementType = CompTy->getTypeAtIndex(ty_i);
+              uint32_t align = getAlignmentByte(unit, elementType);
+              offset += getPadding(offset, align);
+              offset += getTypeByteSize(unit, elementType);
+            }
+
+            const uint32_t align = getAlignmentByte(unit, CompTy->getTypeAtIndex(TypeIndex));
             offset += getPadding(offset, align);
-            offset += getTypeByteSize(unit, elementType);
           }
 
-          const uint32_t align = getAlignmentByte(unit, CompTy->getTypeAtIndex(TypeIndex));
-          offset += getPadding(offset, align);
-
           constantOffset += offset;
           CompTy = dyn_cast<CompositeType>(CompTy->getTypeAtIndex(TypeIndex));
         }
@@ -1008,8 +1042,6 @@ namespace gbe
       Value *IV = PN->getIncomingValueForBlock(curr);
       if (!isa<UndefValue>(IV)) {
         Type *llvmType = PN->getType();
-        GBE_ASSERTM(llvmType != Type::getInt1Ty(llvmType->getContext()),
-          "TODO Boolean values cannot escape their definition basic block");
         const ir::Type type = getType(ctx, llvmType);
 
         // Emit the MOV required by the PHI function. We do it simple and do not
@@ -1039,6 +1071,33 @@ namespace gbe
   {
     GBE_ASSERTM(F.hasStructRetAttr() == false,
                 "Returned value for kernel functions is forbidden");
+
+    // Loop over the kernel metadatas to set the required work group size.
+    NamedMDNode *clKernelMetaDatas = TheModule->getNamedMetadata("opencl.kernels");
+    size_t reqd_wg_sz[3] = {0, 0, 0};
+    for(uint i = 0; i < clKernelMetaDatas->getNumOperands(); i++)
+    {
+      MDNode *node = clKernelMetaDatas->getOperand(i);
+      if (node->getOperand(0) != &F) continue;
+      for(uint j = 0; j < node->getNumOperands() - 1; j++)
+      {
+        MDNode *attrNode = dyn_cast_or_null<MDNode>(node->getOperand(1 + j));
+        if (attrNode == NULL) break;
+        MDString *attrName = dyn_cast_or_null<MDString>(attrNode->getOperand(0));
+        if (attrName && attrName->getString() == "reqd_work_group_size") {
+          GBE_ASSERT(attrNode->getNumOperands() == 4);
+          ConstantInt *x = dyn_cast<ConstantInt>(attrNode->getOperand(1));
+          ConstantInt *y = dyn_cast<ConstantInt>(attrNode->getOperand(2));
+          ConstantInt *z = dyn_cast<ConstantInt>(attrNode->getOperand(3));
+          GBE_ASSERT(x && y && z);
+          reqd_wg_sz[0] = x->getZExtValue();
+          reqd_wg_sz[1] = y->getZExtValue();
+          reqd_wg_sz[2] = z->getZExtValue();
+          break;
+        }
+      }
+    }
+    ctx.getFunction().setCompileWorkGroupSize(reqd_wg_sz[0], reqd_wg_sz[1], reqd_wg_sz[2]);
     // Loop over the arguments and output registers for them
     if (!F.arg_empty()) {
       uint32_t argID = 0;
@@ -1063,7 +1122,7 @@ namespace gbe
           const uint32_t elemSize = getTypeByteSize(unit, elemType);
           const uint32_t elemNum = vectorType->getNumElements();
           //vector's elemType always scalar type
-          ctx.input(argName, ir::FunctionArgument::VALUE, reg, elemNum*elemSize);
+          ctx.input(argName, ir::FunctionArgument::VALUE, reg, elemNum*elemSize, getAlignmentByte(unit, type));
 
           ir::Function& fn = ctx.getFunction();
           for(uint32_t i=1; i < elemNum; i++) {
@@ -1078,37 +1137,38 @@ namespace gbe
                     "vector type in the function argument is not supported yet");
         const ir::Register reg = regTranslator.newScalar(I);
         if (type->isPointerTy() == false)
-          ctx.input(argName, ir::FunctionArgument::VALUE, reg, getTypeByteSize(unit, type));
+          ctx.input(argName, ir::FunctionArgument::VALUE, reg, getTypeByteSize(unit, type), getAlignmentByte(unit, type));
         else {
           PointerType *pointerType = dyn_cast<PointerType>(type);
+          Type *pointed = pointerType->getElementType();
           // By value structure
 #if LLVM_VERSION_MINOR <= 1
           if (PAL.paramHasAttr(argID+1, Attribute::ByVal)) {
 #else
           if (I->hasByValAttr()) {
 #endif /* LLVM_VERSION_MINOR <= 1 */
-            Type *pointed = pointerType->getElementType();
             const size_t structSize = getTypeByteSize(unit, pointed);
-            ctx.input(argName, ir::FunctionArgument::STRUCTURE, reg, structSize);
+            ctx.input(argName, ir::FunctionArgument::STRUCTURE, reg, structSize, getAlignmentByte(unit, type));
           }
           // Regular user provided pointer (global, local or constant)
           else {
             const uint32_t addr = pointerType->getAddressSpace();
             const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(addr);
             const uint32_t ptrSize = getTypeByteSize(unit, type);
+            const uint32_t align = getAlignmentByte(unit, pointed);
               switch (addrSpace) {
               case ir::MEM_GLOBAL:
-                ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, ptrSize);
+                ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, ptrSize, align);
               break;
               case ir::MEM_LOCAL:
-                ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg, ptrSize);
+                ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg, ptrSize, align);
                 ctx.getFunction().setUseSLM(true);
               break;
               case ir::MEM_CONSTANT:
-                ctx.input(argName, ir::FunctionArgument::CONSTANT_POINTER, reg, ptrSize);
+                ctx.input(argName, ir::FunctionArgument::CONSTANT_POINTER, reg, ptrSize, align);
               break;
               case ir::IMAGE:
-                ctx.input(argName, ir::FunctionArgument::IMAGE, reg, ptrSize);
+                ctx.input(argName, ir::FunctionArgument::IMAGE, reg, ptrSize, align);
                 ctx.getFunction().getImageSet()->append(reg, &ctx);
               break;
               default: GBE_ASSERT(addrSpace != ir::MEM_PRIVATE);
@@ -1301,7 +1361,7 @@ namespace gbe
         // substitutions (if any)
         else {
           const uint32_t srcNum = insn.getSrcNum();
-          const uint32_t dstNum = insn.getSrcNum();
+          const uint32_t dstNum = insn.getDstNum();
           for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
             const ir::Register src = insn.getSrc(srcID);
             auto it = immTranslate.find(src);
@@ -1309,7 +1369,7 @@ namespace gbe
               insn.setSrc(srcID, it->second);
           }
           for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
-            const ir::Register dst = insn.getSrc(dstID);
+            const ir::Register dst = insn.getDst(dstID);
             auto it = immTranslate.find(dst);
             if (it != immTranslate.end())
               insn.setDst(dstID, it->second);
@@ -1343,15 +1403,19 @@ namespace gbe
         uint32_t padding = getPadding(oldSlm*8, align);
 
         f.setSLMSize(oldSlm + padding/8 + getTypeByteSize(unit, ty));
-        const Value * parent = cast<Value>(&v);
+        const Value * val = cast<Value>(&v);
         // local variable can only be used in one kernel function. so, don't need to check its all uses.
         // loop through the Constant to find the instruction that use the global variable
-        do {
-          Value::const_use_iterator it = parent->use_begin();
-          parent = cast<Value>(*it);
-        } while(isa<Constant>(parent));
-
-        const Instruction * insn = cast<Instruction>(parent);
+        // FIXME: need to find a more graceful way to find the function which uses this local data.
+        const Instruction * insn = NULL;
+        for( Value::const_use_iterator it = val->use_begin(), prev = val->use_begin();
+             it != prev->use_end() && insn == NULL;
+             prev = it, it = it->use_begin() )
+          for( Value::const_use_iterator innerIt = it;
+               innerIt != val->use_end() && insn == NULL;
+               innerIt++)
+            insn = dyn_cast<Instruction>(*innerIt);
+        GBE_ASSERT(insn && "Can't find a valid reference instruction for local variable.");
         const BasicBlock * bb = insn->getParent();
         const Function * func = bb->getParent();
         if(func != &F) continue;
@@ -1364,6 +1428,7 @@ namespace gbe
         this->newRegister(const_cast<GlobalVariable*>(&v));
         ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
         ir::Constant &con = unit.getConstantSet().getConstant(j ++);
+        GBE_ASSERT(con.getName() == v.getName());
         ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
       } else {
         GBE_ASSERT(0);
@@ -1561,25 +1626,54 @@ namespace gbe
     // Get the element type and the number of elements
     Type *operandType = I.getOperand(0)->getType();
     const ir::Type type = getType(ctx, operandType);
+    const ir::Type insnType = getType(ctx, I.getType());
 
     // Emit the instructions in a row
     const ir::Register dst = this->getRegister(&I);
     const ir::Register src0 = this->getRegister(I.getOperand(0));
     const ir::Register src1 = this->getRegister(I.getOperand(1));
+    const ir::Register tmp = ctx.reg(getFamily(ctx, I.getType()));
+    Value *cv = ConstantInt::get(I.getType(), 1);
 
     switch (I.getPredicate()) {
-      case ICmpInst::FCMP_OEQ:
-      case ICmpInst::FCMP_UEQ: ctx.EQ(type, dst, src0, src1); break;
-      case ICmpInst::FCMP_ONE:
-      case ICmpInst::FCMP_UNE: ctx.NE(type, dst, src0, src1); break;
-      case ICmpInst::FCMP_OLE:
-      case ICmpInst::FCMP_ULE: ctx.LE(type, dst, src0, src1); break;
-      case ICmpInst::FCMP_OGE:
-      case ICmpInst::FCMP_UGE: ctx.GE(type, dst, src0, src1); break;
-      case ICmpInst::FCMP_OLT:
-      case ICmpInst::FCMP_ULT: ctx.LT(type, dst, src0, src1); break;
-      case ICmpInst::FCMP_OGT:
-      case ICmpInst::FCMP_UGT: ctx.GT(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_OEQ: ctx.EQ(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_ONE: ctx.NE(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_OLE: ctx.LE(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_OGE: ctx.GE(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_OLT: ctx.LT(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_OGT: ctx.GT(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_ORD: ctx.ORD(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_UNO:
+        ctx.ORD(type, tmp, src0, src1);
+        ctx.XOR(insnType, dst, tmp, getRegister(cv));  //TODO: Use NOT directly
+        break;
+      case ICmpInst::FCMP_UEQ:
+        ctx.NE(type, tmp, src0, src1);
+        ctx.XOR(insnType, dst, tmp, getRegister(cv));
+        break;
+      case ICmpInst::FCMP_UGT:
+        ctx.LE(type, tmp, src0, src1);
+        ctx.XOR(insnType, dst, tmp, getRegister(cv));
+        break;
+      case ICmpInst::FCMP_UGE:
+        ctx.LT(type, tmp, src0, src1);
+        ctx.XOR(insnType, dst, tmp, getRegister(cv));
+        break;
+      case ICmpInst::FCMP_ULT:
+        ctx.GE(type, tmp, src0, src1);
+        ctx.XOR(insnType, dst, tmp, getRegister(cv));
+        break;
+      case ICmpInst::FCMP_ULE:
+        ctx.GT(type, tmp, src0, src1);
+        ctx.XOR(insnType, dst, tmp, getRegister(cv));
+        break;
+      case ICmpInst::FCMP_UNE:
+        ctx.EQ(type, tmp, src0, src1);
+        ctx.XOR(insnType, dst, tmp, getRegister(cv));
+        break;
+      case ICmpInst::FCMP_TRUE:
+        ctx.MOV(insnType, dst, getRegister(cv));
+        break;
       default: NOT_SUPPORTED;
     }
   }
@@ -1610,7 +1704,13 @@ namespace gbe
       // Bitcast just forward registers
       case Instruction::BitCast:
       {
-        regTranslator.newValueProxy(srcValue, dstValue);
+        Type *srcType = srcValue->getType();
+        Type *dstType = dstValue->getType();
+
+        if(srcType->isVectorTy() || dstType->isVectorTy())
+          this->newRegister(dstValue);
+        else
+          regTranslator.newValueProxy(srcValue, dstValue);
       }
       break;
       // Various conversion operations -> just allocate registers for them
@@ -1646,7 +1746,36 @@ namespace gbe
         }
       }
       break;
-      case Instruction::BitCast: break; // nothing to emit here
+      case Instruction::BitCast:
+      {
+        Value *srcValue = I.getOperand(0);
+        Value *dstValue = &I;
+        uint32_t srcElemNum = 0, dstElemNum = 0 ;
+        ir::Type srcType = getVectorInfo(ctx, srcValue->getType(), srcValue, srcElemNum);
+        ir::Type dstType = getVectorInfo(ctx, dstValue->getType(), dstValue, dstElemNum);
+        if(srcElemNum > 1 || dstElemNum > 1) {
+          // Build the tuple data in the vector
+          vector<ir::Register> srcTupleData;
+          vector<ir::Register> dstTupleData;
+          uint32_t elemID = 0;
+          for (elemID = 0; elemID < srcElemNum; ++elemID) {
+            ir::Register reg;
+            reg = this->getRegister(srcValue, elemID);
+            srcTupleData.push_back(reg);
+          }
+          for (elemID = 0; elemID < dstElemNum; ++elemID) {
+            ir::Register reg;
+            reg = this->getRegister(dstValue, elemID);
+            dstTupleData.push_back(reg);
+          }
+
+          const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], srcElemNum);
+          const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], dstElemNum);
+
+          ctx.BITCAST(dstType, srcType, dstTuple, srcTuple, dstElemNum, srcElemNum);
+        }
+      }
+      break; // nothing to emit here
       case Instruction::FPToUI:
       case Instruction::FPToSI:
       case Instruction::SIToFP:
@@ -1660,7 +1789,11 @@ namespace gbe
         // Get the element type for a vector
         Type *llvmDstType = I.getType();
         Type *llvmSrcType = I.getOperand(0)->getType();
-        const ir::Type dstType = getType(ctx, llvmDstType);
+        ir::Type dstType;
+        if (I.getOpcode() == Instruction::FPToUI)
+          dstType = getUnsignedType(ctx, llvmDstType);
+        else
+          dstType = getType(ctx, llvmDstType);
         ir::Type srcType;
         if (I.getOpcode() == Instruction::ZExt || I.getOpcode() == Instruction::UIToFP) {
           srcType = getUnsignedType(ctx, llvmSrcType);
@@ -1674,7 +1807,7 @@ namespace gbe
           const ir::ImmediateIndex zero = ctx.newIntegerImmediate(0, dstType);
           ir::ImmediateIndex one;
           if (I.getOpcode() == Instruction::SExt
-              && (dstType == ir::TYPE_S8 || dstType == ir::TYPE_S16 || dstType == ir::TYPE_S32))
+              && (dstType == ir::TYPE_S8 || dstType == ir::TYPE_S16 || dstType == ir::TYPE_S32 || dstType == ir::TYPE_S64))
             one = ctx.newIntegerImmediate(-1, dstType);
           else
             one = ctx.newIntegerImmediate(1, dstType);
@@ -1729,8 +1862,7 @@ namespace gbe
     }
   }
 
-  void GenWriter::regAllocateExtractElement(ExtractElementInst &I) {}
-  void GenWriter::emitExtractElement(ExtractElementInst &I) {
+  void GenWriter::regAllocateExtractElement(ExtractElementInst &I) {
     Value *vec = I.getVectorOperand();
     const Value *index = I.getIndexOperand();
     const ConstantInt *c = dyn_cast<ConstantInt>(index);
@@ -1739,6 +1871,9 @@ namespace gbe
     regTranslator.newValueProxy(vec, &I, i, 0);
   }
 
+  void GenWriter::emitExtractElement(ExtractElementInst &I) {
+  }
+
   void GenWriter::regAllocateShuffleVectorInst(ShuffleVectorInst &I) {}
   void GenWriter::emitShuffleVectorInst(ShuffleVectorInst &I) {}
 
@@ -1906,6 +2041,7 @@ namespace gbe
       case GEN_OCL_SQR:
       case GEN_OCL_RSQ:
       case GEN_OCL_LOG:
+      case GEN_OCL_EXP:
       case GEN_OCL_POW:
       case GEN_OCL_RCP:
       case GEN_OCL_ABS:
@@ -1996,6 +2132,7 @@ namespace gbe
       case GEN_OCL_UPSAMPLE_SHORT:
       case GEN_OCL_UPSAMPLE_INT:
       case GEN_OCL_UPSAMPLE_LONG:
+      case GEN_OCL_MAD:
       case GEN_OCL_SADD_SAT_CHAR:
       case GEN_OCL_SADD_SAT_SHORT:
       case GEN_OCL_SADD_SAT_INT:
@@ -2042,6 +2179,8 @@ namespace gbe
       case GEN_OCL_SAT_CONV_F32_TO_I32:
       case GEN_OCL_SAT_CONV_I32_TO_U32:
       case GEN_OCL_SAT_CONV_F32_TO_U32:
+      case GEN_OCL_CONV_F16_TO_F32:
+      case GEN_OCL_CONV_F32_TO_F16:
         this->newRegister(&I);
         break;
       default:
@@ -2089,21 +2228,22 @@ namespace gbe
 
   /* append a new sampler. should be called before any reference to
    * a sampler_t value. */
-  ir::Register GenWriter::appendSampler(CallSite::arg_iterator AI) {
+  uint8_t GenWriter::appendSampler(CallSite::arg_iterator AI) {
     Constant *CPV = dyn_cast<Constant>(*AI);
-    ir::Register sampler;
+    uint8_t index;
     if (CPV != NULL)
     {
       // This is not a kernel argument sampler, we need to append it to sampler set,
       // and allocate a sampler slot for it.
       auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
-      GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type");
-      sampler = ctx.getFunction().getSamplerSet()->append(x.data.u32, &ctx);
+      GBE_ASSERTM(x.type == ir::TYPE_U16 || x.type == ir::TYPE_S16, "Invalid sampler type");
+
+      index = ctx.getFunction().getSamplerSet()->append(x.data.u32, &ctx);
     } else {
-      sampler = this->getRegister(*AI);
-      ctx.getFunction().getSamplerSet()->append(sampler, &ctx);
+      const ir::Register samplerReg = this->getRegister(*AI);
+      index = ctx.getFunction().getSamplerSet()->append(samplerReg, &ctx);
     }
-    return sampler;
+    return index;
   }
 
   void GenWriter::emitCallInst(CallInst &I) {
@@ -2181,6 +2321,7 @@ namespace gbe
           case GEN_OCL_COS: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
           case GEN_OCL_SIN: this->emitUnaryCallInst(I,CS,ir::OP_SIN); break;
           case GEN_OCL_LOG: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break;
+          case GEN_OCL_EXP: this->emitUnaryCallInst(I,CS,ir::OP_EXP); break;
           case GEN_OCL_SQR: this->emitUnaryCallInst(I,CS,ir::OP_SQR); break;
           case GEN_OCL_RSQ: this->emitUnaryCallInst(I,CS,ir::OP_RSQ); break;
           case GEN_OCL_RCP: this->emitUnaryCallInst(I,CS,ir::OP_RCP); break;
@@ -2226,21 +2367,21 @@ namespace gbe
           case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
           case GEN_OCL_GET_IMAGE_CHANNEL_ORDER:
           {
-            GBE_ASSERT(AI != AE); const ir::Register surface_id = this->getRegister(*AI); ++AI;
-            uint32_t elemNum;
-            (void)getVectorInfo(ctx, I.getType(), &I, elemNum);
+            GBE_ASSERT(AI != AE); const ir::Register surfaceReg = this->getRegister(*AI); ++AI;
             const ir::Register reg = this->getRegister(&I, 0);
             int infoType = it->second - GEN_OCL_GET_IMAGE_WIDTH;
-
-            ctx.GET_IMAGE_INFO(infoType, reg, surface_id, ctx.reg(ir::FAMILY_DWORD));
+            const uint8_t surfaceID = ctx.getFunction().getImageSet()->getIdx(surfaceReg);
+            ir::ImageInfoKey key(surfaceID, infoType);
+            const ir::Register infoReg = ctx.getFunction().getImageSet()->appendInfo(key, &ctx);
+            ctx.GET_IMAGE_INFO(infoType, reg, surfaceID, infoReg);
             break;
           }
           case GEN_OCL_GET_SAMPLER_INFO:
           {
             GBE_ASSERT(AI != AE);
-            const ir::Register sampler = this->appendSampler(AI); ++AI;
+            const uint8_t index = this->appendSampler(AI); ++AI;
             const ir::Register reg = this->getRegister(&I, 0);
-            ctx.GET_SAMPLER_INFO(reg, sampler);
+            ctx.GET_SAMPLER_INFO(reg, ir::ocl::samplerinfo, index);
             break;
           }
           case GEN_OCL_READ_IMAGE0:
@@ -2256,18 +2397,21 @@ namespace gbe
           case GEN_OCL_READ_IMAGE14:
           case GEN_OCL_READ_IMAGE15:
           {
-            GBE_ASSERT(AI != AE); const ir::Register surface_id = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register surfaceReg = this->getRegister(*AI); ++AI;
+            const uint8_t surfaceID = ctx.getFunction().getImageSet()->getIdx(surfaceReg);
             GBE_ASSERT(AI != AE);
-            const ir::Register sampler = this->appendSampler(AI);
+            const uint8_t sampler = this->appendSampler(AI);
             ++AI;
 
             GBE_ASSERT(AI != AE); const ir::Register ucoord = this->getRegister(*AI); ++AI;
             GBE_ASSERT(AI != AE); const ir::Register vcoord = this->getRegister(*AI); ++AI;
             ir::Register wcoord;
+            bool is3D = false;
             if (it->second >= GEN_OCL_READ_IMAGE10 && it->second <= GEN_OCL_READ_IMAGE15) {
               GBE_ASSERT(AI != AE); wcoord = this->getRegister(*AI); ++AI;
+              is3D = true;
             } else
-              wcoord = ir::Register(0);
+              wcoord = ucoord; // not used, just a padding.
 
             vector<ir::Register> dstTupleData, srcTupleData;
             const uint32_t elemNum = 4;
@@ -2275,24 +2419,19 @@ namespace gbe
               const ir::Register reg = this->getRegister(&I, elemID);
               dstTupleData.push_back(reg);
             }
-            srcTupleData.push_back(surface_id);
-            srcTupleData.push_back(sampler);
             srcTupleData.push_back(ucoord);
             srcTupleData.push_back(vcoord);
             srcTupleData.push_back(wcoord);
+            uint8_t samplerOffset = 0;
 #ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
             GBE_ASSERT(AI != AE); Constant *CPV = dyn_cast<Constant>(*AI);
             assert(CPV);
             auto x = processConstant<ir::Immediate>(CPV, InsertExtractFunctor(ctx));
             GBE_ASSERTM(x.type == ir::TYPE_U32 || x.type == ir::TYPE_S32, "Invalid sampler type");
-            ir::Register offsetReg(x.data.u32);
-            srcTupleData.push_back(offsetReg);
-#else
-            ir::Register offsetReg(0);
+            samplerOffset = x.data.u32;
 #endif
-            srcTupleData.push_back(offsetReg);
             const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], elemNum);
-            const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 6);
+            const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 3);
 
             ir::Type srcType = ir::TYPE_S32, dstType = ir::TYPE_U32;
 
@@ -2324,7 +2463,8 @@ namespace gbe
                 GBE_ASSERT(0); // never been here.
             }
 
-            ctx.SAMPLE(dstTuple, srcTuple, dstType, srcType);
+            ctx.SAMPLE(surfaceID, dstTuple, srcTuple, dstType == ir::TYPE_FLOAT,
+                       srcType == ir::TYPE_FLOAT, sampler, samplerOffset, is3D);
             break;
           }
           case GEN_OCL_WRITE_IMAGE0:
@@ -2340,18 +2480,20 @@ namespace gbe
           case GEN_OCL_WRITE_IMAGE14:
           case GEN_OCL_WRITE_IMAGE15:
           {
-            GBE_ASSERT(AI != AE); const ir::Register surface_id = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register surfaceReg = this->getRegister(*AI); ++AI;
+            const uint8_t surfaceID = ctx.getFunction().getImageSet()->getIdx(surfaceReg);
             GBE_ASSERT(AI != AE); const ir::Register ucoord = this->getRegister(*AI); ++AI;
             GBE_ASSERT(AI != AE); const ir::Register vcoord = this->getRegister(*AI); ++AI;
             ir::Register wcoord;
+            bool is3D = false;
             if(it->second >= GEN_OCL_WRITE_IMAGE10 && it->second <= GEN_OCL_WRITE_IMAGE15) {
               GBE_ASSERT(AI != AE); wcoord = this->getRegister(*AI); ++AI;
+              is3D = true;
             } else
-              wcoord = ir::Register(0);
+              wcoord = ucoord; // not used, just padding.
             GBE_ASSERT(AI != AE);
             vector<ir::Register> srcTupleData;
 
-            srcTupleData.push_back(surface_id);
             srcTupleData.push_back(ucoord);
             srcTupleData.push_back(vcoord);
             srcTupleData.push_back(wcoord);
@@ -2361,7 +2503,7 @@ namespace gbe
               const ir::Register reg = this->getRegister(*AI, elemID);
               srcTupleData.push_back(reg);
             }
-            const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 8);
+            const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 7);
 
             ir::Type srcType = ir::TYPE_U32, coordType = ir::TYPE_U32;
 
@@ -2392,7 +2534,7 @@ namespace gbe
                 GBE_ASSERT(0); // never been here.
             }
 
-            ctx.TYPED_WRITE(srcTuple, srcType, coordType);
+            ctx.TYPED_WRITE(surfaceID, srcTuple, srcType, coordType, is3D);
             break;
           }
           case GEN_OCL_MUL_HI_INT:
@@ -2513,6 +2655,14 @@ namespace gbe
             ctx.I64MADSAT(getUnsignedType(ctx, I.getType()), dst, src0, src1, src2);
             break;
            }
+          case GEN_OCL_MAD: {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src2 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.MAD(getType(ctx, I.getType()), dst, src0, src1, src2);
+            break;
+          }
           case GEN_OCL_HADD: {
             GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
             GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
@@ -2597,6 +2747,12 @@ namespace gbe
             DEF(ir::TYPE_U32, ir::TYPE_S32);
           case GEN_OCL_SAT_CONV_F32_TO_U32:
             DEF(ir::TYPE_U32, ir::TYPE_FLOAT);
+          case GEN_OCL_CONV_F16_TO_F32:
+            ctx.F16TO32(ir::TYPE_FLOAT, ir::TYPE_U16, getRegister(&I), getRegister(I.getOperand(0)));
+            break;
+          case GEN_OCL_CONV_F32_TO_F16:
+            ctx.F32TO16(ir::TYPE_U16, ir::TYPE_FLOAT, getRegister(&I), getRegister(I.getOperand(0)));
+            break;
 #undef DEF
           default: break;
         }
@@ -2611,32 +2767,23 @@ namespace gbe
     Value *src = I.getOperand(0);
     Type *elemType = I.getType()->getElementType();
     ir::ImmediateIndex immIndex;
-    bool needMultiply = true;
+    uint32_t elementSize = getTypeByteSize(unit, elemType);
 
     // Be aware, we manipulate pointers
     if (ctx.getPointerSize() == ir::POINTER_32_BITS)
-      immIndex = ctx.newImmediate(uint32_t(getTypeByteSize(unit, elemType)));
+      immIndex = ctx.newImmediate(uint32_t(elementSize));
     else
-      immIndex = ctx.newImmediate(uint64_t(getTypeByteSize(unit, elemType)));
+      immIndex = ctx.newImmediate(uint64_t(elementSize));
 
     // OK, we try to see if we know compile time the size we need to allocate
-    if (I.isArrayAllocation() == false) // one element allocated only
-      needMultiply = false;
-    else {
+    if (I.isArrayAllocation() == true) {
       Constant *CPV = dyn_cast<Constant>(src);
-      if (CPV) {
-        const uint64_t elemNum = processConstant<uint64_t>(CPV, U64CPVExtractFunctor(ctx));
-        ir::Immediate imm = ctx.getImmediate(immIndex);
-        imm.data.u64 = ALIGN(imm.data.u64 * elemNum, 4);
-        ctx.setImmediate(immIndex, imm);
-        needMultiply = false;
-      } else {
-        // Brutal but cheap way to get arrays aligned on 4 bytes: we just align
-        // the element on 4 bytes!
-        ir::Immediate imm = ctx.getImmediate(immIndex);
-        imm.data.u64 = ALIGN(imm.data.u64, 4);
-        ctx.setImmediate(immIndex, imm);
-      }
+      GBE_ASSERT(CPV);
+      const uint64_t elemNum = processConstant<uint64_t>(CPV, U64CPVExtractFunctor(ctx));
+      ir::Immediate imm = ctx.getImmediate(immIndex);
+      imm.data.u64 = ALIGN(imm.data.u64 * elemNum, 4);
+      elementSize *= elemNum;
+      ctx.setImmediate(immIndex, imm);
     }
 
     // Now emit the stream of instructions to get the allocated pointer
@@ -2645,21 +2792,28 @@ namespace gbe
     const ir::Register stack = ir::ocl::stackptr;
     const ir::Register reg = ctx.reg(pointerFamily);
     const ir::Immediate imm = ctx.getImmediate(immIndex);
-
+    uint32_t align = getAlignmentByte(unit, elemType);
+    // the code below assumes align is a power of 2
+    GBE_ASSERT(align && (align & (align-1)) == 0);
+
+    // align the stack pointer according to data alignment
+    if(align > 1) {
+      uint32_t prevStackPtr = ctx.getFunction().getStackSize();
+      uint32_t step = ((prevStackPtr + (align - 1)) & ~(align - 1)) - prevStackPtr;
+      if (step != 0) {
+        ir::ImmediateIndex stepImm = ctx.newIntegerImmediate(step, ir::TYPE_U32);
+        ir::Register stepReg = ctx.reg(ctx.getPointerFamily());
+        ctx.LOADI(ir::TYPE_S32, stepReg, stepImm);
+        ctx.ADD(ir::TYPE_U32, stack, stack, stepReg);
+        ctx.getFunction().pushStackSize(step);
+      }
+    }
     // Set the destination register properly
     ctx.MOV(imm.type, dst, stack);
 
-    // Easy case, we just increment the stack pointer
-    if (needMultiply == false) {
-      ctx.LOADI(imm.type, reg, immIndex);
-      ctx.ADD(imm.type, stack, stack, reg);
-    }
-    // Harder case (variable length array) that requires a multiply
-    else {
-      ctx.LOADI(imm.type, reg, immIndex);
-      ctx.MUL(imm.type, reg, this->getRegister(src), reg);
-      ctx.ADD(imm.type, stack, stack, reg);
-    }
+    ctx.LOADI(imm.type, reg, immIndex);
+    ctx.ADD(imm.type, stack, stack, reg);
+    ctx.getFunction().pushStackSize(elementSize);
   }
 
   static INLINE Value *getLoadOrStoreValue(LoadInst &I) {
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index 55079f5..389d5f3 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -84,8 +84,12 @@ namespace gbe
   /*! Remove the GEP instructions */
   llvm::BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit);
 
+  /*! Scalarize all vector op instructions */
   llvm::FunctionPass* createScalarizePass();
 
+  /*! Convert the Intrinsic call to gen function */
+  llvm::BasicBlockPass *createIntrinsicLoweringPass();
+
 } /* namespace gbe */
 
 #endif /* __GBE_LLVM_GEN_BACKEND_HPP__ */
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 3f44be8..de2890c 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -25,12 +25,14 @@ DECL_LLVM_GEN_FUNCTION(SIN, __gen_ocl_sin)
 DECL_LLVM_GEN_FUNCTION(SQR, __gen_ocl_sqrt)
 DECL_LLVM_GEN_FUNCTION(RSQ, __gen_ocl_rsqrt)
 DECL_LLVM_GEN_FUNCTION(LOG, __gen_ocl_log)
+DECL_LLVM_GEN_FUNCTION(EXP, __gen_ocl_exp)
 DECL_LLVM_GEN_FUNCTION(POW, __gen_ocl_pow)
 DECL_LLVM_GEN_FUNCTION(RCP, __gen_ocl_rcp)
 DECL_LLVM_GEN_FUNCTION(RNDZ, __gen_ocl_rndz)
 DECL_LLVM_GEN_FUNCTION(RNDE, __gen_ocl_rnde)
 DECL_LLVM_GEN_FUNCTION(RNDU, __gen_ocl_rndu)
 DECL_LLVM_GEN_FUNCTION(RNDD, __gen_ocl_rndd)
+DECL_LLVM_GEN_FUNCTION(MAD, __gen_ocl_mad)
 
 // Barrier function
 DECL_LLVM_GEN_FUNCTION(LBARRIER, __gen_ocl_barrier_local)
@@ -42,19 +44,19 @@ DECL_LLVM_GEN_FUNCTION(FORCE_SIMD8,  __gen_ocl_force_simd8)
 DECL_LLVM_GEN_FUNCTION(FORCE_SIMD16, __gen_ocl_force_simd16)
 
 // To read_image functions.
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE0, _Z21__gen_ocl_read_imageijjiij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE1, _Z21__gen_ocl_read_imageijjffj)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE2, _Z22__gen_ocl_read_imageuijjiij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE3, _Z22__gen_ocl_read_imageuijjffj)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE4, _Z21__gen_ocl_read_imagefjjiij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE5, _Z21__gen_ocl_read_imagefjjffj)
-
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE10, _Z21__gen_ocl_read_imageijjiiij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE11, _Z21__gen_ocl_read_imageijjfffj)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE12, _Z22__gen_ocl_read_imageuijjiiij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE13, _Z22__gen_ocl_read_imageuijjfffj)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE14, _Z21__gen_ocl_read_imagefjjiiij)
-DECL_LLVM_GEN_FUNCTION(READ_IMAGE15, _Z21__gen_ocl_read_imagefjjfffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE0, _Z21__gen_ocl_read_imageijtiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE1, _Z21__gen_ocl_read_imageijtffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE2, _Z22__gen_ocl_read_imageuijtiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE3, _Z22__gen_ocl_read_imageuijtffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE4, _Z21__gen_ocl_read_imagefjtiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE5, _Z21__gen_ocl_read_imagefjtffj)
+
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE10, _Z21__gen_ocl_read_imageijtiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE11, _Z21__gen_ocl_read_imageijtfffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE12, _Z22__gen_ocl_read_imageuijtiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE13, _Z22__gen_ocl_read_imageuijtfffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE14, _Z21__gen_ocl_read_imagefjtiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE15, _Z21__gen_ocl_read_imagefjtfffj)
 
 // To write_image functions.
 DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE0, _Z22__gen_ocl_write_imageijiiDv4_i)
@@ -177,3 +179,6 @@ DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_I32, _Z15convert_int_satf)
 
 DECL_LLVM_GEN_FUNCTION(SAT_CONV_I32_TO_U32, _Z16convert_uint_sati)
 DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_U32, _Z16convert_uint_satf)
+
+DECL_LLVM_GEN_FUNCTION(CONV_F16_TO_F32, __gen_ocl_f16to32)
+DECL_LLVM_GEN_FUNCTION(CONV_F32_TO_F16, __gen_ocl_f32to16)
\ No newline at end of file
diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp
new file mode 100644
index 0000000..1942860
--- /dev/null
+++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * \file llvm_intrinsic_lowering.cpp
+ * \author Yang Rong <rong.r.yang at intel.com>
+ */
+
+#include "llvm/Config/config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "sys/map.hpp"
+
+
+using namespace llvm;
+
+namespace gbe {
+    class InstrinsicLowering : public BasicBlockPass
+    {
+    public:
+      static char ID;
+      InstrinsicLowering() :
+        BasicBlockPass(ID) {}
+
+      void getAnalysisUsage(AnalysisUsage &AU) const {
+
+      }
+
+      virtual const char *getPassName() const {
+        return "SPIR backend: lowering instrinsics";
+      }
+      static char convertSpaceToName(Value *val) {
+        const uint32_t space = val->getType()->getPointerAddressSpace();
+        switch(space) {
+          case 0:
+            return 'p';
+          case 1:
+            return 'g';
+          case 3:
+            return 'l';
+          default:
+            assert("Non support address space");
+            return '\0';
+        }
+      }
+      static CallInst *replaceCallWith(const char *NewFn, CallInst *CI,
+                                     Value **ArgBegin, Value **ArgEnd,
+                                     Type *RetTy)
+      {
+        // If we haven't already looked up this function, check to see if the
+        // program already contains a function with this name.
+        Module *M = CI->getParent()->getParent()->getParent();
+        // Get or insert the definition now.
+        std::vector<Type *> ParamTys;
+        for (Value** I = ArgBegin; I != ArgEnd; ++I)
+          ParamTys.push_back((*I)->getType());
+        Constant* FCache = M->getOrInsertFunction(NewFn,
+                                        FunctionType::get(RetTy, ParamTys, false));
+
+        IRBuilder<> Builder(CI->getParent(), CI);
+        SmallVector<Value *, 8> Args(ArgBegin, ArgEnd);
+        CallInst *NewCI = Builder.CreateCall(FCache, Args);
+        NewCI->setName(CI->getName());
+        if (!CI->use_empty())
+          CI->replaceAllUsesWith(NewCI);
+        CI->eraseFromParent();
+        return NewCI;
+      }
+      virtual bool runOnBasicBlock(BasicBlock &BB)
+      {
+        bool changedBlock = false;
+        Module *M = BB.getParent()->getParent();
+
+        DataLayout TD(M);
+        LLVMContext &Context = BB.getContext();
+        for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
+          Instruction *Inst = DI++;
+          CallInst* CI = dyn_cast<CallInst>(Inst);
+          if(CI == NULL)
+            continue;
+
+          IRBuilder<> Builder(&BB, CI);
+          // only support memcpy and memset
+          if (Function *F = CI->getCalledFunction()) {
+            const Intrinsic::ID intrinsicID = (Intrinsic::ID) F->getIntrinsicID();
+            if (intrinsicID == 0)
+              continue;
+            switch (intrinsicID) {
+              case Intrinsic::memcpy: {
+                Type *IntPtr = TD.getIntPtrType(Context);
+                Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+                                                    /* isSigned */ false);
+                Value *Ops[3];
+                Ops[0] = CI->getArgOperand(0);
+                Ops[1] = CI->getArgOperand(1);
+                Ops[2] = Size;
+                char name[16] = "__gen_memcpy_xx";
+                name[13] = convertSpaceToName(Ops[0]);
+                name[14] = convertSpaceToName(Ops[1]);
+                replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
+                break;
+              }
+              case Intrinsic::memset: {
+                Value *Op0 = CI->getArgOperand(0);
+                Value *val = Builder.CreateIntCast(CI->getArgOperand(1), IntegerType::getInt8Ty(Context),
+                                                    /* isSigned */ false);
+                Type *IntPtr = TD.getIntPtrType(Op0->getType());
+                Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+                                                    /* isSigned */ false);
+                Value *Ops[3];
+                Ops[0] = Op0;
+                // The fill value, already cast to i8 above.
+                Ops[1] = val;
+                Ops[2] = Size;
+                char name[16] = "__gen_memset_x";
+                name[13] = convertSpaceToName(Ops[0]);
+                replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
+                break;
+              }
+              default:
+                continue;
+            }
+          }
+        }
+        return changedBlock;
+      }
+    };
+
+    char InstrinsicLowering::ID = 0;
+
+    BasicBlockPass *createIntrinsicLoweringPass() {
+      return new InstrinsicLowering();
+    }
+} // end namespace
diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
index 60c9df1..d30a570 100644
--- a/backend/src/llvm/llvm_passes.cpp
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -64,7 +64,13 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
+#include "llvm/IR/Mangler.h"
+#else
 #include "llvm/Target/Mangler.h"
+#endif
+
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
@@ -128,8 +134,6 @@ namespace gbe
 
   uint32_t getAlignmentByte(const ir::Unit &unit, Type* Ty)
   {
-    const uint32_t MAX_ALIGN = 8; //maximum size is 8 for doubles
-
     switch (Ty->getTypeID()) {
       case Type::VoidTyID: NOT_SUPPORTED;
       case Type::VectorTyID:
@@ -143,6 +147,7 @@ namespace gbe
       case Type::IntegerTyID:
       case Type::FloatTyID:
       case Type::DoubleTyID:
+      case Type::HalfTyID:
         return getTypeBitSize(unit, Ty)/8;
       case Type::ArrayTyID:
         return getAlignmentByte(unit, cast<ArrayType>(Ty)->getElementType());
@@ -153,8 +158,6 @@ namespace gbe
         for(uint32_t subtype = 0; subtype < StrTy->getNumElements(); subtype++)
         {
           maxa = std::max(getAlignmentByte(unit, StrTy->getElementType(subtype)), maxa);
-          if(maxa==MAX_ALIGN)
-            return maxa;
         }
         return maxa;
       }
@@ -169,6 +172,7 @@ namespace gbe
       case Type::VoidTyID:    NOT_SUPPORTED;
       case Type::PointerTyID: return unit.getPointerSize();
       case Type::IntegerTyID: return cast<IntegerType>(Ty)->getBitWidth();
+      case Type::HalfTyID:    return 16;
       case Type::FloatTyID:   return 32;
       case Type::DoubleTyID:  return 64;
       case Type::VectorTyID:
@@ -234,7 +238,7 @@ namespace gbe
     }
 
     virtual const char *getPassName() const {
-      return "PTX backend: insert special ptx instructions";
+      return "SPIR backend: insert special spir instructions";
     }
 
     bool simplifyGEPInstructions(GetElementPtrInst* GEPInst);
@@ -276,18 +280,28 @@ namespace gbe
       {
         uint32_t offset = 0;
         TypeIndex = ConstOP->getZExtValue();
-        for(uint32_t ty_i=0; ty_i<TypeIndex; ty_i++)
-        {
-          Type* elementType = CompTy->getTypeAtIndex(ty_i);
-          uint32_t align = getAlignmentByte(unit, elementType);
+        if (op == 1) {
+          if (TypeIndex != 0) {
+            Type *elementType = (cast<PointerType>(parentPointer->getType()))->getElementType();
+            uint32_t elementSize = getTypeByteSize(unit, elementType);
+            uint32_t align = getAlignmentByte(unit, elementType);
+            elementSize += getPadding(elementSize, align);
+            offset += elementSize * TypeIndex;
+          }
+        } else {
+          for(uint32_t ty_i=0; ty_i<TypeIndex; ty_i++)
+          {
+            Type* elementType = CompTy->getTypeAtIndex(ty_i);
+            uint32_t align = getAlignmentByte(unit, elementType);
+            offset += getPadding(offset, align);
+            offset += getTypeByteSize(unit, elementType);
+          }
+
+          //add padding for accessed type
+          const uint32_t align = getAlignmentByte(unit, CompTy->getTypeAtIndex(TypeIndex));
           offset += getPadding(offset, align);
-          offset += getTypeByteSize(unit, elementType);
         }
 
-        //add getPaddingding for accessed type
-        const uint32_t align = getAlignmentByte(unit, CompTy->getTypeAtIndex(TypeIndex));
-        offset += getPadding(offset, align);
-
         constantOffset += offset;
       }
       // none constant index (=> only array/verctor allowed)
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index a29bc59..c1790f7 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -66,7 +66,7 @@
 #include "llvm/Config/config.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/PostOrderIterator.h"
-#if LLVM_VERSION_MINOR <= 2
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
 #include "llvm/Function.h"
 #include "llvm/InstrTypes.h"
 #include "llvm/Instructions.h"
@@ -80,7 +80,7 @@
 #include "llvm/IR/Module.h"
 #endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/Pass.h"
-#if LLVM_VERSION_MINOR <= 1
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 1
 #include "llvm/Support/IRBuilder.h"
 #elif LLVM_VERSION_MINOR == 2
 #include "llvm/IRBuilder.h"
@@ -94,7 +94,6 @@
 #include "llvm/llvm_gen_backend.hpp"
 #include "sys/map.hpp"
 
-
 using namespace llvm;
 
 namespace gbe {
@@ -128,7 +127,11 @@ namespace gbe {
     Scalarize() : FunctionPass(ID)
     {
       initializeLoopInfoPass(*PassRegistry::getPassRegistry());
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+      initializeDominatorTreeWrapperPassPass(*PassRegistry::getPassRegistry());
+#else
       initializeDominatorTreePass(*PassRegistry::getPassRegistry());
+#endif
     }
 
     virtual bool runOnFunction(Function&);
@@ -143,6 +146,7 @@ namespace gbe {
     // Take an instruction that produces a vector, and scalarize it
     bool scalarize(Instruction*);
     bool scalarizePerComponent(Instruction*);
+    bool scalarizeBitCast(BitCastInst *);
     bool scalarizeFuncCall(CallInst *);
     bool scalarizeLoad(LoadInst*);
     bool scalarizeStore(StoreInst*);
@@ -221,6 +225,12 @@ namespace gbe {
       return GetComponentCount(value->getType());
     }
 
+    /* Move the builder's insertion point to just after insn, so that newly created instructions follow insn. */
+    void setAppendPoint(Instruction *insn)  {
+      BasicBlock::iterator next(insn);
+      builder->SetInsertPoint(++next);
+    }
+
     DenseMap<Value*, VectorValues> vectorVals;
     Module* module;
     IRBuilder<>* builder;
@@ -491,6 +501,10 @@ namespace gbe {
     if (IsPerComponentOp(inst))
       return scalarizePerComponent(inst);
 
+    // Not a per-component bitcast, for example <2 x i8> -> i16; the cast itself is handled in the backend.
+    if (BitCastInst* bt = dyn_cast<BitCastInst>(inst))
+      return scalarizeBitCast(bt);
+
     if (LoadInst* ld = dyn_cast<LoadInst>(inst))
       return scalarizeLoad(ld);
 
@@ -644,6 +658,7 @@ namespace gbe {
           case GEN_OCL_GET_IMAGE_WIDTH:
           case GEN_OCL_GET_IMAGE_HEIGHT:
           {
+            setAppendPoint(call);
             extractFromVector(call);
             break;
           }
@@ -670,8 +685,18 @@ namespace gbe {
     return false;
   }
 
+  bool Scalarize::scalarizeBitCast(BitCastInst* bt)
+  {
+    if(bt->getOperand(0)->getType()->isVectorTy())
+      bt->setOperand(0, InsertToVector(bt, bt->getOperand(0)));
+    if(bt->getType()->isVectorTy())
+      extractFromVector(bt);
+    return false;
+  }
+
   bool Scalarize::scalarizeLoad(LoadInst* ld)
   {
+    setAppendPoint(ld);
     extractFromVector(ld);
     return false;
   }
@@ -745,7 +770,7 @@ namespace gbe {
   bool Scalarize::runOnFunction(Function& F)
   {
     switch (F.getCallingConv()) {
-#if LLVM_VERSION_MINOR <= 2
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
     case CallingConv::PTX_Device:
       return false;
     case CallingConv::PTX_Kernel:
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index 111514f..8b2ac04 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -23,16 +23,22 @@
  */
 
 #include "llvm/Config/config.h"
-#if LLVM_VERSION_MINOR <= 2
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
 #include "llvm/LLVMContext.h"
 #include "llvm/Module.h"
+#include "llvm/DataLayout.h"
 #else
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/DataLayout.h"
 #endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/PassManager.h"
 #include "llvm/Pass.h"
-#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/ADT/Triple.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
 #include "llvm/Support/IRReader.h"
 #else
 #include "llvm/IRReader/IRReader.h"
@@ -40,7 +46,14 @@
 #endif  /* LLVM_VERSION_MINOR <= 2 */
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/Verifier.h"
+#else
+#include "llvm/Analysis/Verifier.h"
 #include "llvm/Assembly/PrintModulePass.h"
+#endif
 
 #include "llvm/llvm_gen_backend.hpp"
 #include "llvm/llvm_to_gen.hpp"
@@ -56,11 +69,96 @@ namespace gbe
 {
   BVAR(OCL_OUTPUT_LLVM, false);
   BVAR(OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS, false);
+  using namespace llvm;
+
+  void runFuntionPass(Module &mod, TargetLibraryInfo *libraryInfo)
+  {
+    FunctionPassManager FPM(&mod);
+    FPM.add(new DataLayout(&mod));
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
+    FPM.add(createVerifierPass(true));
+#else
+    FPM.add(createVerifierPass());
+#endif
+    FPM.add(new TargetLibraryInfo(*libraryInfo));
+    FPM.add(createTypeBasedAliasAnalysisPass());
+    FPM.add(createBasicAliasAnalysisPass());
+    FPM.add(createCFGSimplificationPass());
+    FPM.add(createSROAPass());
+    FPM.add(createEarlyCSEPass());
+    FPM.add(createLowerExpectIntrinsicPass());
+
+    FPM.doInitialization();
+    for (Module::iterator I = mod.begin(),
+           E = mod.end(); I != E; ++I)
+      if (!I->isDeclaration())
+        FPM.run(*I);
+    FPM.doFinalization();
+  }
 
-  bool llvmToGen(ir::Unit &unit, const char *fileName)
+  void runModulePass(Module &mod, TargetLibraryInfo *libraryInfo, int optLevel)
   {
-    using namespace llvm;
+    llvm::PassManager MPM;
+
+    MPM.add(new DataLayout(&mod));
+    MPM.add(new TargetLibraryInfo(*libraryInfo));
+    MPM.add(createTypeBasedAliasAnalysisPass());
+    MPM.add(createBasicAliasAnalysisPass());
+    MPM.add(createGlobalOptimizerPass());     // Optimize out global vars
+
+    MPM.add(createIPSCCPPass());              // IP SCCP
+    MPM.add(createDeadArgEliminationPass());  // Dead argument elimination
+
+    MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
+    MPM.add(createCFGSimplificationPass());   // Clean up after IPCP & DAE
+    MPM.add(createPruneEHPass());             // Remove dead EH info
+    MPM.add(createFunctionInliningPass(200000));
+    MPM.add(createFunctionAttrsPass());       // Set readonly/readnone attrs
+
+    //MPM.add(createScalarReplAggregatesPass(64, true, -1, -1, 64))
+    //MPM.add(createSROAPass(/*RequiresDomTree*/ false));
+    MPM.add(createEarlyCSEPass());              // Catch trivial redundancies
+    MPM.add(createJumpThreadingPass());         // Thread jumps.
+    MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
+    MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
+    MPM.add(createInstructionCombiningPass());  // Combine silly seq's
+
+    MPM.add(createTailCallEliminationPass());   // Eliminate tail calls
+    MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
+    MPM.add(createReassociatePass());           // Reassociate expressions
+    MPM.add(createLoopRotatePass());            // Rotate Loop
+    MPM.add(createLICMPass());                  // Hoist loop invariants
+    MPM.add(createLoopUnswitchPass(true));
+    MPM.add(createInstructionCombiningPass());
+    MPM.add(createIndVarSimplifyPass());        // Canonicalize indvars
+    MPM.add(createLoopIdiomPass());             // Recognize idioms like memset.
+    MPM.add(createLoopDeletionPass());          // Delete dead loops
+    MPM.add(createLoopUnrollPass());          // Unroll small loops
+    if(optLevel > 0)
+      MPM.add(createGVNPass(true));                 // Remove redundancies
+    MPM.add(createMemCpyOptPass());             // Remove memcpy / form memset
+    MPM.add(createSCCPPass());                  // Constant prop with SCCP
+
+    // Run instcombine after redundancy elimination to exploit opportunities
+    // opened up by them.
+    MPM.add(createInstructionCombiningPass());
+    MPM.add(createJumpThreadingPass());         // Thread jumps
+    MPM.add(createCorrelatedValuePropagationPass());
+    MPM.add(createDeadStoreEliminationPass());  // Delete dead stores
+    MPM.add(createAggressiveDCEPass());         // Delete dead instructions
+    MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+    MPM.add(createInstructionCombiningPass());  // Clean up after everything.
+    MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
+    if(optLevel > 0) {
+      MPM.add(createGlobalDCEPass());         // Remove dead fns and globals.
+      MPM.add(createConstantMergePass());     // Merge dup global constants
+    }
 
+    MPM.run(mod);
+  }
+
+  bool llvmToGen(ir::Unit &unit, const char *fileName, int optLevel)
+  {
     // Get the global LLVM context
     llvm::LLVMContext& c = llvm::getGlobalContext();
     std::string errInfo;
@@ -75,27 +173,43 @@ namespace gbe
     if (M.get() == 0) return false;
     Module &mod = *M.get();
 
+    Triple TargetTriple(mod.getTargetTriple());
+    TargetLibraryInfo *libraryInfo = new TargetLibraryInfo(TargetTriple);
+    libraryInfo->disableAllFunctions();
+
+    runFuntionPass(mod, libraryInfo);
+    runModulePass(mod, libraryInfo, optLevel);
+
     llvm::PassManager passes;
 
     // Print the code before further optimizations
     if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+      passes.add(createPrintModulePass(*o));
+#else
       passes.add(createPrintModulePass(&*o));
-    passes.add(createScalarizePass());        // Expand all vector ops
+#endif
+    passes.add(createIntrinsicLoweringPass());
+    passes.add(createFunctionInliningPass(200000));
     passes.add(createScalarReplAggregatesPass()); // Break up allocas
     passes.add(createRemoveGEPPass(unit));
     passes.add(createConstantPropagationPass());
-    passes.add(createDeadInstEliminationPass());  // Remove simplified instructions
     passes.add(createLowerSwitchPass());
     passes.add(createPromoteMemoryToRegisterPass());
     passes.add(createGVNPass());                  // Remove redundancies
+    passes.add(createScalarizePass());        // Expand all vector ops
+    passes.add(createDeadInstEliminationPass());  // Remove simplified instructions
     passes.add(createGenPass(unit));
 
     // Print the code extra optimization passes
     if (OCL_OUTPUT_LLVM)
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+      passes.add(createPrintModulePass(*o));
+#else
       passes.add(createPrintModulePass(&*o));
+#endif
     passes.run(mod);
 
     return true;
   }
 } /* namespace gbe */
-
diff --git a/backend/src/llvm/llvm_to_gen.hpp b/backend/src/llvm/llvm_to_gen.hpp
index 4006667..50ea267 100644
--- a/backend/src/llvm/llvm_to_gen.hpp
+++ b/backend/src/llvm/llvm_to_gen.hpp
@@ -30,8 +30,9 @@ namespace gbe {
     class Unit;
   } /* namespace ir */
 
-  /*! Convert the LLVM IR code to a GEN IR code */
-  bool llvmToGen(ir::Unit &unit, const char *fileName);
+  /*! Convert the LLVM IR code to GEN IR code.
+      optLevel 0 is equivalent to clang -O1 and optLevel 1 to clang -O2. */
+  bool llvmToGen(ir::Unit &unit, const char *fileName, int optLevel);
 
 } /* namespace gbe */
 
diff --git a/backend/src/ocl_as.h b/backend/src/ocl_as.h
index af98d53..692e892 100644
--- a/backend/src/ocl_as.h
+++ b/backend/src/ocl_as.h
@@ -96,29 +96,14 @@ INLINE OVERLOADABLE char2 as_char2(uchar2 v) {
   return u._char2;
 }
 
-union _type_cast_3_b {
-  char3 _char3;
-  uchar3 _uchar3;
-};
-
-INLINE OVERLOADABLE uchar3 as_uchar3(char3 v) {
-  union _type_cast_3_b u;
-  u._char3 = v;
-  return u._uchar3;
-}
-
-INLINE OVERLOADABLE char3 as_char3(uchar3 v) {
-  union _type_cast_3_b u;
-  u._uchar3 = v;
-  return u._char3;
-}
-
 union _type_cast_4_b {
   int _int;
   uint _uint;
   short2 _short2;
   ushort2 _ushort2;
+  char3 _char3;
   char4 _char4;
+  uchar3 _uchar3;
   uchar4 _uchar4;
   float _float;
 };
@@ -141,12 +126,24 @@ INLINE OVERLOADABLE ushort2 as_ushort2(int v) {
   return u._ushort2;
 }
 
+INLINE OVERLOADABLE char3 as_char3(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._char3;
+}
+
 INLINE OVERLOADABLE char4 as_char4(int v) {
   union _type_cast_4_b u;
   u._int = v;
   return u._char4;
 }
 
+INLINE OVERLOADABLE uchar3 as_uchar3(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._uchar3;
+}
+
 INLINE OVERLOADABLE uchar4 as_uchar4(int v) {
   union _type_cast_4_b u;
   u._int = v;
@@ -177,12 +174,24 @@ INLINE OVERLOADABLE ushort2 as_ushort2(uint v) {
   return u._ushort2;
 }
 
+INLINE OVERLOADABLE char3 as_char3(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._char3;
+}
+
 INLINE OVERLOADABLE char4 as_char4(uint v) {
   union _type_cast_4_b u;
   u._uint = v;
   return u._char4;
 }
 
+INLINE OVERLOADABLE uchar3 as_uchar3(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._uchar3;
+}
+
 INLINE OVERLOADABLE uchar4 as_uchar4(uint v) {
   union _type_cast_4_b u;
   u._uint = v;
@@ -213,12 +222,24 @@ INLINE OVERLOADABLE ushort2 as_ushort2(short2 v) {
   return u._ushort2;
 }
 
+INLINE OVERLOADABLE char3 as_char3(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._char3;
+}
+
 INLINE OVERLOADABLE char4 as_char4(short2 v) {
   union _type_cast_4_b u;
   u._short2 = v;
   return u._char4;
 }
 
+INLINE OVERLOADABLE uchar3 as_uchar3(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._uchar3;
+}
+
 INLINE OVERLOADABLE uchar4 as_uchar4(short2 v) {
   union _type_cast_4_b u;
   u._short2 = v;
@@ -249,12 +270,24 @@ INLINE OVERLOADABLE short2 as_short2(ushort2 v) {
   return u._short2;
 }
 
+INLINE OVERLOADABLE char3 as_char3(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._char3;
+}
+
 INLINE OVERLOADABLE char4 as_char4(ushort2 v) {
   union _type_cast_4_b u;
   u._ushort2 = v;
   return u._char4;
 }
 
+INLINE OVERLOADABLE uchar3 as_uchar3(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._uchar3;
+}
+
 INLINE OVERLOADABLE uchar4 as_uchar4(ushort2 v) {
   union _type_cast_4_b u;
   u._ushort2 = v;
@@ -267,6 +300,48 @@ INLINE OVERLOADABLE float as_float(ushort2 v) {
   return u._float;
 }
 
+INLINE OVERLOADABLE int as_int(char3 v) {
+  union _type_cast_4_b u;
+  u._char3 = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(char3 v) {
+  union _type_cast_4_b u;
+  u._char3 = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(char3 v) {
+  union _type_cast_4_b u;
+  u._char3 = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(char3 v) {
+  union _type_cast_4_b u;
+  u._char3 = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE uchar3 as_uchar3(char3 v) {
+  union _type_cast_4_b u;
+  u._char3 = v;
+  return u._uchar3;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(char3 v) {
+  union _type_cast_4_b u;
+  u._char3 = v;
+  return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(char3 v) {
+  union _type_cast_4_b u;
+  u._char3 = v;
+  return u._float;
+}
+
 INLINE OVERLOADABLE int as_int(char4 v) {
   union _type_cast_4_b u;
   u._char4 = v;
@@ -291,6 +366,12 @@ INLINE OVERLOADABLE ushort2 as_ushort2(char4 v) {
   return u._ushort2;
 }
 
+INLINE OVERLOADABLE uchar3 as_uchar3(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._uchar3;
+}
+
 INLINE OVERLOADABLE uchar4 as_uchar4(char4 v) {
   union _type_cast_4_b u;
   u._char4 = v;
@@ -303,6 +384,48 @@ INLINE OVERLOADABLE float as_float(char4 v) {
   return u._float;
 }
 
+INLINE OVERLOADABLE int as_int(uchar3 v) {
+  union _type_cast_4_b u;
+  u._uchar3 = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(uchar3 v) {
+  union _type_cast_4_b u;
+  u._uchar3 = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(uchar3 v) {
+  union _type_cast_4_b u;
+  u._uchar3 = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(uchar3 v) {
+  union _type_cast_4_b u;
+  u._uchar3 = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE char3 as_char3(uchar3 v) {
+  union _type_cast_4_b u;
+  u._uchar3 = v;
+  return u._char3;
+}
+
+INLINE OVERLOADABLE char4 as_char4(uchar3 v) {
+  union _type_cast_4_b u;
+  u._uchar3 = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE float as_float(uchar3 v) {
+  union _type_cast_4_b u;
+  u._uchar3 = v;
+  return u._float;
+}
+
 INLINE OVERLOADABLE int as_int(uchar4 v) {
   union _type_cast_4_b u;
   u._uchar4 = v;
@@ -327,6 +450,12 @@ INLINE OVERLOADABLE ushort2 as_ushort2(uchar4 v) {
   return u._ushort2;
 }
 
+INLINE OVERLOADABLE char3 as_char3(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._char3;
+}
+
 INLINE OVERLOADABLE char4 as_char4(uchar4 v) {
   union _type_cast_4_b u;
   u._uchar4 = v;
@@ -363,33 +492,28 @@ INLINE OVERLOADABLE ushort2 as_ushort2(float v) {
   return u._ushort2;
 }
 
-INLINE OVERLOADABLE char4 as_char4(float v) {
+INLINE OVERLOADABLE char3 as_char3(float v) {
   union _type_cast_4_b u;
   u._float = v;
-  return u._char4;
+  return u._char3;
 }
 
-INLINE OVERLOADABLE uchar4 as_uchar4(float v) {
+INLINE OVERLOADABLE char4 as_char4(float v) {
   union _type_cast_4_b u;
   u._float = v;
-  return u._uchar4;
+  return u._char4;
 }
 
-union _type_cast_6_b {
-  short3 _short3;
-  ushort3 _ushort3;
-};
-
-INLINE OVERLOADABLE ushort3 as_ushort3(short3 v) {
-  union _type_cast_6_b u;
-  u._short3 = v;
-  return u._ushort3;
+INLINE OVERLOADABLE uchar3 as_uchar3(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._uchar3;
 }
 
-INLINE OVERLOADABLE short3 as_short3(ushort3 v) {
-  union _type_cast_6_b u;
-  u._ushort3 = v;
-  return u._short3;
+INLINE OVERLOADABLE uchar4 as_uchar4(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._uchar4;
 }
 
 union _type_cast_8_b {
@@ -397,7 +521,9 @@ union _type_cast_8_b {
   ulong _ulong;
   int2 _int2;
   uint2 _uint2;
+  short3 _short3;
   short4 _short4;
+  ushort3 _ushort3;
   ushort4 _ushort4;
   char8 _char8;
   uchar8 _uchar8;
@@ -423,12 +549,24 @@ INLINE OVERLOADABLE uint2 as_uint2(long v) {
   return u._uint2;
 }
 
+INLINE OVERLOADABLE short3 as_short3(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._short3;
+}
+
 INLINE OVERLOADABLE short4 as_short4(long v) {
   union _type_cast_8_b u;
   u._long = v;
   return u._short4;
 }
 
+INLINE OVERLOADABLE ushort3 as_ushort3(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._ushort3;
+}
+
 INLINE OVERLOADABLE ushort4 as_ushort4(long v) {
   union _type_cast_8_b u;
   u._long = v;
@@ -477,12 +615,24 @@ INLINE OVERLOADABLE uint2 as_uint2(ulong v) {
   return u._uint2;
 }
 
+INLINE OVERLOADABLE short3 as_short3(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._short3;
+}
+
 INLINE OVERLOADABLE short4 as_short4(ulong v) {
   union _type_cast_8_b u;
   u._ulong = v;
   return u._short4;
 }
 
+INLINE OVERLOADABLE ushort3 as_ushort3(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._ushort3;
+}
+
 INLINE OVERLOADABLE ushort4 as_ushort4(ulong v) {
   union _type_cast_8_b u;
   u._ulong = v;
@@ -531,12 +681,24 @@ INLINE OVERLOADABLE uint2 as_uint2(int2 v) {
   return u._uint2;
 }
 
+INLINE OVERLOADABLE short3 as_short3(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._short3;
+}
+
 INLINE OVERLOADABLE short4 as_short4(int2 v) {
   union _type_cast_8_b u;
   u._int2 = v;
   return u._short4;
 }
 
+INLINE OVERLOADABLE ushort3 as_ushort3(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._ushort3;
+}
+
 INLINE OVERLOADABLE ushort4 as_ushort4(int2 v) {
   union _type_cast_8_b u;
   u._int2 = v;
@@ -585,12 +747,24 @@ INLINE OVERLOADABLE int2 as_int2(uint2 v) {
   return u._int2;
 }
 
+INLINE OVERLOADABLE short3 as_short3(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._short3;
+}
+
 INLINE OVERLOADABLE short4 as_short4(uint2 v) {
   union _type_cast_8_b u;
   u._uint2 = v;
   return u._short4;
 }
 
+INLINE OVERLOADABLE ushort3 as_ushort3(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._ushort3;
+}
+
 INLINE OVERLOADABLE ushort4 as_ushort4(uint2 v) {
   union _type_cast_8_b u;
   u._uint2 = v;
@@ -621,88 +795,220 @@ INLINE OVERLOADABLE float2 as_float2(uint2 v) {
   return u._float2;
 }
 
-INLINE OVERLOADABLE long as_long(short4 v) {
+INLINE OVERLOADABLE long as_long(short3 v) {
   union _type_cast_8_b u;
-  u._short4 = v;
+  u._short3 = v;
   return u._long;
 }
 
-INLINE OVERLOADABLE ulong as_ulong(short4 v) {
+INLINE OVERLOADABLE ulong as_ulong(short3 v) {
   union _type_cast_8_b u;
-  u._short4 = v;
+  u._short3 = v;
   return u._ulong;
 }
 
-INLINE OVERLOADABLE int2 as_int2(short4 v) {
+INLINE OVERLOADABLE int2 as_int2(short3 v) {
   union _type_cast_8_b u;
-  u._short4 = v;
+  u._short3 = v;
   return u._int2;
 }
 
-INLINE OVERLOADABLE uint2 as_uint2(short4 v) {
+INLINE OVERLOADABLE uint2 as_uint2(short3 v) {
   union _type_cast_8_b u;
-  u._short4 = v;
+  u._short3 = v;
   return u._uint2;
 }
 
-INLINE OVERLOADABLE ushort4 as_ushort4(short4 v) {
+INLINE OVERLOADABLE ushort3 as_ushort3(short3 v) {
   union _type_cast_8_b u;
-  u._short4 = v;
+  u._short3 = v;
+  return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(short3 v) {
+  union _type_cast_8_b u;
+  u._short3 = v;
   return u._ushort4;
 }
 
-INLINE OVERLOADABLE char8 as_char8(short4 v) {
+INLINE OVERLOADABLE char8 as_char8(short3 v) {
   union _type_cast_8_b u;
-  u._short4 = v;
+  u._short3 = v;
   return u._char8;
 }
 
-INLINE OVERLOADABLE uchar8 as_uchar8(short4 v) {
+INLINE OVERLOADABLE uchar8 as_uchar8(short3 v) {
   union _type_cast_8_b u;
-  u._short4 = v;
+  u._short3 = v;
   return u._uchar8;
 }
 
-INLINE OVERLOADABLE double as_double(short4 v) {
+INLINE OVERLOADABLE double as_double(short3 v) {
   union _type_cast_8_b u;
-  u._short4 = v;
+  u._short3 = v;
   return u._double;
 }
 
-INLINE OVERLOADABLE float2 as_float2(short4 v) {
+INLINE OVERLOADABLE float2 as_float2(short3 v) {
   union _type_cast_8_b u;
-  u._short4 = v;
+  u._short3 = v;
   return u._float2;
 }
 
-INLINE OVERLOADABLE long as_long(ushort4 v) {
+INLINE OVERLOADABLE long as_long(short4 v) {
   union _type_cast_8_b u;
-  u._ushort4 = v;
+  u._short4 = v;
   return u._long;
 }
 
-INLINE OVERLOADABLE ulong as_ulong(ushort4 v) {
+INLINE OVERLOADABLE ulong as_ulong(short4 v) {
   union _type_cast_8_b u;
-  u._ushort4 = v;
+  u._short4 = v;
   return u._ulong;
 }
 
-INLINE OVERLOADABLE int2 as_int2(ushort4 v) {
+INLINE OVERLOADABLE int2 as_int2(short4 v) {
   union _type_cast_8_b u;
-  u._ushort4 = v;
+  u._short4 = v;
   return u._int2;
 }
 
-INLINE OVERLOADABLE uint2 as_uint2(ushort4 v) {
+INLINE OVERLOADABLE uint2 as_uint2(short4 v) {
   union _type_cast_8_b u;
-  u._ushort4 = v;
+  u._short4 = v;
   return u._uint2;
 }
 
-INLINE OVERLOADABLE short4 as_short4(ushort4 v) {
+INLINE OVERLOADABLE ushort3 as_ushort3(short4 v) {
   union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._short4;
+  u._short4 = v;
+  return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._short4;
 }
 
 INLINE OVERLOADABLE char8 as_char8(ushort4 v) {
@@ -753,12 +1059,24 @@ INLINE OVERLOADABLE uint2 as_uint2(char8 v) {
   return u._uint2;
 }
 
+INLINE OVERLOADABLE short3 as_short3(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._short3;
+}
+
 INLINE OVERLOADABLE short4 as_short4(char8 v) {
   union _type_cast_8_b u;
   u._char8 = v;
   return u._short4;
 }
 
+INLINE OVERLOADABLE ushort3 as_ushort3(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._ushort3;
+}
+
 INLINE OVERLOADABLE ushort4 as_ushort4(char8 v) {
   union _type_cast_8_b u;
   u._char8 = v;
@@ -807,12 +1125,24 @@ INLINE OVERLOADABLE uint2 as_uint2(uchar8 v) {
   return u._uint2;
 }
 
+INLINE OVERLOADABLE short3 as_short3(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._short3;
+}
+
 INLINE OVERLOADABLE short4 as_short4(uchar8 v) {
   union _type_cast_8_b u;
   u._uchar8 = v;
   return u._short4;
 }
 
+INLINE OVERLOADABLE ushort3 as_ushort3(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._ushort3;
+}
+
 INLINE OVERLOADABLE ushort4 as_ushort4(uchar8 v) {
   union _type_cast_8_b u;
   u._uchar8 = v;
@@ -861,12 +1191,24 @@ INLINE OVERLOADABLE uint2 as_uint2(double v) {
   return u._uint2;
 }
 
+INLINE OVERLOADABLE short3 as_short3(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._short3;
+}
+
 INLINE OVERLOADABLE short4 as_short4(double v) {
   union _type_cast_8_b u;
   u._double = v;
   return u._short4;
 }
 
+INLINE OVERLOADABLE ushort3 as_ushort3(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._ushort3;
+}
+
 INLINE OVERLOADABLE ushort4 as_ushort4(double v) {
   union _type_cast_8_b u;
   u._double = v;
@@ -915,12 +1257,24 @@ INLINE OVERLOADABLE uint2 as_uint2(float2 v) {
   return u._uint2;
 }
 
+INLINE OVERLOADABLE short3 as_short3(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._short3;
+}
+
 INLINE OVERLOADABLE short4 as_short4(float2 v) {
   union _type_cast_8_b u;
   u._float2 = v;
   return u._short4;
 }
 
+INLINE OVERLOADABLE ushort3 as_ushort3(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._ushort3;
+}
+
 INLINE OVERLOADABLE ushort4 as_ushort4(float2 v) {
   union _type_cast_8_b u;
   u._float2 = v;
@@ -945,58 +1299,19 @@ INLINE OVERLOADABLE double as_double(float2 v) {
   return u._double;
 }
 
-union _type_cast_12_b {
-  int3 _int3;
-  uint3 _uint3;
-  float3 _float3;
-};
-
-INLINE OVERLOADABLE uint3 as_uint3(int3 v) {
-  union _type_cast_12_b u;
-  u._int3 = v;
-  return u._uint3;
-}
-
-INLINE OVERLOADABLE float3 as_float3(int3 v) {
-  union _type_cast_12_b u;
-  u._int3 = v;
-  return u._float3;
-}
-
-INLINE OVERLOADABLE int3 as_int3(uint3 v) {
-  union _type_cast_12_b u;
-  u._uint3 = v;
-  return u._int3;
-}
-
-INLINE OVERLOADABLE float3 as_float3(uint3 v) {
-  union _type_cast_12_b u;
-  u._uint3 = v;
-  return u._float3;
-}
-
-INLINE OVERLOADABLE int3 as_int3(float3 v) {
-  union _type_cast_12_b u;
-  u._float3 = v;
-  return u._int3;
-}
-
-INLINE OVERLOADABLE uint3 as_uint3(float3 v) {
-  union _type_cast_12_b u;
-  u._float3 = v;
-  return u._uint3;
-}
-
 union _type_cast_16_b {
   long2 _long2;
   ulong2 _ulong2;
+  int3 _int3;  // 3-component vectors are sized like their 4-component form, so int3 is reinterpreted alongside the 16-byte types
   int4 _int4;
+  uint3 _uint3;  // grouped with the 16-byte types for the same reason as int3
   uint4 _uint4;
   short8 _short8;
   ushort8 _ushort8;
   char16 _char16;
   uchar16 _uchar16;
   double2 _double2;
+  float3 _float3;  // grouped with the 16-byte types for the same reason as int3
   float4 _float4;
 };
 
@@ -1006,12 +1321,24 @@ INLINE OVERLOADABLE ulong2 as_ulong2(long2 v) {
   return u._ulong2;
 }
 
+INLINE OVERLOADABLE int3 as_int3(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._int3;
+}
+
 INLINE OVERLOADABLE int4 as_int4(long2 v) {
   union _type_cast_16_b u;
   u._long2 = v;
   return u._int4;
 }
 
+INLINE OVERLOADABLE uint3 as_uint3(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._uint3;
+}
+
 INLINE OVERLOADABLE uint4 as_uint4(long2 v) {
   union _type_cast_16_b u;
   u._long2 = v;
@@ -1048,6 +1375,12 @@ INLINE OVERLOADABLE double2 as_double2(long2 v) {
   return u._double2;
 }
 
+INLINE OVERLOADABLE float3 as_float3(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._float3;
+}
+
 INLINE OVERLOADABLE float4 as_float4(long2 v) {
   union _type_cast_16_b u;
   u._long2 = v;
@@ -1060,12 +1393,24 @@ INLINE OVERLOADABLE long2 as_long2(ulong2 v) {
   return u._long2;
 }
 
+INLINE OVERLOADABLE int3 as_int3(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._int3;
+}
+
 INLINE OVERLOADABLE int4 as_int4(ulong2 v) {
   union _type_cast_16_b u;
   u._ulong2 = v;
   return u._int4;
 }
 
+INLINE OVERLOADABLE uint3 as_uint3(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._uint3;
+}
+
 INLINE OVERLOADABLE uint4 as_uint4(ulong2 v) {
   union _type_cast_16_b u;
   u._ulong2 = v;
@@ -1074,91 +1419,241 @@ INLINE OVERLOADABLE uint4 as_uint4(ulong2 v) {
 
 INLINE OVERLOADABLE short8 as_short8(ulong2 v) {
   union _type_cast_16_b u;
-  u._ulong2 = v;
+  u._ulong2 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
   return u._short8;
 }
 
-INLINE OVERLOADABLE ushort8 as_ushort8(ulong2 v) {
+INLINE OVERLOADABLE ushort8 as_ushort8(int4 v) {
   union _type_cast_16_b u;
-  u._ulong2 = v;
+  u._int4 = v;
   return u._ushort8;
 }
 
-INLINE OVERLOADABLE char16 as_char16(ulong2 v) {
+INLINE OVERLOADABLE char16 as_char16(int4 v) {
   union _type_cast_16_b u;
-  u._ulong2 = v;
+  u._int4 = v;
   return u._char16;
 }
 
-INLINE OVERLOADABLE uchar16 as_uchar16(ulong2 v) {
+INLINE OVERLOADABLE uchar16 as_uchar16(int4 v) {
   union _type_cast_16_b u;
-  u._ulong2 = v;
+  u._int4 = v;
   return u._uchar16;
 }
 
-INLINE OVERLOADABLE double2 as_double2(ulong2 v) {
+INLINE OVERLOADABLE double2 as_double2(int4 v) {
   union _type_cast_16_b u;
-  u._ulong2 = v;
+  u._int4 = v;
   return u._double2;
 }
 
-INLINE OVERLOADABLE float4 as_float4(ulong2 v) {
+INLINE OVERLOADABLE float3 as_float3(int4 v) {
   union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._float4;
+  u._int4 = v;
+  return u._float3;
 }
 
-INLINE OVERLOADABLE long2 as_long2(int4 v) {
+INLINE OVERLOADABLE float4 as_float4(int4 v) {
   union _type_cast_16_b u;
   u._int4 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(uint3 v) {
+  union _type_cast_16_b u;
+  u._uint3 = v;
   return u._long2;
 }
 
-INLINE OVERLOADABLE ulong2 as_ulong2(int4 v) {
+INLINE OVERLOADABLE ulong2 as_ulong2(uint3 v) {
   union _type_cast_16_b u;
-  u._int4 = v;
+  u._uint3 = v;
   return u._ulong2;
 }
 
-INLINE OVERLOADABLE uint4 as_uint4(int4 v) {
+INLINE OVERLOADABLE int3 as_int3(uint3 v) {
   union _type_cast_16_b u;
-  u._int4 = v;
-  return u._uint4;
+  u._uint3 = v;
+  return u._int3;
 }
 
-INLINE OVERLOADABLE short8 as_short8(int4 v) {
+INLINE OVERLOADABLE int4 as_int4(uint3 v) {
   union _type_cast_16_b u;
-  u._int4 = v;
+  u._uint3 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(uint3 v) {
+  union _type_cast_16_b u;
+  u._uint3 = v;
   return u._short8;
 }
 
-INLINE OVERLOADABLE ushort8 as_ushort8(int4 v) {
+INLINE OVERLOADABLE ushort8 as_ushort8(uint3 v) {
   union _type_cast_16_b u;
-  u._int4 = v;
+  u._uint3 = v;
   return u._ushort8;
 }
 
-INLINE OVERLOADABLE char16 as_char16(int4 v) {
+INLINE OVERLOADABLE char16 as_char16(uint3 v) {
   union _type_cast_16_b u;
-  u._int4 = v;
+  u._uint3 = v;
   return u._char16;
 }
 
-INLINE OVERLOADABLE uchar16 as_uchar16(int4 v) {
+INLINE OVERLOADABLE uchar16 as_uchar16(uint3 v) {
   union _type_cast_16_b u;
-  u._int4 = v;
+  u._uint3 = v;
   return u._uchar16;
 }
 
-INLINE OVERLOADABLE double2 as_double2(int4 v) {
+INLINE OVERLOADABLE double2 as_double2(uint3 v) {
   union _type_cast_16_b u;
-  u._int4 = v;
+  u._uint3 = v;
   return u._double2;
 }
 
-INLINE OVERLOADABLE float4 as_float4(int4 v) {
+INLINE OVERLOADABLE float3 as_float3(uint3 v) {
   union _type_cast_16_b u;
-  u._int4 = v;
+  u._uint3 = v;
+  return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(uint3 v) {
+  union _type_cast_16_b u;
+  u._uint3 = v;
   return u._float4;
 }
 
@@ -1174,6 +1669,12 @@ INLINE OVERLOADABLE ulong2 as_ulong2(uint4 v) {
   return u._ulong2;
 }
 
+INLINE OVERLOADABLE int3 as_int3(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._int3;
+}
+
 INLINE OVERLOADABLE int4 as_int4(uint4 v) {
   union _type_cast_16_b u;
   u._uint4 = v;
@@ -1210,6 +1711,12 @@ INLINE OVERLOADABLE double2 as_double2(uint4 v) {
   return u._double2;
 }
 
+INLINE OVERLOADABLE float3 as_float3(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._float3;
+}
+
 INLINE OVERLOADABLE float4 as_float4(uint4 v) {
   union _type_cast_16_b u;
   u._uint4 = v;
@@ -1228,12 +1735,24 @@ INLINE OVERLOADABLE ulong2 as_ulong2(short8 v) {
   return u._ulong2;
 }
 
+INLINE OVERLOADABLE int3 as_int3(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._int3;
+}
+
 INLINE OVERLOADABLE int4 as_int4(short8 v) {
   union _type_cast_16_b u;
   u._short8 = v;
   return u._int4;
 }
 
+INLINE OVERLOADABLE uint3 as_uint3(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._uint3;
+}
+
 INLINE OVERLOADABLE uint4 as_uint4(short8 v) {
   union _type_cast_16_b u;
   u._short8 = v;
@@ -1264,6 +1783,12 @@ INLINE OVERLOADABLE double2 as_double2(short8 v) {
   return u._double2;
 }
 
+INLINE OVERLOADABLE float3 as_float3(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._float3;
+}
+
 INLINE OVERLOADABLE float4 as_float4(short8 v) {
   union _type_cast_16_b u;
   u._short8 = v;
@@ -1282,12 +1807,24 @@ INLINE OVERLOADABLE ulong2 as_ulong2(ushort8 v) {
   return u._ulong2;
 }
 
+INLINE OVERLOADABLE int3 as_int3(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._int3;
+}
+
 INLINE OVERLOADABLE int4 as_int4(ushort8 v) {
   union _type_cast_16_b u;
   u._ushort8 = v;
   return u._int4;
 }
 
+INLINE OVERLOADABLE uint3 as_uint3(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._uint3;
+}
+
 INLINE OVERLOADABLE uint4 as_uint4(ushort8 v) {
   union _type_cast_16_b u;
   u._ushort8 = v;
@@ -1318,6 +1855,12 @@ INLINE OVERLOADABLE double2 as_double2(ushort8 v) {
   return u._double2;
 }
 
+INLINE OVERLOADABLE float3 as_float3(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._float3;
+}
+
 INLINE OVERLOADABLE float4 as_float4(ushort8 v) {
   union _type_cast_16_b u;
   u._ushort8 = v;
@@ -1336,12 +1879,24 @@ INLINE OVERLOADABLE ulong2 as_ulong2(char16 v) {
   return u._ulong2;
 }
 
+INLINE OVERLOADABLE int3 as_int3(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._int3;
+}
+
 INLINE OVERLOADABLE int4 as_int4(char16 v) {
   union _type_cast_16_b u;
   u._char16 = v;
   return u._int4;
 }
 
+INLINE OVERLOADABLE uint3 as_uint3(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._uint3;
+}
+
 INLINE OVERLOADABLE uint4 as_uint4(char16 v) {
   union _type_cast_16_b u;
   u._char16 = v;
@@ -1372,6 +1927,12 @@ INLINE OVERLOADABLE double2 as_double2(char16 v) {
   return u._double2;
 }
 
+INLINE OVERLOADABLE float3 as_float3(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._float3;
+}
+
 INLINE OVERLOADABLE float4 as_float4(char16 v) {
   union _type_cast_16_b u;
   u._char16 = v;
@@ -1390,12 +1951,24 @@ INLINE OVERLOADABLE ulong2 as_ulong2(uchar16 v) {
   return u._ulong2;
 }
 
+INLINE OVERLOADABLE int3 as_int3(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._int3;
+}
+
 INLINE OVERLOADABLE int4 as_int4(uchar16 v) {
   union _type_cast_16_b u;
   u._uchar16 = v;
   return u._int4;
 }
 
+INLINE OVERLOADABLE uint3 as_uint3(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._uint3;
+}
+
 INLINE OVERLOADABLE uint4 as_uint4(uchar16 v) {
   union _type_cast_16_b u;
   u._uchar16 = v;
@@ -1426,6 +1999,12 @@ INLINE OVERLOADABLE double2 as_double2(uchar16 v) {
   return u._double2;
 }
 
+INLINE OVERLOADABLE float3 as_float3(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._float3;
+}
+
 INLINE OVERLOADABLE float4 as_float4(uchar16 v) {
   union _type_cast_16_b u;
   u._uchar16 = v;
@@ -1444,12 +2023,24 @@ INLINE OVERLOADABLE ulong2 as_ulong2(double2 v) {
   return u._ulong2;
 }
 
+INLINE OVERLOADABLE int3 as_int3(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._int3;
+}
+
 INLINE OVERLOADABLE int4 as_int4(double2 v) {
   union _type_cast_16_b u;
   u._double2 = v;
   return u._int4;
 }
 
+INLINE OVERLOADABLE uint3 as_uint3(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._uint3;
+}
+
 INLINE OVERLOADABLE uint4 as_uint4(double2 v) {
   union _type_cast_16_b u;
   u._double2 = v;
@@ -1480,12 +2071,84 @@ INLINE OVERLOADABLE uchar16 as_uchar16(double2 v) {
   return u._uchar16;
 }
 
+INLINE OVERLOADABLE float3 as_float3(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._float3;
+}
+
 INLINE OVERLOADABLE float4 as_float4(double2 v) {
   union _type_cast_16_b u;
   u._double2 = v;
   return u._float4;
 }
 
+INLINE OVERLOADABLE long2 as_long2(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._double2;
+}
+
 INLINE OVERLOADABLE long2 as_long2(float4 v) {
   union _type_cast_16_b u;
   u._float4 = v;
@@ -1498,12 +2161,24 @@ INLINE OVERLOADABLE ulong2 as_ulong2(float4 v) {
   return u._ulong2;
 }
 
+INLINE OVERLOADABLE int3 as_int3(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._int3;
+}
+
 INLINE OVERLOADABLE int4 as_int4(float4 v) {
   union _type_cast_16_b u;
   u._float4 = v;
   return u._int4;
 }
 
+INLINE OVERLOADABLE uint3 as_uint3(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._uint3;
+}
+
 INLINE OVERLOADABLE uint4 as_uint4(float4 v) {
   union _type_cast_16_b u;
   u._float4 = v;
@@ -1528,71 +2203,92 @@ INLINE OVERLOADABLE char16 as_char16(float4 v) {
   return u._char16;
 }
 
-INLINE OVERLOADABLE uchar16 as_uchar16(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._uchar16;
+INLINE OVERLOADABLE uchar16 as_uchar16(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._double2;
+}
+
+union _type_cast_32_b {  // overlay used by the as_*() reinterpret helpers for these vector types
+  long3 _long3;  // 3-component vectors are sized like their 4-component form, hence grouped with long4
+  long4 _long4;
+  ulong3 _ulong3;  // grouped with ulong4 for the same reason as long3
+  ulong4 _ulong4;
+  int8 _int8;
+  uint8 _uint8;
+  short16 _short16;
+  ushort16 _ushort16;
+  double3 _double3;  // grouped with double4 for the same reason as long3
+  double4 _double4;
+  float8 _float8;
+};
+
+INLINE OVERLOADABLE ulong3 as_ulong3(long3 v) {
+  union _type_cast_32_b u;
+  u._long3 = v;
+  return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(long3 v) {
+  union _type_cast_32_b u;
+  u._long3 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(long3 v) {
+  union _type_cast_32_b u;
+  u._long3 = v;
+  return u._int8;
 }
 
-INLINE OVERLOADABLE double2 as_double2(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._double2;
+INLINE OVERLOADABLE uint8 as_uint8(long3 v) {
+  union _type_cast_32_b u;
+  u._long3 = v;
+  return u._uint8;
 }
 
-union _type_cast_24_b {
-  long3 _long3;
-  ulong3 _ulong3;
-  double3 _double3;
-};
+INLINE OVERLOADABLE short16 as_short16(long3 v) {
+  union _type_cast_32_b u;
+  u._long3 = v;
+  return u._short16;
+}
 
-INLINE OVERLOADABLE ulong3 as_ulong3(long3 v) {
-  union _type_cast_24_b u;
+INLINE OVERLOADABLE ushort16 as_ushort16(long3 v) {
+  union _type_cast_32_b u;
   u._long3 = v;
-  return u._ulong3;
+  return u._ushort16;
 }
 
 INLINE OVERLOADABLE double3 as_double3(long3 v) {
-  union _type_cast_24_b u;
+  union _type_cast_32_b u;
   u._long3 = v;
   return u._double3;
 }
 
-INLINE OVERLOADABLE long3 as_long3(ulong3 v) {
-  union _type_cast_24_b u;
-  u._ulong3 = v;
-  return u._long3;
-}
-
-INLINE OVERLOADABLE double3 as_double3(ulong3 v) {
-  union _type_cast_24_b u;
-  u._ulong3 = v;
-  return u._double3;
+INLINE OVERLOADABLE double4 as_double4(long3 v) {
+  union _type_cast_32_b u;
+  u._long3 = v;
+  return u._double4;
 }
 
-INLINE OVERLOADABLE long3 as_long3(double3 v) {
-  union _type_cast_24_b u;
-  u._double3 = v;
-  return u._long3;
+INLINE OVERLOADABLE float8 as_float8(long3 v) {
+  union _type_cast_32_b u;
+  u._long3 = v;
+  return u._float8;
 }
 
-INLINE OVERLOADABLE ulong3 as_ulong3(double3 v) {
-  union _type_cast_24_b u;
-  u._double3 = v;
+INLINE OVERLOADABLE ulong3 as_ulong3(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
   return u._ulong3;
 }
 
-union _type_cast_32_b {
-  long4 _long4;
-  ulong4 _ulong4;
-  int8 _int8;
-  uint8 _uint8;
-  short16 _short16;
-  ushort16 _ushort16;
-  double4 _double4;
-  float8 _float8;
-};
-
 INLINE OVERLOADABLE ulong4 as_ulong4(long4 v) {
   union _type_cast_32_b u;
   u._long4 = v;
@@ -1623,6 +2319,12 @@ INLINE OVERLOADABLE ushort16 as_ushort16(long4 v) {
   return u._ushort16;
 }
 
+INLINE OVERLOADABLE double3 as_double3(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._double3;
+}
+
 INLINE OVERLOADABLE double4 as_double4(long4 v) {
   union _type_cast_32_b u;
   u._long4 = v;
@@ -1635,6 +2337,66 @@ INLINE OVERLOADABLE float8 as_float8(long4 v) {
   return u._float8;
 }
 
+INLINE OVERLOADABLE long3 as_long3(ulong3 v) {
+  union _type_cast_32_b u;
+  u._ulong3 = v;
+  return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(ulong3 v) {
+  union _type_cast_32_b u;
+  u._ulong3 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(ulong3 v) {
+  union _type_cast_32_b u;
+  u._ulong3 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(ulong3 v) {
+  union _type_cast_32_b u;
+  u._ulong3 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(ulong3 v) {
+  union _type_cast_32_b u;
+  u._ulong3 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(ulong3 v) {
+  union _type_cast_32_b u;
+  u._ulong3 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(ulong3 v) {
+  union _type_cast_32_b u;
+  u._ulong3 = v;
+  return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(ulong3 v) {
+  union _type_cast_32_b u;
+  u._ulong3 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(ulong3 v) {
+  union _type_cast_32_b u;
+  u._ulong3 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._long3;
+}
+
 INLINE OVERLOADABLE long4 as_long4(ulong4 v) {
   union _type_cast_32_b u;
   u._ulong4 = v;
@@ -1665,6 +2427,12 @@ INLINE OVERLOADABLE ushort16 as_ushort16(ulong4 v) {
   return u._ushort16;
 }
 
+INLINE OVERLOADABLE double3 as_double3(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._double3;
+}
+
 INLINE OVERLOADABLE double4 as_double4(ulong4 v) {
   union _type_cast_32_b u;
   u._ulong4 = v;
@@ -1677,12 +2445,24 @@ INLINE OVERLOADABLE float8 as_float8(ulong4 v) {
   return u._float8;
 }
 
+INLINE OVERLOADABLE long3 as_long3(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._long3;
+}
+
 INLINE OVERLOADABLE long4 as_long4(int8 v) {
   union _type_cast_32_b u;
   u._int8 = v;
   return u._long4;
 }
 
+INLINE OVERLOADABLE ulong3 as_ulong3(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._ulong3;
+}
+
 INLINE OVERLOADABLE ulong4 as_ulong4(int8 v) {
   union _type_cast_32_b u;
   u._int8 = v;
@@ -1707,6 +2487,12 @@ INLINE OVERLOADABLE ushort16 as_ushort16(int8 v) {
   return u._ushort16;
 }
 
+INLINE OVERLOADABLE double3 as_double3(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._double3;
+}
+
 INLINE OVERLOADABLE double4 as_double4(int8 v) {
   union _type_cast_32_b u;
   u._int8 = v;
@@ -1719,12 +2505,24 @@ INLINE OVERLOADABLE float8 as_float8(int8 v) {
   return u._float8;
 }
 
+INLINE OVERLOADABLE long3 as_long3(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._long3;
+}
+
 INLINE OVERLOADABLE long4 as_long4(uint8 v) {
   union _type_cast_32_b u;
   u._uint8 = v;
   return u._long4;
 }
 
+INLINE OVERLOADABLE ulong3 as_ulong3(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._ulong3;
+}
+
 INLINE OVERLOADABLE ulong4 as_ulong4(uint8 v) {
   union _type_cast_32_b u;
   u._uint8 = v;
@@ -1749,6 +2547,12 @@ INLINE OVERLOADABLE ushort16 as_ushort16(uint8 v) {
   return u._ushort16;
 }
 
+INLINE OVERLOADABLE double3 as_double3(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._double3;
+}
+
 INLINE OVERLOADABLE double4 as_double4(uint8 v) {
   union _type_cast_32_b u;
   u._uint8 = v;
@@ -1761,12 +2565,24 @@ INLINE OVERLOADABLE float8 as_float8(uint8 v) {
   return u._float8;
 }
 
+INLINE OVERLOADABLE long3 as_long3(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._long3;
+}
+
 INLINE OVERLOADABLE long4 as_long4(short16 v) {
   union _type_cast_32_b u;
   u._short16 = v;
   return u._long4;
 }
 
+INLINE OVERLOADABLE ulong3 as_ulong3(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._ulong3;
+}
+
 INLINE OVERLOADABLE ulong4 as_ulong4(short16 v) {
   union _type_cast_32_b u;
   u._short16 = v;
@@ -1791,6 +2607,12 @@ INLINE OVERLOADABLE ushort16 as_ushort16(short16 v) {
   return u._ushort16;
 }
 
+INLINE OVERLOADABLE double3 as_double3(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._double3;
+}
+
 INLINE OVERLOADABLE double4 as_double4(short16 v) {
   union _type_cast_32_b u;
   u._short16 = v;
@@ -1803,12 +2625,24 @@ INLINE OVERLOADABLE float8 as_float8(short16 v) {
   return u._float8;
 }
 
+INLINE OVERLOADABLE long3 as_long3(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._long3;
+}
+
 INLINE OVERLOADABLE long4 as_long4(ushort16 v) {
   union _type_cast_32_b u;
   u._ushort16 = v;
   return u._long4;
 }
 
+INLINE OVERLOADABLE ulong3 as_ulong3(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._ulong3;
+}
+
 INLINE OVERLOADABLE ulong4 as_ulong4(ushort16 v) {
   union _type_cast_32_b u;
   u._ushort16 = v;
@@ -1833,6 +2667,12 @@ INLINE OVERLOADABLE short16 as_short16(ushort16 v) {
   return u._short16;
 }
 
+INLINE OVERLOADABLE double3 as_double3(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._double3;
+}
+
 INLINE OVERLOADABLE double4 as_double4(ushort16 v) {
   union _type_cast_32_b u;
   u._ushort16 = v;
@@ -1845,12 +2685,78 @@ INLINE OVERLOADABLE float8 as_float8(ushort16 v) {
   return u._float8;
 }
 
+INLINE OVERLOADABLE long3 as_long3(double3 v) {
+  union _type_cast_32_b u;
+  u._double3 = v;
+  return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(double3 v) {
+  union _type_cast_32_b u;
+  u._double3 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(double3 v) {
+  union _type_cast_32_b u;
+  u._double3 = v;
+  return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(double3 v) {
+  union _type_cast_32_b u;
+  u._double3 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(double3 v) {
+  union _type_cast_32_b u;
+  u._double3 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(double3 v) {
+  union _type_cast_32_b u;
+  u._double3 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(double3 v) {
+  union _type_cast_32_b u;
+  u._double3 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(double3 v) {
+  union _type_cast_32_b u;
+  u._double3 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE float8 as_float8(double3 v) {
+  union _type_cast_32_b u;
+  u._double3 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._long3;
+}
+
 INLINE OVERLOADABLE long4 as_long4(double4 v) {
   union _type_cast_32_b u;
   u._double4 = v;
   return u._long4;
 }
 
+INLINE OVERLOADABLE ulong3 as_ulong3(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._ulong3;
+}
+
 INLINE OVERLOADABLE ulong4 as_ulong4(double4 v) {
   union _type_cast_32_b u;
   u._double4 = v;
@@ -1887,12 +2793,24 @@ INLINE OVERLOADABLE float8 as_float8(double4 v) {
   return u._float8;
 }
 
+INLINE OVERLOADABLE long3 as_long3(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._long3;
+}
+
 INLINE OVERLOADABLE long4 as_long4(float8 v) {
   union _type_cast_32_b u;
   u._float8 = v;
   return u._long4;
 }
 
+INLINE OVERLOADABLE ulong3 as_ulong3(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._ulong3;
+}
+
 INLINE OVERLOADABLE ulong4 as_ulong4(float8 v) {
   union _type_cast_32_b u;
   u._float8 = v;
@@ -1923,6 +2841,12 @@ INLINE OVERLOADABLE ushort16 as_ushort16(float8 v) {
   return u._ushort16;
 }
 
+INLINE OVERLOADABLE double3 as_double3(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._double3;
+}
+
 INLINE OVERLOADABLE double4 as_double4(float8 v) {
   union _type_cast_32_b u;
   u._float8 = v;
@@ -2159,3 +3083,4 @@ INLINE OVERLOADABLE ulong16 as_ulong16(double16 v) {
   u._double16 = v;
   return u._ulong16;
 }
+
diff --git a/backend/src/ocl_barrier.ll b/backend/src/ocl_barrier.ll
new file mode 100644
index 0000000..9f46347
--- /dev/null
+++ b/backend/src/ocl_barrier.ll
@@ -0,0 +1,39 @@
+;XXX FIXME: as LLVM IR can't use macros, we hardcode 3, 1 and 2
+;here; we may need a more graceful way to handle this kind
+;of value later.
+;#define CLK_LOCAL_MEM_FENCE  (1 << 0)
+;#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+
+declare i32 @_get_local_mem_fence() nounwind alwaysinline
+declare i32 @_get_global_mem_fence() nounwind alwaysinline
+declare void @__gen_ocl_barrier_local() nounwind alwaysinline
+declare void @__gen_ocl_barrier_global() nounwind alwaysinline
+declare void @__gen_ocl_barrier_local_and_global() nounwind alwaysinline
+
+define void @barrier(i32 %flags) nounwind noduplicate alwaysinline { ; picks a barrier intrinsic by exact comparison of %flags
+  %1 = icmp eq i32 %flags, 3 ; 3 = CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE (1 | 2)
+  br i1 %1, label %barrier_local_global, label %barrier_local_check
+
+barrier_local_global:
+  call void @__gen_ocl_barrier_local_and_global()
+  br label %done
+
+barrier_local_check:
+  %2 = icmp eq i32 %flags, 1 ; 1 = CLK_LOCAL_MEM_FENCE only
+  br i1 %2, label %barrier_local, label %barrier_global_check
+
+barrier_local:
+  call void @__gen_ocl_barrier_local()
+  br label %done
+
+barrier_global_check:
+  %3 = icmp eq i32 %flags, 2 ; 2 = CLK_GLOBAL_MEM_FENCE only
+  br i1 %3, label %barrier_global, label %done ; any other %flags value falls through without calling a barrier
+
+barrier_global:
+  call void @__gen_ocl_barrier_global()
+  br label %done
+
+done:
+  ret void
+}
diff --git a/backend/src/ocl_convert.h b/backend/src/ocl_convert.h
index a667bee..7ec2aec 100644
--- a/backend/src/ocl_convert.h
+++ b/backend/src/ocl_convert.h
@@ -1,5 +1,9 @@
 // This file is autogenerated by gen_convert.sh.
 // Don't modify it manually.
+INLINE OVERLOADABLE long convert_long(long v) {
+  return (long)v;
+}
+
 INLINE OVERLOADABLE ulong convert_ulong(long v) {
   return (ulong)v;
 }
@@ -40,6 +44,10 @@ INLINE OVERLOADABLE long convert_long(ulong v) {
   return (long)v;
 }
 
+INLINE OVERLOADABLE ulong convert_ulong(ulong v) {
+  return (ulong)v;
+}
+
 INLINE OVERLOADABLE int convert_int(ulong v) {
   return (int)v;
 }
@@ -80,6 +88,10 @@ INLINE OVERLOADABLE ulong convert_ulong(int v) {
   return (ulong)v;
 }
 
+INLINE OVERLOADABLE int convert_int(int v) {
+  return (int)v;
+}
+
 INLINE OVERLOADABLE uint convert_uint(int v) {
   return (uint)v;
 }
@@ -120,6 +132,10 @@ INLINE OVERLOADABLE int convert_int(uint v) {
   return (int)v;
 }
 
+INLINE OVERLOADABLE uint convert_uint(uint v) {
+  return (uint)v;
+}
+
 INLINE OVERLOADABLE short convert_short(uint v) {
   return (short)v;
 }
@@ -160,6 +176,10 @@ INLINE OVERLOADABLE uint convert_uint(short v) {
   return (uint)v;
 }
 
+INLINE OVERLOADABLE short convert_short(short v) {
+  return (short)v;
+}
+
 INLINE OVERLOADABLE ushort convert_ushort(short v) {
   return (ushort)v;
 }
@@ -200,6 +220,10 @@ INLINE OVERLOADABLE short convert_short(ushort v) {
   return (short)v;
 }
 
+INLINE OVERLOADABLE ushort convert_ushort(ushort v) {
+  return (ushort)v;
+}
+
 INLINE OVERLOADABLE char convert_char(ushort v) {
   return (char)v;
 }
@@ -240,6 +264,10 @@ INLINE OVERLOADABLE ushort convert_ushort(char v) {
   return (ushort)v;
 }
 
+INLINE OVERLOADABLE char convert_char(char v) {
+  return (char)v;
+}
+
 INLINE OVERLOADABLE uchar convert_uchar(char v) {
   return (uchar)v;
 }
@@ -280,6 +308,10 @@ INLINE OVERLOADABLE char convert_char(uchar v) {
   return (char)v;
 }
 
+INLINE OVERLOADABLE uchar convert_uchar(uchar v) {
+  return (uchar)v;
+}
+
 INLINE OVERLOADABLE double convert_double(uchar v) {
   return (double)v;
 }
@@ -320,6 +352,10 @@ INLINE OVERLOADABLE uchar convert_uchar(double v) {
   return (uchar)v;
 }
 
+INLINE OVERLOADABLE double convert_double(double v) {
+  return (double)v;
+}
+
 INLINE OVERLOADABLE float convert_float(double v) {
   return (float)v;
 }
@@ -360,6 +396,10 @@ INLINE OVERLOADABLE double convert_double(float v) {
   return (double)v;
 }
 
+INLINE OVERLOADABLE float convert_float(float v) {
+  return (float)v;
+}
+
 INLINE OVERLOADABLE long2 convert_long2(long2 v) { return v; }
 INLINE OVERLOADABLE ulong2 convert_ulong2(long2 v) {
   return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
@@ -2270,9 +2310,18 @@ INLINE_OVERLOADABLE long convert_long_sat(ulong x) {
   return x > MAX ? MAX : x;
 }
 
-INLINE_OVERLOADABLE ulong convert_ulong_sat(long x) {
-  return x < 0 ? 0 : x;
-}
+#define DEF(DSTTYPE, SRCTYPE) \
+  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+    return x < 0 ? 0 : x; /* negative inputs saturate to 0; everything else is passed through */ \
+  }
+DEF(ushort, char);  // every pair below is signed source -> unsigned destination that is at least as wide,
+DEF(uint, char);    // so only the lower bound needs clamping
+DEF(uint, short);
+DEF(ulong, char);
+DEF(ulong, short);
+DEF(ulong, int);
+DEF(ulong, long);
+#undef DEF  // DEF is redefined with a different body for the next group of conversions
 
 #define DEF(DSTTYPE, SRCTYPE) \
   INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
@@ -2283,7 +2332,6 @@ DEF(uchar, uchar);
 DEF(short, char);
 DEF(short, uchar);
 DEF(short, short);
-DEF(ushort, char);
 DEF(ushort, uchar);
 DEF(ushort, ushort);
 DEF(int, char);
@@ -2291,9 +2339,7 @@ DEF(int, uchar);
 DEF(int, short);
 DEF(int, ushort);
 DEF(int, int);
-DEF(uint, char);
 DEF(uint, uchar);
-DEF(uint, short);
 DEF(uint, ushort);
 DEF(uint, uint);
 DEF(long, char);
@@ -2303,11 +2349,8 @@ DEF(long, ushort);
 DEF(long, int);
 DEF(long, uint);
 DEF(long, long);
-DEF(ulong, char);
 DEF(ulong, uchar);
-DEF(ulong, short);
 DEF(ulong, ushort);
-DEF(ulong, int);
 DEF(ulong, uint);
 DEF(ulong, ulong);
 #undef DEF
@@ -3752,3 +3795,13621 @@ INLINE OVERLOADABLE uchar16 convert_uchar16_sat(float16 v) {
   return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
 }
 
+
+float __gen_ocl_rndz(float x); // used below by the float -> integer *_rtz conversions
+float __gen_ocl_rnde(float x); // used below by the float -> integer *_rte conversions
+float __gen_ocl_rndu(float x); // used below by the float -> integer *_rtp conversions
+float __gen_ocl_rndd(float x); // used below by the float -> integer *_rtn conversions
+INLINE_OVERLOADABLE float __convert_float_rtz(long x)
+{ // Convert a long to float, rounding toward zero.
+  union {
+    uint u;
+    float f;
+  } u; // lets the float's raw bits be adjusted as an integer
+  u.f = x; // implicit conversion; the result may be farther from zero than x
+  long l = u.f; // convert back so the rounding direction can be tested with integer compares
+  if((l > x && x > 0) || x >= 0x7fffffc000000000 || // middle test: the largest inputs are handled explicitly instead of relying on l
+     (l < x && x < 0)) {
+      u.u -= 1; // one step down in the bit pattern, i.e. one step smaller in magnitude
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(long x)
+{ // Convert a long to float, rounding toward +infinity.
+  union {
+    uint u;
+    float f;
+  } u; // lets the float's raw bits be adjusted as an integer
+  u.f = x; // implicit conversion; the result may be below x
+  long l = u.f;  // compare as integers: u.f < x cannot be used for this test
+  if(l < x && x < 0x7fffffc000000000) { // result landed below x; the largest inputs are excluded from the adjustment
+    if(x > 0)
+      u.u = u.u + 1; // positive: one step larger in magnitude moves up
+    else
+      u.u = u.u - 1; // negative: one step smaller in magnitude moves up
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(long x)
+{ // Convert a long to float, rounding toward -infinity.
+  union {
+    uint u;
+    float f;
+  } u; // lets the float's raw bits be adjusted as an integer
+  u.f = x; // implicit conversion; the result may be above x
+  long l = u.f;  // integer copy of the result; the largest inputs are tested separately to avoid relying on an overflowed l
+  if(l > x || x >= 0x7fffffc000000000) { // result landed above x
+    if(x > 0)
+      u.u = u.u - 1; // positive: one step smaller in magnitude moves down
+    else
+      u.u = u.u + 1; // negative: one step larger in magnitude moves down
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtz(ulong x)
+{ // Convert a ulong to float, rounding toward zero.
+  union {
+    uint u;
+    float f;
+  } u; // lets the float's raw bits be adjusted as an integer
+  u.f = x; // implicit conversion; the result may be above x
+  ulong l = u.f; // convert back so the rounding direction can be tested with an integer compare
+  if(l > x  || x >= 0xffffff8000000000) // second test: the largest inputs are handled explicitly instead of relying on l
+      u.u -= 1; // one step down in the bit pattern
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(ulong x)
+{ // Convert a ulong to float, rounding toward +infinity.
+  union {
+    uint u;
+    float f;
+  } u; // lets the float's raw bits be adjusted as an integer
+  u.f = x; // implicit conversion; the result may be below x
+  ulong l = u.f;  // compare as integers: u.f < x cannot be used for this test
+  if(l < x && x < 0xffffff8000000000) // result landed below x; the largest inputs are excluded from the adjustment
+    u.u = u.u + 1; // one step up in the bit pattern
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(ulong x)
+{ // x is unsigned, so rounding toward -infinity is the same as rounding toward zero
+  return __convert_float_rtz(x);
+}
+INLINE_OVERLOADABLE float __convert_float_rtz(int x)
+{ // Convert an int to float, rounding toward zero.
+  union {
+    uint u;
+    float f;
+  } u; // lets the float's raw bits be adjusted as an integer
+  u.f = x; // implicit conversion; the result may be farther from zero than x
+  long i = u.f; // held in a long, presumably so a result just outside the int range still fits
+  if((i > x && x > 0) ||
+     (i < x && x < 0)) {
+      u.u -= 1; // one step down in the bit pattern, i.e. one step smaller in magnitude
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(int x)
+{ // Convert an int to float, rounding toward +infinity.
+  union {
+    uint u;
+    float f;
+  } u; // lets the float's raw bits be adjusted as an integer
+  u.f = x; // implicit conversion; the result may be below x
+  long i = u.f; // long, not int: u.f can be 2^31 for the largest inputs, which is out of range for int
+  if(i < x) {
+    if(x > 0)
+      u.u += 1; // positive: one step larger in magnitude moves up
+    else
+      u.u -= 1; // negative: one step smaller in magnitude moves up
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(int x)
+{ // Convert an int to float, rounding toward -infinity.
+  union {
+    uint u;
+    float f;
+  } u; // lets the float's raw bits be adjusted as an integer
+  u.f = x; // implicit conversion; the result may be above x
+  long i = u.f;  // held in a long to avoid overflow when the result is just outside the int range
+  if(i > x) { // result landed above x
+    if(x > 0)
+      u.u = u.u - 1; // positive: one step smaller in magnitude moves down
+    else
+      u.u = u.u + 1; // negative: one step larger in magnitude moves down
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtz(uint x)
+{ // Convert a uint to float, rounding toward zero.
+  union {
+    uint u;
+    float f;
+  } u; // lets the float's raw bits be adjusted as an integer
+  u.f = x; // implicit conversion; the result may be above x
+  ulong i = u.f; // held in a ulong, presumably so a result just outside the uint range still fits
+  if(i > x)
+    u.u -= 1; // one step down in the bit pattern
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(uint x)
+{ // Convert a uint to float, rounding toward +infinity.
+  union {
+    uint u;
+    float f;
+  } u; // lets the float's raw bits be adjusted as an integer
+  u.f = x; // implicit conversion; the result may be below x
+  ulong i = u.f; // ulong, not uint: u.f can be 2^32 for the largest inputs, which is out of range for uint
+  if(i < x)
+    u.u += 1; // one step up in the bit pattern
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(uint x)
+{ // x is unsigned, so rounding toward -infinity is the same as rounding toward zero
+  return __convert_float_rtz(x);
+}
+
+INLINE_OVERLOADABLE long convert_long_rte(long x)
+{ return x; } // long source, integer destination: all four rounding suffixes are the plain implicit conversion
+INLINE_OVERLOADABLE long convert_long_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(long x)
+{ return x; } // NOTE(review): relies on the implicit long -> float conversion rounding to nearest even — confirm
+INLINE_OVERLOADABLE float convert_float_rtz(long x)
+{ return __convert_float_rtz(x); } // directed rounding is delegated to the helper overload for long
+INLINE_OVERLOADABLE float convert_float_rtp(long x)
+{ return __convert_float_rtp(x); }
+INLINE_OVERLOADABLE float convert_float_rtn(long x)
+{ return __convert_float_rtn(x); }
+INLINE_OVERLOADABLE long convert_long_rte(ulong x)
+{ return x; } // ulong source, integer destination: all four rounding suffixes are the plain implicit conversion
+INLINE_OVERLOADABLE long convert_long_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(ulong x)
+{ return x; } // NOTE(review): relies on the implicit ulong -> float conversion rounding to nearest even — confirm
+INLINE_OVERLOADABLE float convert_float_rtz(ulong x)
+{ return __convert_float_rtz(x); } // directed rounding is delegated to the helper overload for ulong
+INLINE_OVERLOADABLE float convert_float_rtp(ulong x)
+{ return __convert_float_rtp(x); }
+INLINE_OVERLOADABLE float convert_float_rtn(ulong x)
+{ return __convert_float_rtn(x); }
+INLINE_OVERLOADABLE long convert_long_rte(int x)
+{ return x; } // int source, integer destination: all four rounding suffixes are the plain implicit conversion
+INLINE_OVERLOADABLE long convert_long_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(int x)
+{ return x; } // NOTE(review): relies on the implicit int -> float conversion rounding to nearest even — confirm
+INLINE_OVERLOADABLE float convert_float_rtz(int x)
+{ return __convert_float_rtz(x); } // directed rounding is delegated to the helper overload for int
+INLINE_OVERLOADABLE float convert_float_rtp(int x)
+{ return __convert_float_rtp(x); }
+INLINE_OVERLOADABLE float convert_float_rtn(int x)
+{ return __convert_float_rtn(x); }
+INLINE_OVERLOADABLE long convert_long_rte(uint x)
+{ return x; } // uint source, integer destination: all four rounding suffixes are the plain implicit conversion
+INLINE_OVERLOADABLE long convert_long_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(uint x)
+{ return x; } // NOTE(review): relies on the implicit uint -> float conversion rounding to nearest even — confirm
+INLINE_OVERLOADABLE float convert_float_rtz(uint x)
+{ return __convert_float_rtz(x); } // directed rounding is delegated to the helper overload for uint
+INLINE_OVERLOADABLE float convert_float_rtp(uint x)
+{ return __convert_float_rtp(x); }
+INLINE_OVERLOADABLE float convert_float_rtn(uint x)
+{ return __convert_float_rtn(x); }
+INLINE_OVERLOADABLE long convert_long_rte(short x)
+{ return x; } // short source, integer destination: all four rounding suffixes are the plain implicit conversion
+INLINE_OVERLOADABLE long convert_long_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(short x)
+{ return x; } // no helper for any mode here: presumably because every short is exactly representable as a float
+INLINE_OVERLOADABLE float convert_float_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rte(ushort x)
+{ return x; } // ushort source, integer destination: all four rounding suffixes are the plain implicit conversion
+INLINE_OVERLOADABLE long convert_long_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(ushort x)
+{ return x; } // no helper for any mode here: presumably because every ushort is exactly representable as a float
+INLINE_OVERLOADABLE float convert_float_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rte(char x)
+{ return x; } // char source, integer destination: all four rounding suffixes are the plain implicit conversion
+INLINE_OVERLOADABLE long convert_long_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(char x)
+{ return x; } // no helper for any mode here: presumably because every char is exactly representable as a float
+INLINE_OVERLOADABLE float convert_float_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rte(uchar x)
+{ return x; } // uchar source, integer destination: all four rounding suffixes are the plain implicit conversion
+INLINE_OVERLOADABLE long convert_long_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(uchar x)
+{ return x; } // no helper for any mode here: presumably because every uchar is exactly representable as a float
+INLINE_OVERLOADABLE float convert_float_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rte(float x)
+{ return __gen_ocl_rnde(x); } // float source: round in float first (rnde, presumably nearest-even), then convert implicitly to the destination
+INLINE_OVERLOADABLE long convert_long_rtz(float x)
+{ return __gen_ocl_rndz(x); } // rndz is used for every *_rtz variant in this run
+INLINE_OVERLOADABLE long convert_long_rtp(float x)
+{ return __gen_ocl_rndu(x); } // rndu is used for every *_rtp variant in this run
+INLINE_OVERLOADABLE long convert_long_rtn(float x)
+{ return __gen_ocl_rndd(x); } // rndd is used for every *_rtn variant in this run
+INLINE_OVERLOADABLE ulong convert_ulong_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE int convert_int_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE int convert_int_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE int convert_int_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE int convert_int_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE uint convert_uint_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE uint convert_uint_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE uint convert_uint_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE uint convert_uint_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE short convert_short_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE short convert_short_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE short convert_short_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE short convert_short_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE char convert_char_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE char convert_char_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE char convert_char_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE char convert_char_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE float convert_float_rte(float x)
+{ return x; } // float -> float: the value is returned unchanged in every mode
+INLINE_OVERLOADABLE float convert_float_rtz(float x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtp(float x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtn(float x)
+{ return x; }
+INLINE OVERLOADABLE long2 convert_long2_rte(long2 v) {
+  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1)); // long2 source: each lane goes through the scalar overload of the same name and mode
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(long2 v) {
+  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(long2 v) {
+  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(long2 v) {
+  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(long2 v) {
+  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(long2 v) {
+  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(long2 v) {
+  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(long2 v) {
+  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(long2 v) {
+  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(long2 v) {
+  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(long2 v) {
+  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(long2 v) {
+  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(long2 v) {
+  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(long2 v) {
+  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(long2 v) {
+  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(long2 v) {
+  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(long2 v) {
+  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(long2 v) {
+  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(long2 v) {
+  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(long2 v) {
+  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(long2 v) {
+  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(long2 v) {
+  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(long2 v) {
+  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(long2 v) {
+  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(long2 v) {
+  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(long2 v) {
+  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(long2 v) {
+  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(long2 v) {
+  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(long2 v) {
+  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(long2 v) {
+  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(long2 v) {
+  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(long2 v) {
+  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(long2 v) {
+  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(long2 v) {
+  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(long2 v) {
+  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(long2 v) {
+  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rte(ulong2 v) {
+  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1)); // ulong2 source: each lane goes through the scalar overload of the same name and mode
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(ulong2 v) {
+  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(ulong2 v) {
+  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(ulong2 v) {
+  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(ulong2 v) {
+  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(ulong2 v) {
+  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(ulong2 v) {
+  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(ulong2 v) {
+  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(ulong2 v) {
+  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(ulong2 v) {
+  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(ulong2 v) {
+  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(ulong2 v) {
+  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(ulong2 v) {
+  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(ulong2 v) {
+  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(ulong2 v) {
+  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(ulong2 v) {
+  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(ulong2 v) {
+  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(ulong2 v) {
+  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(ulong2 v) {
+  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(ulong2 v) {
+  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(ulong2 v) {
+  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(ulong2 v) {
+  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(ulong2 v) {
+  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(ulong2 v) {
+  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(ulong2 v) {
+  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(ulong2 v) {
+  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(ulong2 v) {
+  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(ulong2 v) {
+  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(ulong2 v) {
+  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(ulong2 v) {
+  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(ulong2 v) {
+  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(ulong2 v) {
+  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(ulong2 v) {
+  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(ulong2 v) {
+  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(ulong2 v) {
+  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(ulong2 v) {
+  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
+/*
+ * Two-lane conversions from int2 with an explicit rounding-mode suffix.
+ * Every overload calls the scalar convert_<dst>_<mode>() on lanes s0 and s1
+ * and packs the two results into a <dst>2 vector. The suffix is forwarded
+ * unchanged; the wrappers perform no rounding of their own.
+ */
+#define CONVERT2_FROM_INT2(DST, MODE) \
+  INLINE OVERLOADABLE DST##2 convert_##DST##2_##MODE(int2 v) { \
+    return (DST##2)(convert_##DST##_##MODE(v.s0), convert_##DST##_##MODE(v.s1)); \
+  }
+
+/* One destination type expands to its rte, rtz, rtp and rtn overloads. */
+#define CONVERT2_FROM_INT2_ALL_MODES(DST) \
+  CONVERT2_FROM_INT2(DST, rte) \
+  CONVERT2_FROM_INT2(DST, rtz) \
+  CONVERT2_FROM_INT2(DST, rtp) \
+  CONVERT2_FROM_INT2(DST, rtn)
+
+CONVERT2_FROM_INT2_ALL_MODES(long)
+CONVERT2_FROM_INT2_ALL_MODES(ulong)
+CONVERT2_FROM_INT2_ALL_MODES(int)
+CONVERT2_FROM_INT2_ALL_MODES(uint)
+CONVERT2_FROM_INT2_ALL_MODES(short)
+CONVERT2_FROM_INT2_ALL_MODES(ushort)
+CONVERT2_FROM_INT2_ALL_MODES(char)
+CONVERT2_FROM_INT2_ALL_MODES(uchar)
+CONVERT2_FROM_INT2_ALL_MODES(float)
+
+/* Keep the generator macros local to this section. */
+#undef CONVERT2_FROM_INT2_ALL_MODES
+#undef CONVERT2_FROM_INT2
+
+/*
+ * Two-lane conversions from uint2 with an explicit rounding-mode suffix.
+ * Every overload calls the scalar convert_<dst>_<mode>() on lanes s0 and s1
+ * and packs the two results into a <dst>2 vector. The suffix is forwarded
+ * unchanged; the wrappers perform no rounding of their own.
+ */
+#define CONVERT2_FROM_UINT2(DST, MODE) \
+  INLINE OVERLOADABLE DST##2 convert_##DST##2_##MODE(uint2 v) { \
+    return (DST##2)(convert_##DST##_##MODE(v.s0), convert_##DST##_##MODE(v.s1)); \
+  }
+
+/* One destination type expands to its rte, rtz, rtp and rtn overloads. */
+#define CONVERT2_FROM_UINT2_ALL_MODES(DST) \
+  CONVERT2_FROM_UINT2(DST, rte) \
+  CONVERT2_FROM_UINT2(DST, rtz) \
+  CONVERT2_FROM_UINT2(DST, rtp) \
+  CONVERT2_FROM_UINT2(DST, rtn)
+
+CONVERT2_FROM_UINT2_ALL_MODES(long)
+CONVERT2_FROM_UINT2_ALL_MODES(ulong)
+CONVERT2_FROM_UINT2_ALL_MODES(int)
+CONVERT2_FROM_UINT2_ALL_MODES(uint)
+CONVERT2_FROM_UINT2_ALL_MODES(short)
+CONVERT2_FROM_UINT2_ALL_MODES(ushort)
+CONVERT2_FROM_UINT2_ALL_MODES(char)
+CONVERT2_FROM_UINT2_ALL_MODES(uchar)
+CONVERT2_FROM_UINT2_ALL_MODES(float)
+
+/* Keep the generator macros local to this section. */
+#undef CONVERT2_FROM_UINT2_ALL_MODES
+#undef CONVERT2_FROM_UINT2
+
+/*
+ * Two-lane conversions from short2 with an explicit rounding-mode suffix.
+ * Every overload calls the scalar convert_<dst>_<mode>() on lanes s0 and s1
+ * and packs the two results into a <dst>2 vector. The suffix is forwarded
+ * unchanged; the wrappers perform no rounding of their own.
+ */
+#define CONVERT2_FROM_SHORT2(DST, MODE) \
+  INLINE OVERLOADABLE DST##2 convert_##DST##2_##MODE(short2 v) { \
+    return (DST##2)(convert_##DST##_##MODE(v.s0), convert_##DST##_##MODE(v.s1)); \
+  }
+
+/* One destination type expands to its rte, rtz, rtp and rtn overloads. */
+#define CONVERT2_FROM_SHORT2_ALL_MODES(DST) \
+  CONVERT2_FROM_SHORT2(DST, rte) \
+  CONVERT2_FROM_SHORT2(DST, rtz) \
+  CONVERT2_FROM_SHORT2(DST, rtp) \
+  CONVERT2_FROM_SHORT2(DST, rtn)
+
+CONVERT2_FROM_SHORT2_ALL_MODES(long)
+CONVERT2_FROM_SHORT2_ALL_MODES(ulong)
+CONVERT2_FROM_SHORT2_ALL_MODES(int)
+CONVERT2_FROM_SHORT2_ALL_MODES(uint)
+CONVERT2_FROM_SHORT2_ALL_MODES(short)
+CONVERT2_FROM_SHORT2_ALL_MODES(ushort)
+CONVERT2_FROM_SHORT2_ALL_MODES(char)
+CONVERT2_FROM_SHORT2_ALL_MODES(uchar)
+CONVERT2_FROM_SHORT2_ALL_MODES(float)
+
+/* Keep the generator macros local to this section. */
+#undef CONVERT2_FROM_SHORT2_ALL_MODES
+#undef CONVERT2_FROM_SHORT2
+
+/*
+ * Two-lane conversions from ushort2 with an explicit rounding-mode suffix.
+ * Every overload calls the scalar convert_<dst>_<mode>() on lanes s0 and s1
+ * and packs the two results into a <dst>2 vector. The suffix is forwarded
+ * unchanged; the wrappers perform no rounding of their own.
+ */
+#define CONVERT2_FROM_USHORT2(DST, MODE) \
+  INLINE OVERLOADABLE DST##2 convert_##DST##2_##MODE(ushort2 v) { \
+    return (DST##2)(convert_##DST##_##MODE(v.s0), convert_##DST##_##MODE(v.s1)); \
+  }
+
+/* One destination type expands to its rte, rtz, rtp and rtn overloads. */
+#define CONVERT2_FROM_USHORT2_ALL_MODES(DST) \
+  CONVERT2_FROM_USHORT2(DST, rte) \
+  CONVERT2_FROM_USHORT2(DST, rtz) \
+  CONVERT2_FROM_USHORT2(DST, rtp) \
+  CONVERT2_FROM_USHORT2(DST, rtn)
+
+CONVERT2_FROM_USHORT2_ALL_MODES(long)
+CONVERT2_FROM_USHORT2_ALL_MODES(ulong)
+CONVERT2_FROM_USHORT2_ALL_MODES(int)
+CONVERT2_FROM_USHORT2_ALL_MODES(uint)
+CONVERT2_FROM_USHORT2_ALL_MODES(short)
+CONVERT2_FROM_USHORT2_ALL_MODES(ushort)
+CONVERT2_FROM_USHORT2_ALL_MODES(char)
+CONVERT2_FROM_USHORT2_ALL_MODES(uchar)
+CONVERT2_FROM_USHORT2_ALL_MODES(float)
+
+/* Keep the generator macros local to this section. */
+#undef CONVERT2_FROM_USHORT2_ALL_MODES
+#undef CONVERT2_FROM_USHORT2
+
+/*
+ * Two-lane conversions from char2 with an explicit rounding-mode suffix.
+ * Every overload calls the scalar convert_<dst>_<mode>() on lanes s0 and s1
+ * and packs the two results into a <dst>2 vector. The suffix is forwarded
+ * unchanged; the wrappers perform no rounding of their own.
+ */
+#define CONVERT2_FROM_CHAR2(DST, MODE) \
+  INLINE OVERLOADABLE DST##2 convert_##DST##2_##MODE(char2 v) { \
+    return (DST##2)(convert_##DST##_##MODE(v.s0), convert_##DST##_##MODE(v.s1)); \
+  }
+
+/* One destination type expands to its rte, rtz, rtp and rtn overloads. */
+#define CONVERT2_FROM_CHAR2_ALL_MODES(DST) \
+  CONVERT2_FROM_CHAR2(DST, rte) \
+  CONVERT2_FROM_CHAR2(DST, rtz) \
+  CONVERT2_FROM_CHAR2(DST, rtp) \
+  CONVERT2_FROM_CHAR2(DST, rtn)
+
+CONVERT2_FROM_CHAR2_ALL_MODES(long)
+CONVERT2_FROM_CHAR2_ALL_MODES(ulong)
+CONVERT2_FROM_CHAR2_ALL_MODES(int)
+CONVERT2_FROM_CHAR2_ALL_MODES(uint)
+CONVERT2_FROM_CHAR2_ALL_MODES(short)
+CONVERT2_FROM_CHAR2_ALL_MODES(ushort)
+CONVERT2_FROM_CHAR2_ALL_MODES(char)
+CONVERT2_FROM_CHAR2_ALL_MODES(uchar)
+CONVERT2_FROM_CHAR2_ALL_MODES(float)
+
+/* Keep the generator macros local to this section. */
+#undef CONVERT2_FROM_CHAR2_ALL_MODES
+#undef CONVERT2_FROM_CHAR2
+
+/*
+ * Two-lane conversions from uchar2 with an explicit rounding-mode suffix.
+ * Every overload calls the scalar convert_<dst>_<mode>() on lanes s0 and s1
+ * and packs the two results into a <dst>2 vector. The suffix is forwarded
+ * unchanged; the wrappers perform no rounding of their own.
+ */
+#define CONVERT2_FROM_UCHAR2(DST, MODE) \
+  INLINE OVERLOADABLE DST##2 convert_##DST##2_##MODE(uchar2 v) { \
+    return (DST##2)(convert_##DST##_##MODE(v.s0), convert_##DST##_##MODE(v.s1)); \
+  }
+
+/* One destination type expands to its rte, rtz, rtp and rtn overloads. */
+#define CONVERT2_FROM_UCHAR2_ALL_MODES(DST) \
+  CONVERT2_FROM_UCHAR2(DST, rte) \
+  CONVERT2_FROM_UCHAR2(DST, rtz) \
+  CONVERT2_FROM_UCHAR2(DST, rtp) \
+  CONVERT2_FROM_UCHAR2(DST, rtn)
+
+CONVERT2_FROM_UCHAR2_ALL_MODES(long)
+CONVERT2_FROM_UCHAR2_ALL_MODES(ulong)
+CONVERT2_FROM_UCHAR2_ALL_MODES(int)
+CONVERT2_FROM_UCHAR2_ALL_MODES(uint)
+CONVERT2_FROM_UCHAR2_ALL_MODES(short)
+CONVERT2_FROM_UCHAR2_ALL_MODES(ushort)
+CONVERT2_FROM_UCHAR2_ALL_MODES(char)
+CONVERT2_FROM_UCHAR2_ALL_MODES(uchar)
+CONVERT2_FROM_UCHAR2_ALL_MODES(float)
+
+/* Keep the generator macros local to this section. */
+#undef CONVERT2_FROM_UCHAR2_ALL_MODES
+#undef CONVERT2_FROM_UCHAR2
+
+INLINE OVERLOADABLE long2 convert_long2_rte(float2 v) { /* float2 -> long2, per component */
+  return (long2)(convert_long_rte(v.x), convert_long_rte(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(float2 v) { /* float2 -> long2, per component */
+  return (long2)(convert_long_rtz(v.x), convert_long_rtz(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(float2 v) { /* float2 -> long2, per component */
+  return (long2)(convert_long_rtp(v.x), convert_long_rtp(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(float2 v) { /* float2 -> long2, per component */
+  return (long2)(convert_long_rtn(v.x), convert_long_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(float2 v) { /* float2 -> ulong2, per component */
+  return (ulong2)(convert_ulong_rte(v.x), convert_ulong_rte(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(float2 v) { /* float2 -> ulong2, per component */
+  return (ulong2)(convert_ulong_rtz(v.x), convert_ulong_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(float2 v) { /* float2 -> ulong2, per component */
+  return (ulong2)(convert_ulong_rtp(v.x), convert_ulong_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(float2 v) { /* float2 -> ulong2, per component */
+  return (ulong2)(convert_ulong_rtn(v.x), convert_ulong_rtn(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(float2 v) { /* float2 -> int2, per component */
+  return (int2)(convert_int_rte(v.x), convert_int_rte(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(float2 v) { /* float2 -> int2, per component */
+  return (int2)(convert_int_rtz(v.x), convert_int_rtz(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(float2 v) { /* float2 -> int2, per component */
+  return (int2)(convert_int_rtp(v.x), convert_int_rtp(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(float2 v) { /* float2 -> int2, per component */
+  return (int2)(convert_int_rtn(v.x), convert_int_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(float2 v) { /* float2 -> uint2, per component */
+  return (uint2)(convert_uint_rte(v.x), convert_uint_rte(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(float2 v) { /* float2 -> uint2, per component */
+  return (uint2)(convert_uint_rtz(v.x), convert_uint_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(float2 v) { /* float2 -> uint2, per component */
+  return (uint2)(convert_uint_rtp(v.x), convert_uint_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(float2 v) { /* float2 -> uint2, per component */
+  return (uint2)(convert_uint_rtn(v.x), convert_uint_rtn(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(float2 v) { /* float2 -> short2, per component */
+  return (short2)(convert_short_rte(v.x), convert_short_rte(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(float2 v) { /* float2 -> short2, per component */
+  return (short2)(convert_short_rtz(v.x), convert_short_rtz(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(float2 v) { /* float2 -> short2, per component */
+  return (short2)(convert_short_rtp(v.x), convert_short_rtp(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(float2 v) { /* float2 -> short2, per component */
+  return (short2)(convert_short_rtn(v.x), convert_short_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(float2 v) { /* float2 -> ushort2, per component */
+  return (ushort2)(convert_ushort_rte(v.x), convert_ushort_rte(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(float2 v) { /* float2 -> ushort2, per component */
+  return (ushort2)(convert_ushort_rtz(v.x), convert_ushort_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(float2 v) { /* float2 -> ushort2, per component */
+  return (ushort2)(convert_ushort_rtp(v.x), convert_ushort_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(float2 v) { /* float2 -> ushort2, per component */
+  return (ushort2)(convert_ushort_rtn(v.x), convert_ushort_rtn(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(float2 v) { /* float2 -> char2, per component */
+  return (char2)(convert_char_rte(v.x), convert_char_rte(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(float2 v) { /* float2 -> char2, per component */
+  return (char2)(convert_char_rtz(v.x), convert_char_rtz(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(float2 v) { /* float2 -> char2, per component */
+  return (char2)(convert_char_rtp(v.x), convert_char_rtp(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(float2 v) { /* float2 -> char2, per component */
+  return (char2)(convert_char_rtn(v.x), convert_char_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(float2 v) { /* float2 -> uchar2, per component */
+  return (uchar2)(convert_uchar_rte(v.x), convert_uchar_rte(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(float2 v) { /* float2 -> uchar2, per component */
+  return (uchar2)(convert_uchar_rtz(v.x), convert_uchar_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(float2 v) { /* float2 -> uchar2, per component */
+  return (uchar2)(convert_uchar_rtp(v.x), convert_uchar_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(float2 v) { /* float2 -> uchar2, per component */
+  return (uchar2)(convert_uchar_rtn(v.x), convert_uchar_rtn(v.y));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(float2 v) { /* float2 -> float2, per component */
+  return (float2)(convert_float_rte(v.x), convert_float_rte(v.y));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(float2 v) { /* float2 -> float2, per component */
+  return (float2)(convert_float_rtz(v.x), convert_float_rtz(v.y));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(float2 v) { /* float2 -> float2, per component */
+  return (float2)(convert_float_rtp(v.x), convert_float_rtp(v.y));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(float2 v) { /* float2 -> float2, per component */
+  return (float2)(convert_float_rtn(v.x), convert_float_rtn(v.y));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(long3 v) { /* long3 -> long3, per component */
+  return (long3)(convert_long_rte(v.x), convert_long_rte(v.y), convert_long_rte(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(long3 v) { /* long3 -> long3, per component */
+  return (long3)(convert_long_rtz(v.x), convert_long_rtz(v.y), convert_long_rtz(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(long3 v) { /* long3 -> long3, per component */
+  return (long3)(convert_long_rtp(v.x), convert_long_rtp(v.y), convert_long_rtp(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(long3 v) { /* long3 -> long3, per component */
+  return (long3)(convert_long_rtn(v.x), convert_long_rtn(v.y), convert_long_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(long3 v) { /* long3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rte(v.x), convert_ulong_rte(v.y), convert_ulong_rte(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(long3 v) { /* long3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rtz(v.x), convert_ulong_rtz(v.y), convert_ulong_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(long3 v) { /* long3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rtp(v.x), convert_ulong_rtp(v.y), convert_ulong_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(long3 v) { /* long3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rtn(v.x), convert_ulong_rtn(v.y), convert_ulong_rtn(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(long3 v) { /* long3 -> int3, per component */
+  return (int3)(convert_int_rte(v.x), convert_int_rte(v.y), convert_int_rte(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(long3 v) { /* long3 -> int3, per component */
+  return (int3)(convert_int_rtz(v.x), convert_int_rtz(v.y), convert_int_rtz(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(long3 v) { /* long3 -> int3, per component */
+  return (int3)(convert_int_rtp(v.x), convert_int_rtp(v.y), convert_int_rtp(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(long3 v) { /* long3 -> int3, per component */
+  return (int3)(convert_int_rtn(v.x), convert_int_rtn(v.y), convert_int_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(long3 v) { /* long3 -> uint3, per component */
+  return (uint3)(convert_uint_rte(v.x), convert_uint_rte(v.y), convert_uint_rte(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(long3 v) { /* long3 -> uint3, per component */
+  return (uint3)(convert_uint_rtz(v.x), convert_uint_rtz(v.y), convert_uint_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(long3 v) { /* long3 -> uint3, per component */
+  return (uint3)(convert_uint_rtp(v.x), convert_uint_rtp(v.y), convert_uint_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(long3 v) { /* long3 -> uint3, per component */
+  return (uint3)(convert_uint_rtn(v.x), convert_uint_rtn(v.y), convert_uint_rtn(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(long3 v) { /* long3 -> short3, per component */
+  return (short3)(convert_short_rte(v.x), convert_short_rte(v.y), convert_short_rte(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(long3 v) { /* long3 -> short3, per component */
+  return (short3)(convert_short_rtz(v.x), convert_short_rtz(v.y), convert_short_rtz(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(long3 v) { /* long3 -> short3, per component */
+  return (short3)(convert_short_rtp(v.x), convert_short_rtp(v.y), convert_short_rtp(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(long3 v) { /* long3 -> short3, per component */
+  return (short3)(convert_short_rtn(v.x), convert_short_rtn(v.y), convert_short_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(long3 v) { /* long3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rte(v.x), convert_ushort_rte(v.y), convert_ushort_rte(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(long3 v) { /* long3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rtz(v.x), convert_ushort_rtz(v.y), convert_ushort_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(long3 v) { /* long3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rtp(v.x), convert_ushort_rtp(v.y), convert_ushort_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(long3 v) { /* long3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rtn(v.x), convert_ushort_rtn(v.y), convert_ushort_rtn(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(long3 v) { /* long3 -> char3, per component */
+  return (char3)(convert_char_rte(v.x), convert_char_rte(v.y), convert_char_rte(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(long3 v) { /* long3 -> char3, per component */
+  return (char3)(convert_char_rtz(v.x), convert_char_rtz(v.y), convert_char_rtz(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(long3 v) { /* long3 -> char3, per component */
+  return (char3)(convert_char_rtp(v.x), convert_char_rtp(v.y), convert_char_rtp(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(long3 v) { /* long3 -> char3, per component */
+  return (char3)(convert_char_rtn(v.x), convert_char_rtn(v.y), convert_char_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(long3 v) { /* long3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rte(v.x), convert_uchar_rte(v.y), convert_uchar_rte(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(long3 v) { /* long3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rtz(v.x), convert_uchar_rtz(v.y), convert_uchar_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(long3 v) { /* long3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rtp(v.x), convert_uchar_rtp(v.y), convert_uchar_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(long3 v) { /* long3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rtn(v.x), convert_uchar_rtn(v.y), convert_uchar_rtn(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(long3 v) { /* long3 -> float3, per component */
+  return (float3)(convert_float_rte(v.x), convert_float_rte(v.y), convert_float_rte(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(long3 v) { /* long3 -> float3, per component */
+  return (float3)(convert_float_rtz(v.x), convert_float_rtz(v.y), convert_float_rtz(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(long3 v) { /* long3 -> float3, per component */
+  return (float3)(convert_float_rtp(v.x), convert_float_rtp(v.y), convert_float_rtp(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(long3 v) { /* long3 -> float3, per component */
+  return (float3)(convert_float_rtn(v.x), convert_float_rtn(v.y), convert_float_rtn(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(ulong3 v) { /* ulong3 -> long3, per component */
+  return (long3)(convert_long_rte(v.x), convert_long_rte(v.y), convert_long_rte(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(ulong3 v) { /* ulong3 -> long3, per component */
+  return (long3)(convert_long_rtz(v.x), convert_long_rtz(v.y), convert_long_rtz(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(ulong3 v) { /* ulong3 -> long3, per component */
+  return (long3)(convert_long_rtp(v.x), convert_long_rtp(v.y), convert_long_rtp(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(ulong3 v) { /* ulong3 -> long3, per component */
+  return (long3)(convert_long_rtn(v.x), convert_long_rtn(v.y), convert_long_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(ulong3 v) { /* ulong3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rte(v.x), convert_ulong_rte(v.y), convert_ulong_rte(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(ulong3 v) { /* ulong3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rtz(v.x), convert_ulong_rtz(v.y), convert_ulong_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(ulong3 v) { /* ulong3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rtp(v.x), convert_ulong_rtp(v.y), convert_ulong_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(ulong3 v) { /* ulong3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rtn(v.x), convert_ulong_rtn(v.y), convert_ulong_rtn(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(ulong3 v) { /* ulong3 -> int3, per component */
+  return (int3)(convert_int_rte(v.x), convert_int_rte(v.y), convert_int_rte(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(ulong3 v) { /* ulong3 -> int3, per component */
+  return (int3)(convert_int_rtz(v.x), convert_int_rtz(v.y), convert_int_rtz(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(ulong3 v) { /* ulong3 -> int3, per component */
+  return (int3)(convert_int_rtp(v.x), convert_int_rtp(v.y), convert_int_rtp(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(ulong3 v) { /* ulong3 -> int3, per component */
+  return (int3)(convert_int_rtn(v.x), convert_int_rtn(v.y), convert_int_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(ulong3 v) { /* ulong3 -> uint3, per component */
+  return (uint3)(convert_uint_rte(v.x), convert_uint_rte(v.y), convert_uint_rte(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(ulong3 v) { /* ulong3 -> uint3, per component */
+  return (uint3)(convert_uint_rtz(v.x), convert_uint_rtz(v.y), convert_uint_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(ulong3 v) { /* ulong3 -> uint3, per component */
+  return (uint3)(convert_uint_rtp(v.x), convert_uint_rtp(v.y), convert_uint_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(ulong3 v) { /* ulong3 -> uint3, per component */
+  return (uint3)(convert_uint_rtn(v.x), convert_uint_rtn(v.y), convert_uint_rtn(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(ulong3 v) { /* ulong3 -> short3, per component */
+  return (short3)(convert_short_rte(v.x), convert_short_rte(v.y), convert_short_rte(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(ulong3 v) { /* ulong3 -> short3, per component */
+  return (short3)(convert_short_rtz(v.x), convert_short_rtz(v.y), convert_short_rtz(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(ulong3 v) { /* ulong3 -> short3, per component */
+  return (short3)(convert_short_rtp(v.x), convert_short_rtp(v.y), convert_short_rtp(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(ulong3 v) { /* ulong3 -> short3, per component */
+  return (short3)(convert_short_rtn(v.x), convert_short_rtn(v.y), convert_short_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(ulong3 v) { /* ulong3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rte(v.x), convert_ushort_rte(v.y), convert_ushort_rte(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(ulong3 v) { /* ulong3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rtz(v.x), convert_ushort_rtz(v.y), convert_ushort_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(ulong3 v) { /* ulong3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rtp(v.x), convert_ushort_rtp(v.y), convert_ushort_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(ulong3 v) { /* ulong3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rtn(v.x), convert_ushort_rtn(v.y), convert_ushort_rtn(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(ulong3 v) { /* ulong3 -> char3, per component */
+  return (char3)(convert_char_rte(v.x), convert_char_rte(v.y), convert_char_rte(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(ulong3 v) { /* ulong3 -> char3, per component */
+  return (char3)(convert_char_rtz(v.x), convert_char_rtz(v.y), convert_char_rtz(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(ulong3 v) { /* ulong3 -> char3, per component */
+  return (char3)(convert_char_rtp(v.x), convert_char_rtp(v.y), convert_char_rtp(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(ulong3 v) { /* ulong3 -> char3, per component */
+  return (char3)(convert_char_rtn(v.x), convert_char_rtn(v.y), convert_char_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(ulong3 v) { /* ulong3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rte(v.x), convert_uchar_rte(v.y), convert_uchar_rte(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(ulong3 v) { /* ulong3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rtz(v.x), convert_uchar_rtz(v.y), convert_uchar_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(ulong3 v) { /* ulong3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rtp(v.x), convert_uchar_rtp(v.y), convert_uchar_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(ulong3 v) { /* ulong3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rtn(v.x), convert_uchar_rtn(v.y), convert_uchar_rtn(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(ulong3 v) { /* ulong3 -> float3, per component */
+  return (float3)(convert_float_rte(v.x), convert_float_rte(v.y), convert_float_rte(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(ulong3 v) { /* ulong3 -> float3, per component */
+  return (float3)(convert_float_rtz(v.x), convert_float_rtz(v.y), convert_float_rtz(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(ulong3 v) { /* ulong3 -> float3, per component */
+  return (float3)(convert_float_rtp(v.x), convert_float_rtp(v.y), convert_float_rtp(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(ulong3 v) { /* ulong3 -> float3, per component */
+  return (float3)(convert_float_rtn(v.x), convert_float_rtn(v.y), convert_float_rtn(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(int3 v) { /* int3 -> long3, per component */
+  return (long3)(convert_long_rte(v.x), convert_long_rte(v.y), convert_long_rte(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(int3 v) { /* int3 -> long3, per component */
+  return (long3)(convert_long_rtz(v.x), convert_long_rtz(v.y), convert_long_rtz(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(int3 v) { /* int3 -> long3, per component */
+  return (long3)(convert_long_rtp(v.x), convert_long_rtp(v.y), convert_long_rtp(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(int3 v) { /* int3 -> long3, per component */
+  return (long3)(convert_long_rtn(v.x), convert_long_rtn(v.y), convert_long_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(int3 v) { /* int3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rte(v.x), convert_ulong_rte(v.y), convert_ulong_rte(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(int3 v) { /* int3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rtz(v.x), convert_ulong_rtz(v.y), convert_ulong_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(int3 v) { /* int3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rtp(v.x), convert_ulong_rtp(v.y), convert_ulong_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(int3 v) { /* int3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rtn(v.x), convert_ulong_rtn(v.y), convert_ulong_rtn(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(int3 v) { /* int3 -> int3, per component */
+  return (int3)(convert_int_rte(v.x), convert_int_rte(v.y), convert_int_rte(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(int3 v) { /* int3 -> int3, per component */
+  return (int3)(convert_int_rtz(v.x), convert_int_rtz(v.y), convert_int_rtz(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(int3 v) { /* int3 -> int3, per component */
+  return (int3)(convert_int_rtp(v.x), convert_int_rtp(v.y), convert_int_rtp(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(int3 v) { /* int3 -> int3, per component */
+  return (int3)(convert_int_rtn(v.x), convert_int_rtn(v.y), convert_int_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(int3 v) { /* int3 -> uint3, per component */
+  return (uint3)(convert_uint_rte(v.x), convert_uint_rte(v.y), convert_uint_rte(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(int3 v) { /* int3 -> uint3, per component */
+  return (uint3)(convert_uint_rtz(v.x), convert_uint_rtz(v.y), convert_uint_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(int3 v) { /* int3 -> uint3, per component */
+  return (uint3)(convert_uint_rtp(v.x), convert_uint_rtp(v.y), convert_uint_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(int3 v) { /* int3 -> uint3, per component */
+  return (uint3)(convert_uint_rtn(v.x), convert_uint_rtn(v.y), convert_uint_rtn(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(int3 v) { /* int3 -> short3, per component */
+  return (short3)(convert_short_rte(v.x), convert_short_rte(v.y), convert_short_rte(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(int3 v) { /* int3 -> short3, per component */
+  return (short3)(convert_short_rtz(v.x), convert_short_rtz(v.y), convert_short_rtz(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(int3 v) { /* int3 -> short3, per component */
+  return (short3)(convert_short_rtp(v.x), convert_short_rtp(v.y), convert_short_rtp(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(int3 v) { /* int3 -> short3, per component */
+  return (short3)(convert_short_rtn(v.x), convert_short_rtn(v.y), convert_short_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(int3 v) { /* int3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rte(v.x), convert_ushort_rte(v.y), convert_ushort_rte(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(int3 v) { /* int3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rtz(v.x), convert_ushort_rtz(v.y), convert_ushort_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(int3 v) { /* int3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rtp(v.x), convert_ushort_rtp(v.y), convert_ushort_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(int3 v) { /* int3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rtn(v.x), convert_ushort_rtn(v.y), convert_ushort_rtn(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(int3 v) { /* int3 -> char3, per component */
+  return (char3)(convert_char_rte(v.x), convert_char_rte(v.y), convert_char_rte(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(int3 v) { /* int3 -> char3, per component */
+  return (char3)(convert_char_rtz(v.x), convert_char_rtz(v.y), convert_char_rtz(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(int3 v) { /* int3 -> char3, per component */
+  return (char3)(convert_char_rtp(v.x), convert_char_rtp(v.y), convert_char_rtp(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(int3 v) { /* int3 -> char3, per component */
+  return (char3)(convert_char_rtn(v.x), convert_char_rtn(v.y), convert_char_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(int3 v) { /* int3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rte(v.x), convert_uchar_rte(v.y), convert_uchar_rte(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(int3 v) { /* int3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rtz(v.x), convert_uchar_rtz(v.y), convert_uchar_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(int3 v) { /* int3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rtp(v.x), convert_uchar_rtp(v.y), convert_uchar_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(int3 v) { /* int3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rtn(v.x), convert_uchar_rtn(v.y), convert_uchar_rtn(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(int3 v) { /* int3 -> float3, per component */
+  return (float3)(convert_float_rte(v.x), convert_float_rte(v.y), convert_float_rte(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(int3 v) { /* int3 -> float3, per component */
+  return (float3)(convert_float_rtz(v.x), convert_float_rtz(v.y), convert_float_rtz(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(int3 v) { /* int3 -> float3, per component */
+  return (float3)(convert_float_rtp(v.x), convert_float_rtp(v.y), convert_float_rtp(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(int3 v) { /* int3 -> float3, per component */
+  return (float3)(convert_float_rtn(v.x), convert_float_rtn(v.y), convert_float_rtn(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(uint3 v) { /* uint3 -> long3, per component */
+  return (long3)(convert_long_rte(v.x), convert_long_rte(v.y), convert_long_rte(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(uint3 v) { /* uint3 -> long3, per component */
+  return (long3)(convert_long_rtz(v.x), convert_long_rtz(v.y), convert_long_rtz(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(uint3 v) { /* uint3 -> long3, per component */
+  return (long3)(convert_long_rtp(v.x), convert_long_rtp(v.y), convert_long_rtp(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(uint3 v) { /* uint3 -> long3, per component */
+  return (long3)(convert_long_rtn(v.x), convert_long_rtn(v.y), convert_long_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(uint3 v) { /* uint3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rte(v.x), convert_ulong_rte(v.y), convert_ulong_rte(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(uint3 v) { /* uint3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rtz(v.x), convert_ulong_rtz(v.y), convert_ulong_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(uint3 v) { /* uint3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rtp(v.x), convert_ulong_rtp(v.y), convert_ulong_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(uint3 v) { /* uint3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rtn(v.x), convert_ulong_rtn(v.y), convert_ulong_rtn(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(uint3 v) { /* uint3 -> int3, per component */
+  return (int3)(convert_int_rte(v.x), convert_int_rte(v.y), convert_int_rte(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(uint3 v) { /* uint3 -> int3, per component */
+  return (int3)(convert_int_rtz(v.x), convert_int_rtz(v.y), convert_int_rtz(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(uint3 v) { /* uint3 -> int3, per component */
+  return (int3)(convert_int_rtp(v.x), convert_int_rtp(v.y), convert_int_rtp(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(uint3 v) { /* uint3 -> int3, per component */
+  return (int3)(convert_int_rtn(v.x), convert_int_rtn(v.y), convert_int_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(uint3 v) { /* uint3 -> uint3, per component */
+  return (uint3)(convert_uint_rte(v.x), convert_uint_rte(v.y), convert_uint_rte(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(uint3 v) { /* uint3 -> uint3, per component */
+  return (uint3)(convert_uint_rtz(v.x), convert_uint_rtz(v.y), convert_uint_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(uint3 v) { /* uint3 -> uint3, per component */
+  return (uint3)(convert_uint_rtp(v.x), convert_uint_rtp(v.y), convert_uint_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(uint3 v) { /* uint3 -> uint3, per component */
+  return (uint3)(convert_uint_rtn(v.x), convert_uint_rtn(v.y), convert_uint_rtn(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(uint3 v) { /* uint3 -> short3, per component */
+  return (short3)(convert_short_rte(v.x), convert_short_rte(v.y), convert_short_rte(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(uint3 v) { /* uint3 -> short3, per component */
+  return (short3)(convert_short_rtz(v.x), convert_short_rtz(v.y), convert_short_rtz(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(uint3 v) { /* uint3 -> short3, per component */
+  return (short3)(convert_short_rtp(v.x), convert_short_rtp(v.y), convert_short_rtp(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(uint3 v) { /* uint3 -> short3, per component */
+  return (short3)(convert_short_rtn(v.x), convert_short_rtn(v.y), convert_short_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(uint3 v) { /* uint3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rte(v.x), convert_ushort_rte(v.y), convert_ushort_rte(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(uint3 v) { /* uint3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rtz(v.x), convert_ushort_rtz(v.y), convert_ushort_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(uint3 v) { /* uint3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rtp(v.x), convert_ushort_rtp(v.y), convert_ushort_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(uint3 v) { /* uint3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rtn(v.x), convert_ushort_rtn(v.y), convert_ushort_rtn(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(uint3 v) { /* uint3 -> char3, per component */
+  return (char3)(convert_char_rte(v.x), convert_char_rte(v.y), convert_char_rte(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(uint3 v) { /* uint3 -> char3, per component */
+  return (char3)(convert_char_rtz(v.x), convert_char_rtz(v.y), convert_char_rtz(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(uint3 v) { /* uint3 -> char3, per component */
+  return (char3)(convert_char_rtp(v.x), convert_char_rtp(v.y), convert_char_rtp(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(uint3 v) { /* uint3 -> char3, per component */
+  return (char3)(convert_char_rtn(v.x), convert_char_rtn(v.y), convert_char_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(uint3 v) { /* uint3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rte(v.x), convert_uchar_rte(v.y), convert_uchar_rte(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(uint3 v) { /* uint3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rtz(v.x), convert_uchar_rtz(v.y), convert_uchar_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(uint3 v) { /* uint3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rtp(v.x), convert_uchar_rtp(v.y), convert_uchar_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(uint3 v) { /* uint3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rtn(v.x), convert_uchar_rtn(v.y), convert_uchar_rtn(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(uint3 v) { /* uint3 -> float3, per component */
+  return (float3)(convert_float_rte(v.x), convert_float_rte(v.y), convert_float_rte(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(uint3 v) { /* uint3 -> float3, per component */
+  return (float3)(convert_float_rtz(v.x), convert_float_rtz(v.y), convert_float_rtz(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(uint3 v) { /* uint3 -> float3, per component */
+  return (float3)(convert_float_rtp(v.x), convert_float_rtp(v.y), convert_float_rtp(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(uint3 v) { /* uint3 -> float3, per component */
+  return (float3)(convert_float_rtn(v.x), convert_float_rtn(v.y), convert_float_rtn(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(short3 v) { /* short3 -> long3, per component */
+  return (long3)(convert_long_rte(v.x), convert_long_rte(v.y), convert_long_rte(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(short3 v) { /* short3 -> long3, per component */
+  return (long3)(convert_long_rtz(v.x), convert_long_rtz(v.y), convert_long_rtz(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(short3 v) { /* short3 -> long3, per component */
+  return (long3)(convert_long_rtp(v.x), convert_long_rtp(v.y), convert_long_rtp(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(short3 v) { /* short3 -> long3, per component */
+  return (long3)(convert_long_rtn(v.x), convert_long_rtn(v.y), convert_long_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(short3 v) { /* short3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rte(v.x), convert_ulong_rte(v.y), convert_ulong_rte(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(short3 v) { /* short3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rtz(v.x), convert_ulong_rtz(v.y), convert_ulong_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(short3 v) { /* short3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rtp(v.x), convert_ulong_rtp(v.y), convert_ulong_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(short3 v) { /* short3 -> ulong3, per component */
+  return (ulong3)(convert_ulong_rtn(v.x), convert_ulong_rtn(v.y), convert_ulong_rtn(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(short3 v) { /* short3 -> int3, per component */
+  return (int3)(convert_int_rte(v.x), convert_int_rte(v.y), convert_int_rte(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(short3 v) { /* short3 -> int3, per component */
+  return (int3)(convert_int_rtz(v.x), convert_int_rtz(v.y), convert_int_rtz(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(short3 v) { /* short3 -> int3, per component */
+  return (int3)(convert_int_rtp(v.x), convert_int_rtp(v.y), convert_int_rtp(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(short3 v) { /* short3 -> int3, per component */
+  return (int3)(convert_int_rtn(v.x), convert_int_rtn(v.y), convert_int_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(short3 v) { /* short3 -> uint3, per component */
+  return (uint3)(convert_uint_rte(v.x), convert_uint_rte(v.y), convert_uint_rte(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(short3 v) { /* short3 -> uint3, per component */
+  return (uint3)(convert_uint_rtz(v.x), convert_uint_rtz(v.y), convert_uint_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(short3 v) { /* short3 -> uint3, per component */
+  return (uint3)(convert_uint_rtp(v.x), convert_uint_rtp(v.y), convert_uint_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(short3 v) { /* short3 -> uint3, per component */
+  return (uint3)(convert_uint_rtn(v.x), convert_uint_rtn(v.y), convert_uint_rtn(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(short3 v) { /* short3 -> short3, per component */
+  return (short3)(convert_short_rte(v.x), convert_short_rte(v.y), convert_short_rte(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(short3 v) { /* short3 -> short3, per component */
+  return (short3)(convert_short_rtz(v.x), convert_short_rtz(v.y), convert_short_rtz(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(short3 v) { /* short3 -> short3, per component */
+  return (short3)(convert_short_rtp(v.x), convert_short_rtp(v.y), convert_short_rtp(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(short3 v) { /* short3 -> short3, per component */
+  return (short3)(convert_short_rtn(v.x), convert_short_rtn(v.y), convert_short_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(short3 v) { /* short3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rte(v.x), convert_ushort_rte(v.y), convert_ushort_rte(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(short3 v) { /* short3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rtz(v.x), convert_ushort_rtz(v.y), convert_ushort_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(short3 v) { /* short3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rtp(v.x), convert_ushort_rtp(v.y), convert_ushort_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(short3 v) { /* short3 -> ushort3, per component */
+  return (ushort3)(convert_ushort_rtn(v.x), convert_ushort_rtn(v.y), convert_ushort_rtn(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(short3 v) { /* short3 -> char3, per component */
+  return (char3)(convert_char_rte(v.x), convert_char_rte(v.y), convert_char_rte(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(short3 v) { /* short3 -> char3, per component */
+  return (char3)(convert_char_rtz(v.x), convert_char_rtz(v.y), convert_char_rtz(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(short3 v) { /* short3 -> char3, per component */
+  return (char3)(convert_char_rtp(v.x), convert_char_rtp(v.y), convert_char_rtp(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(short3 v) { /* short3 -> char3, per component */
+  return (char3)(convert_char_rtn(v.x), convert_char_rtn(v.y), convert_char_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(short3 v) { /* short3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rte(v.x), convert_uchar_rte(v.y), convert_uchar_rte(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(short3 v) { /* short3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rtz(v.x), convert_uchar_rtz(v.y), convert_uchar_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(short3 v) { /* short3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rtp(v.x), convert_uchar_rtp(v.y), convert_uchar_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(short3 v) { /* short3 -> uchar3, per component */
+  return (uchar3)(convert_uchar_rtn(v.x), convert_uchar_rtn(v.y), convert_uchar_rtn(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(short3 v) { /* short3 -> float3, per component */
+  return (float3)(convert_float_rte(v.x), convert_float_rte(v.y), convert_float_rte(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(short3 v) { /* short3 -> float3, per component */
+  return (float3)(convert_float_rtz(v.x), convert_float_rtz(v.y), convert_float_rtz(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(short3 v) { /* short3 -> float3, per component */
+  return (float3)(convert_float_rtp(v.x), convert_float_rtp(v.y), convert_float_rtp(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(short3 v) { /* short3 -> float3, per component */
+  return (float3)(convert_float_rtn(v.x), convert_float_rtn(v.y), convert_float_rtn(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(ushort3 v) { /* per-component convert_long_rte */
+  return (long3)(convert_long_rte(v.x), convert_long_rte(v.y), convert_long_rte(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(ushort3 v) { /* per-component convert_long_rtz */
+  return (long3)(convert_long_rtz(v.x), convert_long_rtz(v.y), convert_long_rtz(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(ushort3 v) { /* per-component convert_long_rtp */
+  return (long3)(convert_long_rtp(v.x), convert_long_rtp(v.y), convert_long_rtp(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(ushort3 v) { /* per-component convert_long_rtn */
+  return (long3)(convert_long_rtn(v.x), convert_long_rtn(v.y), convert_long_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(ushort3 v) { /* per-component convert_ulong_rte */
+  return (ulong3)(convert_ulong_rte(v.x), convert_ulong_rte(v.y), convert_ulong_rte(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(ushort3 v) { /* per-component convert_ulong_rtz */
+  return (ulong3)(convert_ulong_rtz(v.x), convert_ulong_rtz(v.y), convert_ulong_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(ushort3 v) { /* per-component convert_ulong_rtp */
+  return (ulong3)(convert_ulong_rtp(v.x), convert_ulong_rtp(v.y), convert_ulong_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(ushort3 v) { /* per-component convert_ulong_rtn */
+  return (ulong3)(convert_ulong_rtn(v.x), convert_ulong_rtn(v.y), convert_ulong_rtn(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(ushort3 v) { /* per-component convert_int_rte */
+  return (int3)(convert_int_rte(v.x), convert_int_rte(v.y), convert_int_rte(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(ushort3 v) { /* per-component convert_int_rtz */
+  return (int3)(convert_int_rtz(v.x), convert_int_rtz(v.y), convert_int_rtz(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(ushort3 v) { /* per-component convert_int_rtp */
+  return (int3)(convert_int_rtp(v.x), convert_int_rtp(v.y), convert_int_rtp(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(ushort3 v) { /* per-component convert_int_rtn */
+  return (int3)(convert_int_rtn(v.x), convert_int_rtn(v.y), convert_int_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(ushort3 v) { /* per-component convert_uint_rte */
+  return (uint3)(convert_uint_rte(v.x), convert_uint_rte(v.y), convert_uint_rte(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(ushort3 v) { /* per-component convert_uint_rtz */
+  return (uint3)(convert_uint_rtz(v.x), convert_uint_rtz(v.y), convert_uint_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(ushort3 v) { /* per-component convert_uint_rtp */
+  return (uint3)(convert_uint_rtp(v.x), convert_uint_rtp(v.y), convert_uint_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(ushort3 v) { /* per-component convert_uint_rtn */
+  return (uint3)(convert_uint_rtn(v.x), convert_uint_rtn(v.y), convert_uint_rtn(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(ushort3 v) { /* per-component convert_short_rte */
+  return (short3)(convert_short_rte(v.x), convert_short_rte(v.y), convert_short_rte(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(ushort3 v) { /* per-component convert_short_rtz */
+  return (short3)(convert_short_rtz(v.x), convert_short_rtz(v.y), convert_short_rtz(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(ushort3 v) { /* per-component convert_short_rtp */
+  return (short3)(convert_short_rtp(v.x), convert_short_rtp(v.y), convert_short_rtp(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(ushort3 v) { /* per-component convert_short_rtn */
+  return (short3)(convert_short_rtn(v.x), convert_short_rtn(v.y), convert_short_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(ushort3 v) { /* per-component convert_ushort_rte */
+  return (ushort3)(convert_ushort_rte(v.x), convert_ushort_rte(v.y), convert_ushort_rte(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(ushort3 v) { /* per-component convert_ushort_rtz */
+  return (ushort3)(convert_ushort_rtz(v.x), convert_ushort_rtz(v.y), convert_ushort_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(ushort3 v) { /* per-component convert_ushort_rtp */
+  return (ushort3)(convert_ushort_rtp(v.x), convert_ushort_rtp(v.y), convert_ushort_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(ushort3 v) { /* per-component convert_ushort_rtn */
+  return (ushort3)(convert_ushort_rtn(v.x), convert_ushort_rtn(v.y), convert_ushort_rtn(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(ushort3 v) { /* per-component convert_char_rte */
+  return (char3)(convert_char_rte(v.x), convert_char_rte(v.y), convert_char_rte(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(ushort3 v) { /* per-component convert_char_rtz */
+  return (char3)(convert_char_rtz(v.x), convert_char_rtz(v.y), convert_char_rtz(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(ushort3 v) { /* per-component convert_char_rtp */
+  return (char3)(convert_char_rtp(v.x), convert_char_rtp(v.y), convert_char_rtp(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(ushort3 v) { /* per-component convert_char_rtn */
+  return (char3)(convert_char_rtn(v.x), convert_char_rtn(v.y), convert_char_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(ushort3 v) { /* per-component convert_uchar_rte */
+  return (uchar3)(convert_uchar_rte(v.x), convert_uchar_rte(v.y), convert_uchar_rte(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(ushort3 v) { /* per-component convert_uchar_rtz */
+  return (uchar3)(convert_uchar_rtz(v.x), convert_uchar_rtz(v.y), convert_uchar_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(ushort3 v) { /* per-component convert_uchar_rtp */
+  return (uchar3)(convert_uchar_rtp(v.x), convert_uchar_rtp(v.y), convert_uchar_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(ushort3 v) { /* per-component convert_uchar_rtn */
+  return (uchar3)(convert_uchar_rtn(v.x), convert_uchar_rtn(v.y), convert_uchar_rtn(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(ushort3 v) { /* per-component convert_float_rte */
+  return (float3)(convert_float_rte(v.x), convert_float_rte(v.y), convert_float_rte(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(ushort3 v) { /* per-component convert_float_rtz */
+  return (float3)(convert_float_rtz(v.x), convert_float_rtz(v.y), convert_float_rtz(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(ushort3 v) { /* per-component convert_float_rtp */
+  return (float3)(convert_float_rtp(v.x), convert_float_rtp(v.y), convert_float_rtp(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(ushort3 v) { /* per-component convert_float_rtn */
+  return (float3)(convert_float_rtn(v.x), convert_float_rtn(v.y), convert_float_rtn(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(char3 v) { /* per-component convert_long_rte */
+  return (long3)(convert_long_rte(v.x), convert_long_rte(v.y), convert_long_rte(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(char3 v) { /* per-component convert_long_rtz */
+  return (long3)(convert_long_rtz(v.x), convert_long_rtz(v.y), convert_long_rtz(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(char3 v) { /* per-component convert_long_rtp */
+  return (long3)(convert_long_rtp(v.x), convert_long_rtp(v.y), convert_long_rtp(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(char3 v) { /* per-component convert_long_rtn */
+  return (long3)(convert_long_rtn(v.x), convert_long_rtn(v.y), convert_long_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(char3 v) { /* per-component convert_ulong_rte */
+  return (ulong3)(convert_ulong_rte(v.x), convert_ulong_rte(v.y), convert_ulong_rte(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(char3 v) { /* per-component convert_ulong_rtz */
+  return (ulong3)(convert_ulong_rtz(v.x), convert_ulong_rtz(v.y), convert_ulong_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(char3 v) { /* per-component convert_ulong_rtp */
+  return (ulong3)(convert_ulong_rtp(v.x), convert_ulong_rtp(v.y), convert_ulong_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(char3 v) { /* per-component convert_ulong_rtn */
+  return (ulong3)(convert_ulong_rtn(v.x), convert_ulong_rtn(v.y), convert_ulong_rtn(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(char3 v) { /* per-component convert_int_rte */
+  return (int3)(convert_int_rte(v.x), convert_int_rte(v.y), convert_int_rte(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(char3 v) { /* per-component convert_int_rtz */
+  return (int3)(convert_int_rtz(v.x), convert_int_rtz(v.y), convert_int_rtz(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(char3 v) { /* per-component convert_int_rtp */
+  return (int3)(convert_int_rtp(v.x), convert_int_rtp(v.y), convert_int_rtp(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(char3 v) { /* per-component convert_int_rtn */
+  return (int3)(convert_int_rtn(v.x), convert_int_rtn(v.y), convert_int_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(char3 v) { /* per-component convert_uint_rte */
+  return (uint3)(convert_uint_rte(v.x), convert_uint_rte(v.y), convert_uint_rte(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(char3 v) { /* per-component convert_uint_rtz */
+  return (uint3)(convert_uint_rtz(v.x), convert_uint_rtz(v.y), convert_uint_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(char3 v) { /* per-component convert_uint_rtp */
+  return (uint3)(convert_uint_rtp(v.x), convert_uint_rtp(v.y), convert_uint_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(char3 v) { /* per-component convert_uint_rtn */
+  return (uint3)(convert_uint_rtn(v.x), convert_uint_rtn(v.y), convert_uint_rtn(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(char3 v) { /* per-component convert_short_rte */
+  return (short3)(convert_short_rte(v.x), convert_short_rte(v.y), convert_short_rte(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(char3 v) { /* per-component convert_short_rtz */
+  return (short3)(convert_short_rtz(v.x), convert_short_rtz(v.y), convert_short_rtz(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(char3 v) { /* per-component convert_short_rtp */
+  return (short3)(convert_short_rtp(v.x), convert_short_rtp(v.y), convert_short_rtp(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(char3 v) { /* per-component convert_short_rtn */
+  return (short3)(convert_short_rtn(v.x), convert_short_rtn(v.y), convert_short_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(char3 v) { /* per-component convert_ushort_rte */
+  return (ushort3)(convert_ushort_rte(v.x), convert_ushort_rte(v.y), convert_ushort_rte(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(char3 v) { /* per-component convert_ushort_rtz */
+  return (ushort3)(convert_ushort_rtz(v.x), convert_ushort_rtz(v.y), convert_ushort_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(char3 v) { /* per-component convert_ushort_rtp */
+  return (ushort3)(convert_ushort_rtp(v.x), convert_ushort_rtp(v.y), convert_ushort_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(char3 v) { /* per-component convert_ushort_rtn */
+  return (ushort3)(convert_ushort_rtn(v.x), convert_ushort_rtn(v.y), convert_ushort_rtn(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(char3 v) { /* per-component convert_char_rte */
+  return (char3)(convert_char_rte(v.x), convert_char_rte(v.y), convert_char_rte(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(char3 v) { /* per-component convert_char_rtz */
+  return (char3)(convert_char_rtz(v.x), convert_char_rtz(v.y), convert_char_rtz(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(char3 v) { /* per-component convert_char_rtp */
+  return (char3)(convert_char_rtp(v.x), convert_char_rtp(v.y), convert_char_rtp(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(char3 v) { /* per-component convert_char_rtn */
+  return (char3)(convert_char_rtn(v.x), convert_char_rtn(v.y), convert_char_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(char3 v) { /* per-component convert_uchar_rte */
+  return (uchar3)(convert_uchar_rte(v.x), convert_uchar_rte(v.y), convert_uchar_rte(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(char3 v) { /* per-component convert_uchar_rtz */
+  return (uchar3)(convert_uchar_rtz(v.x), convert_uchar_rtz(v.y), convert_uchar_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(char3 v) { /* per-component convert_uchar_rtp */
+  return (uchar3)(convert_uchar_rtp(v.x), convert_uchar_rtp(v.y), convert_uchar_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(char3 v) { /* per-component convert_uchar_rtn */
+  return (uchar3)(convert_uchar_rtn(v.x), convert_uchar_rtn(v.y), convert_uchar_rtn(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(char3 v) { /* per-component convert_float_rte */
+  return (float3)(convert_float_rte(v.x), convert_float_rte(v.y), convert_float_rte(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(char3 v) { /* per-component convert_float_rtz */
+  return (float3)(convert_float_rtz(v.x), convert_float_rtz(v.y), convert_float_rtz(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(char3 v) { /* per-component convert_float_rtp */
+  return (float3)(convert_float_rtp(v.x), convert_float_rtp(v.y), convert_float_rtp(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(char3 v) { /* per-component convert_float_rtn */
+  return (float3)(convert_float_rtn(v.x), convert_float_rtn(v.y), convert_float_rtn(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(uchar3 v) { /* per-component convert_long_rte */
+  return (long3)(convert_long_rte(v.x), convert_long_rte(v.y), convert_long_rte(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(uchar3 v) { /* per-component convert_long_rtz */
+  return (long3)(convert_long_rtz(v.x), convert_long_rtz(v.y), convert_long_rtz(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(uchar3 v) { /* per-component convert_long_rtp */
+  return (long3)(convert_long_rtp(v.x), convert_long_rtp(v.y), convert_long_rtp(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(uchar3 v) { /* per-component convert_long_rtn */
+  return (long3)(convert_long_rtn(v.x), convert_long_rtn(v.y), convert_long_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(uchar3 v) { /* per-component convert_ulong_rte */
+  return (ulong3)(convert_ulong_rte(v.x), convert_ulong_rte(v.y), convert_ulong_rte(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(uchar3 v) { /* per-component convert_ulong_rtz */
+  return (ulong3)(convert_ulong_rtz(v.x), convert_ulong_rtz(v.y), convert_ulong_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(uchar3 v) { /* per-component convert_ulong_rtp */
+  return (ulong3)(convert_ulong_rtp(v.x), convert_ulong_rtp(v.y), convert_ulong_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(uchar3 v) { /* per-component convert_ulong_rtn */
+  return (ulong3)(convert_ulong_rtn(v.x), convert_ulong_rtn(v.y), convert_ulong_rtn(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(uchar3 v) { /* per-component convert_int_rte */
+  return (int3)(convert_int_rte(v.x), convert_int_rte(v.y), convert_int_rte(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(uchar3 v) { /* per-component convert_int_rtz */
+  return (int3)(convert_int_rtz(v.x), convert_int_rtz(v.y), convert_int_rtz(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(uchar3 v) { /* per-component convert_int_rtp */
+  return (int3)(convert_int_rtp(v.x), convert_int_rtp(v.y), convert_int_rtp(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(uchar3 v) { /* per-component convert_int_rtn */
+  return (int3)(convert_int_rtn(v.x), convert_int_rtn(v.y), convert_int_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(uchar3 v) { /* per-component convert_uint_rte */
+  return (uint3)(convert_uint_rte(v.x), convert_uint_rte(v.y), convert_uint_rte(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(uchar3 v) { /* per-component convert_uint_rtz */
+  return (uint3)(convert_uint_rtz(v.x), convert_uint_rtz(v.y), convert_uint_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(uchar3 v) { /* per-component convert_uint_rtp */
+  return (uint3)(convert_uint_rtp(v.x), convert_uint_rtp(v.y), convert_uint_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(uchar3 v) { /* per-component convert_uint_rtn */
+  return (uint3)(convert_uint_rtn(v.x), convert_uint_rtn(v.y), convert_uint_rtn(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(uchar3 v) { /* per-component convert_short_rte */
+  return (short3)(convert_short_rte(v.x), convert_short_rte(v.y), convert_short_rte(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(uchar3 v) { /* per-component convert_short_rtz */
+  return (short3)(convert_short_rtz(v.x), convert_short_rtz(v.y), convert_short_rtz(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(uchar3 v) { /* per-component convert_short_rtp */
+  return (short3)(convert_short_rtp(v.x), convert_short_rtp(v.y), convert_short_rtp(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(uchar3 v) { /* per-component convert_short_rtn */
+  return (short3)(convert_short_rtn(v.x), convert_short_rtn(v.y), convert_short_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(uchar3 v) { /* per-component convert_ushort_rte */
+  return (ushort3)(convert_ushort_rte(v.x), convert_ushort_rte(v.y), convert_ushort_rte(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(uchar3 v) { /* per-component convert_ushort_rtz */
+  return (ushort3)(convert_ushort_rtz(v.x), convert_ushort_rtz(v.y), convert_ushort_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(uchar3 v) { /* per-component convert_ushort_rtp */
+  return (ushort3)(convert_ushort_rtp(v.x), convert_ushort_rtp(v.y), convert_ushort_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(uchar3 v) { /* per-component convert_ushort_rtn */
+  return (ushort3)(convert_ushort_rtn(v.x), convert_ushort_rtn(v.y), convert_ushort_rtn(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(uchar3 v) { /* per-component convert_char_rte */
+  return (char3)(convert_char_rte(v.x), convert_char_rte(v.y), convert_char_rte(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(uchar3 v) { /* per-component convert_char_rtz */
+  return (char3)(convert_char_rtz(v.x), convert_char_rtz(v.y), convert_char_rtz(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(uchar3 v) { /* per-component convert_char_rtp */
+  return (char3)(convert_char_rtp(v.x), convert_char_rtp(v.y), convert_char_rtp(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(uchar3 v) { /* per-component convert_char_rtn */
+  return (char3)(convert_char_rtn(v.x), convert_char_rtn(v.y), convert_char_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(uchar3 v) { /* per-component convert_uchar_rte */
+  return (uchar3)(convert_uchar_rte(v.x), convert_uchar_rte(v.y), convert_uchar_rte(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(uchar3 v) { /* per-component convert_uchar_rtz */
+  return (uchar3)(convert_uchar_rtz(v.x), convert_uchar_rtz(v.y), convert_uchar_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(uchar3 v) { /* per-component convert_uchar_rtp */
+  return (uchar3)(convert_uchar_rtp(v.x), convert_uchar_rtp(v.y), convert_uchar_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(uchar3 v) { /* per-component convert_uchar_rtn */
+  return (uchar3)(convert_uchar_rtn(v.x), convert_uchar_rtn(v.y), convert_uchar_rtn(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(uchar3 v) { /* per-component convert_float_rte */
+  return (float3)(convert_float_rte(v.x), convert_float_rte(v.y), convert_float_rte(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(uchar3 v) { /* per-component convert_float_rtz */
+  return (float3)(convert_float_rtz(v.x), convert_float_rtz(v.y), convert_float_rtz(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(uchar3 v) { /* per-component convert_float_rtp */
+  return (float3)(convert_float_rtp(v.x), convert_float_rtp(v.y), convert_float_rtp(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(uchar3 v) { /* per-component convert_float_rtn */
+  return (float3)(convert_float_rtn(v.x), convert_float_rtn(v.y), convert_float_rtn(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(float3 v) { /* per-component convert_long_rte */
+  return (long3)(convert_long_rte(v.x), convert_long_rte(v.y), convert_long_rte(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(float3 v) { /* per-component convert_long_rtz */
+  return (long3)(convert_long_rtz(v.x), convert_long_rtz(v.y), convert_long_rtz(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(float3 v) { /* per-component convert_long_rtp */
+  return (long3)(convert_long_rtp(v.x), convert_long_rtp(v.y), convert_long_rtp(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(float3 v) { /* per-component convert_long_rtn */
+  return (long3)(convert_long_rtn(v.x), convert_long_rtn(v.y), convert_long_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(float3 v) { /* per-component convert_ulong_rte */
+  return (ulong3)(convert_ulong_rte(v.x), convert_ulong_rte(v.y), convert_ulong_rte(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(float3 v) { /* per-component convert_ulong_rtz */
+  return (ulong3)(convert_ulong_rtz(v.x), convert_ulong_rtz(v.y), convert_ulong_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(float3 v) { /* per-component convert_ulong_rtp */
+  return (ulong3)(convert_ulong_rtp(v.x), convert_ulong_rtp(v.y), convert_ulong_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(float3 v) { /* per-component convert_ulong_rtn */
+  return (ulong3)(convert_ulong_rtn(v.x), convert_ulong_rtn(v.y), convert_ulong_rtn(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(float3 v) { /* per-component convert_int_rte */
+  return (int3)(convert_int_rte(v.x), convert_int_rte(v.y), convert_int_rte(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(float3 v) { /* per-component convert_int_rtz */
+  return (int3)(convert_int_rtz(v.x), convert_int_rtz(v.y), convert_int_rtz(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(float3 v) { /* per-component convert_int_rtp */
+  return (int3)(convert_int_rtp(v.x), convert_int_rtp(v.y), convert_int_rtp(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(float3 v) { /* per-component convert_int_rtn */
+  return (int3)(convert_int_rtn(v.x), convert_int_rtn(v.y), convert_int_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(float3 v) { /* per-component convert_uint_rte */
+  return (uint3)(convert_uint_rte(v.x), convert_uint_rte(v.y), convert_uint_rte(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(float3 v) { /* per-component convert_uint_rtz */
+  return (uint3)(convert_uint_rtz(v.x), convert_uint_rtz(v.y), convert_uint_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(float3 v) { /* per-component convert_uint_rtp */
+  return (uint3)(convert_uint_rtp(v.x), convert_uint_rtp(v.y), convert_uint_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(float3 v) { /* per-component convert_uint_rtn */
+  return (uint3)(convert_uint_rtn(v.x), convert_uint_rtn(v.y), convert_uint_rtn(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(float3 v) { /* per-component convert_short_rte */
+  return (short3)(convert_short_rte(v.x), convert_short_rte(v.y), convert_short_rte(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(float3 v) { /* per-component convert_short_rtz */
+  return (short3)(convert_short_rtz(v.x), convert_short_rtz(v.y), convert_short_rtz(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(float3 v) { /* per-component convert_short_rtp */
+  return (short3)(convert_short_rtp(v.x), convert_short_rtp(v.y), convert_short_rtp(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(float3 v) { /* per-component convert_short_rtn */
+  return (short3)(convert_short_rtn(v.x), convert_short_rtn(v.y), convert_short_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(float3 v) { /* per-component convert_ushort_rte */
+  return (ushort3)(convert_ushort_rte(v.x), convert_ushort_rte(v.y), convert_ushort_rte(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(float3 v) { /* per-component convert_ushort_rtz */
+  return (ushort3)(convert_ushort_rtz(v.x), convert_ushort_rtz(v.y), convert_ushort_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(float3 v) { /* per-component convert_ushort_rtp */
+  return (ushort3)(convert_ushort_rtp(v.x), convert_ushort_rtp(v.y), convert_ushort_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(float3 v) { /* per-component convert_ushort_rtn */
+  return (ushort3)(convert_ushort_rtn(v.x), convert_ushort_rtn(v.y), convert_ushort_rtn(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(float3 v) { /* per-component convert_char_rte */
+  return (char3)(convert_char_rte(v.x), convert_char_rte(v.y), convert_char_rte(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(float3 v) { /* per-component convert_char_rtz */
+  return (char3)(convert_char_rtz(v.x), convert_char_rtz(v.y), convert_char_rtz(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(float3 v) { /* per-component convert_char_rtp */
+  return (char3)(convert_char_rtp(v.x), convert_char_rtp(v.y), convert_char_rtp(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(float3 v) { /* per-component convert_char_rtn */
+  return (char3)(convert_char_rtn(v.x), convert_char_rtn(v.y), convert_char_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(float3 v) { /* per-component convert_uchar_rte */
+  return (uchar3)(convert_uchar_rte(v.x), convert_uchar_rte(v.y), convert_uchar_rte(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(float3 v) { /* per-component convert_uchar_rtz */
+  return (uchar3)(convert_uchar_rtz(v.x), convert_uchar_rtz(v.y), convert_uchar_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(float3 v) { /* per-component convert_uchar_rtp */
+  return (uchar3)(convert_uchar_rtp(v.x), convert_uchar_rtp(v.y), convert_uchar_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(float3 v) { /* per-component convert_uchar_rtn */
+  return (uchar3)(convert_uchar_rtn(v.x), convert_uchar_rtn(v.y), convert_uchar_rtn(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(float3 v) { /* per-component convert_float_rte */
+  return (float3)(convert_float_rte(v.x), convert_float_rte(v.y), convert_float_rte(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(float3 v) { /* per-component convert_float_rtz */
+  return (float3)(convert_float_rtz(v.x), convert_float_rtz(v.y), convert_float_rtz(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(float3 v) { /* per-component convert_float_rtp */
+  return (float3)(convert_float_rtp(v.x), convert_float_rtp(v.y), convert_float_rtp(v.z));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(float3 v) { /* per-component convert_float_rtn */
+  return (float3)(convert_float_rtn(v.x), convert_float_rtn(v.y), convert_float_rtn(v.z));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(long4 v) { /* per-component convert_long_rte */
+  return (long4)(convert_long_rte(v.x), convert_long_rte(v.y), convert_long_rte(v.z), convert_long_rte(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(long4 v) { /* per-component convert_long_rtz */
+  return (long4)(convert_long_rtz(v.x), convert_long_rtz(v.y), convert_long_rtz(v.z), convert_long_rtz(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(long4 v) { /* per-component convert_long_rtp */
+  return (long4)(convert_long_rtp(v.x), convert_long_rtp(v.y), convert_long_rtp(v.z), convert_long_rtp(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(long4 v) { /* per-component convert_long_rtn */
+  return (long4)(convert_long_rtn(v.x), convert_long_rtn(v.y), convert_long_rtn(v.z), convert_long_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(long4 v) { /* per-component convert_ulong_rte */
+  return (ulong4)(convert_ulong_rte(v.x), convert_ulong_rte(v.y), convert_ulong_rte(v.z), convert_ulong_rte(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(long4 v) { /* per-component convert_ulong_rtz */
+  return (ulong4)(convert_ulong_rtz(v.x), convert_ulong_rtz(v.y), convert_ulong_rtz(v.z), convert_ulong_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(long4 v) { /* per-component convert_ulong_rtp */
+  return (ulong4)(convert_ulong_rtp(v.x), convert_ulong_rtp(v.y), convert_ulong_rtp(v.z), convert_ulong_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(long4 v) { /* per-component convert_ulong_rtn */
+  return (ulong4)(convert_ulong_rtn(v.x), convert_ulong_rtn(v.y), convert_ulong_rtn(v.z), convert_ulong_rtn(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(long4 v) { /* per-component convert_int_rte */
+  return (int4)(convert_int_rte(v.x), convert_int_rte(v.y), convert_int_rte(v.z), convert_int_rte(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(long4 v) { /* per-component convert_int_rtz */
+  return (int4)(convert_int_rtz(v.x), convert_int_rtz(v.y), convert_int_rtz(v.z), convert_int_rtz(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(long4 v) { /* per-component convert_int_rtp */
+  return (int4)(convert_int_rtp(v.x), convert_int_rtp(v.y), convert_int_rtp(v.z), convert_int_rtp(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(long4 v) { /* per-component convert_int_rtn */
+  return (int4)(convert_int_rtn(v.x), convert_int_rtn(v.y), convert_int_rtn(v.z), convert_int_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(long4 v) { /* per-component convert_uint_rte */
+  return (uint4)(convert_uint_rte(v.x), convert_uint_rte(v.y), convert_uint_rte(v.z), convert_uint_rte(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(long4 v) { /* per-component convert_uint_rtz */
+  return (uint4)(convert_uint_rtz(v.x), convert_uint_rtz(v.y), convert_uint_rtz(v.z), convert_uint_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(long4 v) { /* per-component convert_uint_rtp */
+  return (uint4)(convert_uint_rtp(v.x), convert_uint_rtp(v.y), convert_uint_rtp(v.z), convert_uint_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(long4 v) { /* per-component convert_uint_rtn */
+  return (uint4)(convert_uint_rtn(v.x), convert_uint_rtn(v.y), convert_uint_rtn(v.z), convert_uint_rtn(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(long4 v) { /* per-component convert_short_rte */
+  return (short4)(convert_short_rte(v.x), convert_short_rte(v.y), convert_short_rte(v.z), convert_short_rte(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(long4 v) { /* per-component convert_short_rtz */
+  return (short4)(convert_short_rtz(v.x), convert_short_rtz(v.y), convert_short_rtz(v.z), convert_short_rtz(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(long4 v) { /* per-component convert_short_rtp */
+  return (short4)(convert_short_rtp(v.x), convert_short_rtp(v.y), convert_short_rtp(v.z), convert_short_rtp(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(long4 v) { /* per-component convert_short_rtn */
+  return (short4)(convert_short_rtn(v.x), convert_short_rtn(v.y), convert_short_rtn(v.z), convert_short_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(long4 v) { /* per-component convert_ushort_rte */
+  return (ushort4)(convert_ushort_rte(v.x), convert_ushort_rte(v.y), convert_ushort_rte(v.z), convert_ushort_rte(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(long4 v) { /* per-component convert_ushort_rtz */
+  return (ushort4)(convert_ushort_rtz(v.x), convert_ushort_rtz(v.y), convert_ushort_rtz(v.z), convert_ushort_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(long4 v) { /* per-component convert_ushort_rtp */
+  return (ushort4)(convert_ushort_rtp(v.x), convert_ushort_rtp(v.y), convert_ushort_rtp(v.z), convert_ushort_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(long4 v) { /* per-component convert_ushort_rtn */
+  return (ushort4)(convert_ushort_rtn(v.x), convert_ushort_rtn(v.y), convert_ushort_rtn(v.z), convert_ushort_rtn(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(long4 v) { /* per-component convert_char_rte */
+  return (char4)(convert_char_rte(v.x), convert_char_rte(v.y), convert_char_rte(v.z), convert_char_rte(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(long4 v) { /* per-component convert_char_rtz */
+  return (char4)(convert_char_rtz(v.x), convert_char_rtz(v.y), convert_char_rtz(v.z), convert_char_rtz(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(long4 v) { /* per-component convert_char_rtp */
+  return (char4)(convert_char_rtp(v.x), convert_char_rtp(v.y), convert_char_rtp(v.z), convert_char_rtp(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(long4 v) { /* per-component convert_char_rtn */
+  return (char4)(convert_char_rtn(v.x), convert_char_rtn(v.y), convert_char_rtn(v.z), convert_char_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(long4 v) { /* per-component convert_uchar_rte */
+  return (uchar4)(convert_uchar_rte(v.x), convert_uchar_rte(v.y), convert_uchar_rte(v.z), convert_uchar_rte(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(long4 v) { /* per-component convert_uchar_rtz */
+  return (uchar4)(convert_uchar_rtz(v.x), convert_uchar_rtz(v.y), convert_uchar_rtz(v.z), convert_uchar_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(long4 v) { /* per-component convert_uchar_rtp */
+  return (uchar4)(convert_uchar_rtp(v.x), convert_uchar_rtp(v.y), convert_uchar_rtp(v.z), convert_uchar_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(long4 v) { /* per-component convert_uchar_rtn */
+  return (uchar4)(convert_uchar_rtn(v.x), convert_uchar_rtn(v.y), convert_uchar_rtn(v.z), convert_uchar_rtn(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(long4 v) { /* per-component convert_float_rte */
+  return (float4)(convert_float_rte(v.x), convert_float_rte(v.y), convert_float_rte(v.z), convert_float_rte(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(long4 v) { /* per-component convert_float_rtz */
+  return (float4)(convert_float_rtz(v.x), convert_float_rtz(v.y), convert_float_rtz(v.z), convert_float_rtz(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(long4 v) { /* per-component convert_float_rtp */
+  return (float4)(convert_float_rtp(v.x), convert_float_rtp(v.y), convert_float_rtp(v.z), convert_float_rtp(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(long4 v) { /* per-component convert_float_rtn */
+  return (float4)(convert_float_rtn(v.x), convert_float_rtn(v.y), convert_float_rtn(v.z), convert_float_rtn(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(ulong4 v) { /* per-component convert_long_rte */
+  return (long4)(convert_long_rte(v.x), convert_long_rte(v.y), convert_long_rte(v.z), convert_long_rte(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(ulong4 v) { /* per-component convert_long_rtz */
+  return (long4)(convert_long_rtz(v.x), convert_long_rtz(v.y), convert_long_rtz(v.z), convert_long_rtz(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(ulong4 v) { /* per-component convert_long_rtp */
+  return (long4)(convert_long_rtp(v.x), convert_long_rtp(v.y), convert_long_rtp(v.z), convert_long_rtp(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(ulong4 v) { /* per-component convert_long_rtn */
+  return (long4)(convert_long_rtn(v.x), convert_long_rtn(v.y), convert_long_rtn(v.z), convert_long_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(ulong4 v) { /* per-component convert_ulong_rte */
+  return (ulong4)(convert_ulong_rte(v.x), convert_ulong_rte(v.y), convert_ulong_rte(v.z), convert_ulong_rte(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(ulong4 v) { /* per-component convert_ulong_rtz */
+  return (ulong4)(convert_ulong_rtz(v.x), convert_ulong_rtz(v.y), convert_ulong_rtz(v.z), convert_ulong_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(ulong4 v) { /* per-component convert_ulong_rtp */
+  return (ulong4)(convert_ulong_rtp(v.x), convert_ulong_rtp(v.y), convert_ulong_rtp(v.z), convert_ulong_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(ulong4 v) { /* per-component convert_ulong_rtn */
+  return (ulong4)(convert_ulong_rtn(v.x), convert_ulong_rtn(v.y), convert_ulong_rtn(v.z), convert_ulong_rtn(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(ulong4 v) { /* per-component convert_int_rte */
+  return (int4)(convert_int_rte(v.x), convert_int_rte(v.y), convert_int_rte(v.z), convert_int_rte(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(ulong4 v) { /* per-component convert_int_rtz */
+  return (int4)(convert_int_rtz(v.x), convert_int_rtz(v.y), convert_int_rtz(v.z), convert_int_rtz(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(ulong4 v) { /* per-component convert_int_rtp */
+  return (int4)(convert_int_rtp(v.x), convert_int_rtp(v.y), convert_int_rtp(v.z), convert_int_rtp(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(ulong4 v) { /* per-component convert_int_rtn */
+  return (int4)(convert_int_rtn(v.x), convert_int_rtn(v.y), convert_int_rtn(v.z), convert_int_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(ulong4 v) { /* per-component convert_uint_rte */
+  return (uint4)(convert_uint_rte(v.x), convert_uint_rte(v.y), convert_uint_rte(v.z), convert_uint_rte(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(ulong4 v) { /* per-component convert_uint_rtz */
+  return (uint4)(convert_uint_rtz(v.x), convert_uint_rtz(v.y), convert_uint_rtz(v.z), convert_uint_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(ulong4 v) { /* per-component convert_uint_rtp */
+  return (uint4)(convert_uint_rtp(v.x), convert_uint_rtp(v.y), convert_uint_rtp(v.z), convert_uint_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(ulong4 v) { /* per-component convert_uint_rtn */
+  return (uint4)(convert_uint_rtn(v.x), convert_uint_rtn(v.y), convert_uint_rtn(v.z), convert_uint_rtn(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(ulong4 v) {
+  return (short4)(convert_short_rte(v.x), convert_short_rte(v.y), convert_short_rte(v.z), convert_short_rte(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(ulong4 v) {
+  return (short4)(convert_short_rtz(v.x), convert_short_rtz(v.y), convert_short_rtz(v.z), convert_short_rtz(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(ulong4 v) {
+  return (short4)(convert_short_rtp(v.x), convert_short_rtp(v.y), convert_short_rtp(v.z), convert_short_rtp(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(ulong4 v) {
+  return (short4)(convert_short_rtn(v.x), convert_short_rtn(v.y), convert_short_rtn(v.z), convert_short_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(ulong4 v) {
+  return (ushort4)(convert_ushort_rte(v.x), convert_ushort_rte(v.y), convert_ushort_rte(v.z), convert_ushort_rte(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(ulong4 v) {
+  return (ushort4)(convert_ushort_rtz(v.x), convert_ushort_rtz(v.y), convert_ushort_rtz(v.z), convert_ushort_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(ulong4 v) {
+  return (ushort4)(convert_ushort_rtp(v.x), convert_ushort_rtp(v.y), convert_ushort_rtp(v.z), convert_ushort_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(ulong4 v) {
+  return (ushort4)(convert_ushort_rtn(v.x), convert_ushort_rtn(v.y), convert_ushort_rtn(v.z), convert_ushort_rtn(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(ulong4 v) {
+  return (char4)(convert_char_rte(v.x), convert_char_rte(v.y), convert_char_rte(v.z), convert_char_rte(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(ulong4 v) {
+  return (char4)(convert_char_rtz(v.x), convert_char_rtz(v.y), convert_char_rtz(v.z), convert_char_rtz(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(ulong4 v) {
+  return (char4)(convert_char_rtp(v.x), convert_char_rtp(v.y), convert_char_rtp(v.z), convert_char_rtp(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(ulong4 v) {
+  return (char4)(convert_char_rtn(v.x), convert_char_rtn(v.y), convert_char_rtn(v.z), convert_char_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(ulong4 v) {
+  return (uchar4)(convert_uchar_rte(v.x), convert_uchar_rte(v.y), convert_uchar_rte(v.z), convert_uchar_rte(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(ulong4 v) {
+  return (uchar4)(convert_uchar_rtz(v.x), convert_uchar_rtz(v.y), convert_uchar_rtz(v.z), convert_uchar_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(ulong4 v) {
+  return (uchar4)(convert_uchar_rtp(v.x), convert_uchar_rtp(v.y), convert_uchar_rtp(v.z), convert_uchar_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(ulong4 v) {
+  return (uchar4)(convert_uchar_rtn(v.x), convert_uchar_rtn(v.y), convert_uchar_rtn(v.z), convert_uchar_rtn(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(ulong4 v) {
+  return (float4)(convert_float_rte(v.x), convert_float_rte(v.y), convert_float_rte(v.z), convert_float_rte(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(ulong4 v) {
+  return (float4)(convert_float_rtz(v.x), convert_float_rtz(v.y), convert_float_rtz(v.z), convert_float_rtz(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(ulong4 v) {
+  return (float4)(convert_float_rtp(v.x), convert_float_rtp(v.y), convert_float_rtp(v.z), convert_float_rtp(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(ulong4 v) {
+  return (float4)(convert_float_rtn(v.x), convert_float_rtn(v.y), convert_float_rtn(v.z), convert_float_rtn(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(int4 v) {
+  return (long4)(convert_long_rte(v.x), convert_long_rte(v.y), convert_long_rte(v.z), convert_long_rte(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(int4 v) {
+  return (long4)(convert_long_rtz(v.x), convert_long_rtz(v.y), convert_long_rtz(v.z), convert_long_rtz(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(int4 v) {
+  return (long4)(convert_long_rtp(v.x), convert_long_rtp(v.y), convert_long_rtp(v.z), convert_long_rtp(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(int4 v) {
+  return (long4)(convert_long_rtn(v.x), convert_long_rtn(v.y), convert_long_rtn(v.z), convert_long_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(int4 v) {
+  return (ulong4)(convert_ulong_rte(v.x), convert_ulong_rte(v.y), convert_ulong_rte(v.z), convert_ulong_rte(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(int4 v) {
+  return (ulong4)(convert_ulong_rtz(v.x), convert_ulong_rtz(v.y), convert_ulong_rtz(v.z), convert_ulong_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(int4 v) {
+  return (ulong4)(convert_ulong_rtp(v.x), convert_ulong_rtp(v.y), convert_ulong_rtp(v.z), convert_ulong_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(int4 v) {
+  return (ulong4)(convert_ulong_rtn(v.x), convert_ulong_rtn(v.y), convert_ulong_rtn(v.z), convert_ulong_rtn(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(int4 v) {
+  return (int4)(convert_int_rte(v.x), convert_int_rte(v.y), convert_int_rte(v.z), convert_int_rte(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(int4 v) {
+  return (int4)(convert_int_rtz(v.x), convert_int_rtz(v.y), convert_int_rtz(v.z), convert_int_rtz(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(int4 v) {
+  return (int4)(convert_int_rtp(v.x), convert_int_rtp(v.y), convert_int_rtp(v.z), convert_int_rtp(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(int4 v) {
+  return (int4)(convert_int_rtn(v.x), convert_int_rtn(v.y), convert_int_rtn(v.z), convert_int_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(int4 v) {
+  return (uint4)(convert_uint_rte(v.x), convert_uint_rte(v.y), convert_uint_rte(v.z), convert_uint_rte(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(int4 v) {
+  return (uint4)(convert_uint_rtz(v.x), convert_uint_rtz(v.y), convert_uint_rtz(v.z), convert_uint_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(int4 v) {
+  return (uint4)(convert_uint_rtp(v.x), convert_uint_rtp(v.y), convert_uint_rtp(v.z), convert_uint_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(int4 v) {
+  return (uint4)(convert_uint_rtn(v.x), convert_uint_rtn(v.y), convert_uint_rtn(v.z), convert_uint_rtn(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(int4 v) {
+  return (short4)(convert_short_rte(v.x), convert_short_rte(v.y), convert_short_rte(v.z), convert_short_rte(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(int4 v) {
+  return (short4)(convert_short_rtz(v.x), convert_short_rtz(v.y), convert_short_rtz(v.z), convert_short_rtz(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(int4 v) {
+  return (short4)(convert_short_rtp(v.x), convert_short_rtp(v.y), convert_short_rtp(v.z), convert_short_rtp(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(int4 v) {
+  return (short4)(convert_short_rtn(v.x), convert_short_rtn(v.y), convert_short_rtn(v.z), convert_short_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(int4 v) {
+  return (ushort4)(convert_ushort_rte(v.x), convert_ushort_rte(v.y), convert_ushort_rte(v.z), convert_ushort_rte(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(int4 v) {
+  return (ushort4)(convert_ushort_rtz(v.x), convert_ushort_rtz(v.y), convert_ushort_rtz(v.z), convert_ushort_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(int4 v) {
+  return (ushort4)(convert_ushort_rtp(v.x), convert_ushort_rtp(v.y), convert_ushort_rtp(v.z), convert_ushort_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(int4 v) {
+  return (ushort4)(convert_ushort_rtn(v.x), convert_ushort_rtn(v.y), convert_ushort_rtn(v.z), convert_ushort_rtn(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(int4 v) {
+  return (char4)(convert_char_rte(v.x), convert_char_rte(v.y), convert_char_rte(v.z), convert_char_rte(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(int4 v) {
+  return (char4)(convert_char_rtz(v.x), convert_char_rtz(v.y), convert_char_rtz(v.z), convert_char_rtz(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(int4 v) {
+  return (char4)(convert_char_rtp(v.x), convert_char_rtp(v.y), convert_char_rtp(v.z), convert_char_rtp(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(int4 v) {
+  return (char4)(convert_char_rtn(v.x), convert_char_rtn(v.y), convert_char_rtn(v.z), convert_char_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(int4 v) {
+  return (uchar4)(convert_uchar_rte(v.x), convert_uchar_rte(v.y), convert_uchar_rte(v.z), convert_uchar_rte(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(int4 v) {
+  return (uchar4)(convert_uchar_rtz(v.x), convert_uchar_rtz(v.y), convert_uchar_rtz(v.z), convert_uchar_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(int4 v) {
+  return (uchar4)(convert_uchar_rtp(v.x), convert_uchar_rtp(v.y), convert_uchar_rtp(v.z), convert_uchar_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(int4 v) {
+  return (uchar4)(convert_uchar_rtn(v.x), convert_uchar_rtn(v.y), convert_uchar_rtn(v.z), convert_uchar_rtn(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(int4 v) {
+  return (float4)(convert_float_rte(v.x), convert_float_rte(v.y), convert_float_rte(v.z), convert_float_rte(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(int4 v) {
+  return (float4)(convert_float_rtz(v.x), convert_float_rtz(v.y), convert_float_rtz(v.z), convert_float_rtz(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(int4 v) {
+  return (float4)(convert_float_rtp(v.x), convert_float_rtp(v.y), convert_float_rtp(v.z), convert_float_rtp(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(int4 v) {
+  return (float4)(convert_float_rtn(v.x), convert_float_rtn(v.y), convert_float_rtn(v.z), convert_float_rtn(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(uint4 v) {
+  return (long4)(convert_long_rte(v.x), convert_long_rte(v.y), convert_long_rte(v.z), convert_long_rte(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(uint4 v) {
+  return (long4)(convert_long_rtz(v.x), convert_long_rtz(v.y), convert_long_rtz(v.z), convert_long_rtz(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(uint4 v) {
+  return (long4)(convert_long_rtp(v.x), convert_long_rtp(v.y), convert_long_rtp(v.z), convert_long_rtp(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(uint4 v) {
+  return (long4)(convert_long_rtn(v.x), convert_long_rtn(v.y), convert_long_rtn(v.z), convert_long_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(uint4 v) {
+  return (ulong4)(convert_ulong_rte(v.x), convert_ulong_rte(v.y), convert_ulong_rte(v.z), convert_ulong_rte(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(uint4 v) {
+  return (ulong4)(convert_ulong_rtz(v.x), convert_ulong_rtz(v.y), convert_ulong_rtz(v.z), convert_ulong_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(uint4 v) {
+  return (ulong4)(convert_ulong_rtp(v.x), convert_ulong_rtp(v.y), convert_ulong_rtp(v.z), convert_ulong_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(uint4 v) {
+  return (ulong4)(convert_ulong_rtn(v.x), convert_ulong_rtn(v.y), convert_ulong_rtn(v.z), convert_ulong_rtn(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(uint4 v) {
+  return (int4)(convert_int_rte(v.x), convert_int_rte(v.y), convert_int_rte(v.z), convert_int_rte(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(uint4 v) {
+  return (int4)(convert_int_rtz(v.x), convert_int_rtz(v.y), convert_int_rtz(v.z), convert_int_rtz(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(uint4 v) {
+  return (int4)(convert_int_rtp(v.x), convert_int_rtp(v.y), convert_int_rtp(v.z), convert_int_rtp(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(uint4 v) {
+  return (int4)(convert_int_rtn(v.x), convert_int_rtn(v.y), convert_int_rtn(v.z), convert_int_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(uint4 v) {
+  return (uint4)(convert_uint_rte(v.x), convert_uint_rte(v.y), convert_uint_rte(v.z), convert_uint_rte(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(uint4 v) {
+  return (uint4)(convert_uint_rtz(v.x), convert_uint_rtz(v.y), convert_uint_rtz(v.z), convert_uint_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(uint4 v) {
+  return (uint4)(convert_uint_rtp(v.x), convert_uint_rtp(v.y), convert_uint_rtp(v.z), convert_uint_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(uint4 v) {
+  return (uint4)(convert_uint_rtn(v.x), convert_uint_rtn(v.y), convert_uint_rtn(v.z), convert_uint_rtn(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(uint4 v) {
+  return (short4)(convert_short_rte(v.x), convert_short_rte(v.y), convert_short_rte(v.z), convert_short_rte(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(uint4 v) {
+  return (short4)(convert_short_rtz(v.x), convert_short_rtz(v.y), convert_short_rtz(v.z), convert_short_rtz(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(uint4 v) {
+  return (short4)(convert_short_rtp(v.x), convert_short_rtp(v.y), convert_short_rtp(v.z), convert_short_rtp(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(uint4 v) {
+  return (short4)(convert_short_rtn(v.x), convert_short_rtn(v.y), convert_short_rtn(v.z), convert_short_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(uint4 v) {
+  return (ushort4)(convert_ushort_rte(v.x), convert_ushort_rte(v.y), convert_ushort_rte(v.z), convert_ushort_rte(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(uint4 v) {
+  return (ushort4)(convert_ushort_rtz(v.x), convert_ushort_rtz(v.y), convert_ushort_rtz(v.z), convert_ushort_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(uint4 v) {
+  return (ushort4)(convert_ushort_rtp(v.x), convert_ushort_rtp(v.y), convert_ushort_rtp(v.z), convert_ushort_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(uint4 v) {
+  return (ushort4)(convert_ushort_rtn(v.x), convert_ushort_rtn(v.y), convert_ushort_rtn(v.z), convert_ushort_rtn(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(uint4 v) {
+  return (char4)(convert_char_rte(v.x), convert_char_rte(v.y), convert_char_rte(v.z), convert_char_rte(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(uint4 v) {
+  return (char4)(convert_char_rtz(v.x), convert_char_rtz(v.y), convert_char_rtz(v.z), convert_char_rtz(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(uint4 v) {
+  return (char4)(convert_char_rtp(v.x), convert_char_rtp(v.y), convert_char_rtp(v.z), convert_char_rtp(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(uint4 v) {
+  return (char4)(convert_char_rtn(v.x), convert_char_rtn(v.y), convert_char_rtn(v.z), convert_char_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(uint4 v) {
+  return (uchar4)(convert_uchar_rte(v.x), convert_uchar_rte(v.y), convert_uchar_rte(v.z), convert_uchar_rte(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(uint4 v) {
+  return (uchar4)(convert_uchar_rtz(v.x), convert_uchar_rtz(v.y), convert_uchar_rtz(v.z), convert_uchar_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(uint4 v) {
+  return (uchar4)(convert_uchar_rtp(v.x), convert_uchar_rtp(v.y), convert_uchar_rtp(v.z), convert_uchar_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(uint4 v) {
+  return (uchar4)(convert_uchar_rtn(v.x), convert_uchar_rtn(v.y), convert_uchar_rtn(v.z), convert_uchar_rtn(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(uint4 v) {
+  return (float4)(convert_float_rte(v.x), convert_float_rte(v.y), convert_float_rte(v.z), convert_float_rte(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(uint4 v) {
+  return (float4)(convert_float_rtz(v.x), convert_float_rtz(v.y), convert_float_rtz(v.z), convert_float_rtz(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(uint4 v) {
+  return (float4)(convert_float_rtp(v.x), convert_float_rtp(v.y), convert_float_rtp(v.z), convert_float_rtp(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(uint4 v) {
+  return (float4)(convert_float_rtn(v.x), convert_float_rtn(v.y), convert_float_rtn(v.z), convert_float_rtn(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(short4 v) {
+  return (long4)(convert_long_rte(v.x), convert_long_rte(v.y), convert_long_rte(v.z), convert_long_rte(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(short4 v) {
+  return (long4)(convert_long_rtz(v.x), convert_long_rtz(v.y), convert_long_rtz(v.z), convert_long_rtz(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(short4 v) {
+  return (long4)(convert_long_rtp(v.x), convert_long_rtp(v.y), convert_long_rtp(v.z), convert_long_rtp(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(short4 v) {
+  return (long4)(convert_long_rtn(v.x), convert_long_rtn(v.y), convert_long_rtn(v.z), convert_long_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(short4 v) {
+  return (ulong4)(convert_ulong_rte(v.x), convert_ulong_rte(v.y), convert_ulong_rte(v.z), convert_ulong_rte(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(short4 v) {
+  return (ulong4)(convert_ulong_rtz(v.x), convert_ulong_rtz(v.y), convert_ulong_rtz(v.z), convert_ulong_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(short4 v) {
+  return (ulong4)(convert_ulong_rtp(v.x), convert_ulong_rtp(v.y), convert_ulong_rtp(v.z), convert_ulong_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(short4 v) {
+  return (ulong4)(convert_ulong_rtn(v.x), convert_ulong_rtn(v.y), convert_ulong_rtn(v.z), convert_ulong_rtn(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(short4 v) {
+  return (int4)(convert_int_rte(v.x), convert_int_rte(v.y), convert_int_rte(v.z), convert_int_rte(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(short4 v) {
+  return (int4)(convert_int_rtz(v.x), convert_int_rtz(v.y), convert_int_rtz(v.z), convert_int_rtz(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(short4 v) {
+  return (int4)(convert_int_rtp(v.x), convert_int_rtp(v.y), convert_int_rtp(v.z), convert_int_rtp(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(short4 v) {
+  return (int4)(convert_int_rtn(v.x), convert_int_rtn(v.y), convert_int_rtn(v.z), convert_int_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(short4 v) {
+  return (uint4)(convert_uint_rte(v.x), convert_uint_rte(v.y), convert_uint_rte(v.z), convert_uint_rte(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(short4 v) {
+  return (uint4)(convert_uint_rtz(v.x), convert_uint_rtz(v.y), convert_uint_rtz(v.z), convert_uint_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(short4 v) {
+  return (uint4)(convert_uint_rtp(v.x), convert_uint_rtp(v.y), convert_uint_rtp(v.z), convert_uint_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(short4 v) {
+  return (uint4)(convert_uint_rtn(v.x), convert_uint_rtn(v.y), convert_uint_rtn(v.z), convert_uint_rtn(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(short4 v) {
+  return (short4)(convert_short_rte(v.x), convert_short_rte(v.y), convert_short_rte(v.z), convert_short_rte(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(short4 v) {
+  return (short4)(convert_short_rtz(v.x), convert_short_rtz(v.y), convert_short_rtz(v.z), convert_short_rtz(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(short4 v) {
+  return (short4)(convert_short_rtp(v.x), convert_short_rtp(v.y), convert_short_rtp(v.z), convert_short_rtp(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(short4 v) {
+  return (short4)(convert_short_rtn(v.x), convert_short_rtn(v.y), convert_short_rtn(v.z), convert_short_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(short4 v) {
+  return (ushort4)(convert_ushort_rte(v.x), convert_ushort_rte(v.y), convert_ushort_rte(v.z), convert_ushort_rte(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(short4 v) {
+  return (ushort4)(convert_ushort_rtz(v.x), convert_ushort_rtz(v.y), convert_ushort_rtz(v.z), convert_ushort_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(short4 v) {
+  return (ushort4)(convert_ushort_rtp(v.x), convert_ushort_rtp(v.y), convert_ushort_rtp(v.z), convert_ushort_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(short4 v) {
+  return (ushort4)(convert_ushort_rtn(v.x), convert_ushort_rtn(v.y), convert_ushort_rtn(v.z), convert_ushort_rtn(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(short4 v) {
+  return (char4)(convert_char_rte(v.x), convert_char_rte(v.y), convert_char_rte(v.z), convert_char_rte(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(short4 v) {
+  return (char4)(convert_char_rtz(v.x), convert_char_rtz(v.y), convert_char_rtz(v.z), convert_char_rtz(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(short4 v) {
+  return (char4)(convert_char_rtp(v.x), convert_char_rtp(v.y), convert_char_rtp(v.z), convert_char_rtp(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(short4 v) {
+  return (char4)(convert_char_rtn(v.x), convert_char_rtn(v.y), convert_char_rtn(v.z), convert_char_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(short4 v) {
+  return (uchar4)(convert_uchar_rte(v.x), convert_uchar_rte(v.y), convert_uchar_rte(v.z), convert_uchar_rte(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(short4 v) {
+  return (uchar4)(convert_uchar_rtz(v.x), convert_uchar_rtz(v.y), convert_uchar_rtz(v.z), convert_uchar_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(short4 v) {
+  return (uchar4)(convert_uchar_rtp(v.x), convert_uchar_rtp(v.y), convert_uchar_rtp(v.z), convert_uchar_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(short4 v) {
+  return (uchar4)(convert_uchar_rtn(v.x), convert_uchar_rtn(v.y), convert_uchar_rtn(v.z), convert_uchar_rtn(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(short4 v) {
+  return (float4)(convert_float_rte(v.x), convert_float_rte(v.y), convert_float_rte(v.z), convert_float_rte(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(short4 v) {
+  return (float4)(convert_float_rtz(v.x), convert_float_rtz(v.y), convert_float_rtz(v.z), convert_float_rtz(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(short4 v) {
+  return (float4)(convert_float_rtp(v.x), convert_float_rtp(v.y), convert_float_rtp(v.z), convert_float_rtp(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(short4 v) {
+  return (float4)(convert_float_rtn(v.x), convert_float_rtn(v.y), convert_float_rtn(v.z), convert_float_rtn(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(ushort4 v) {
+  return (long4)(convert_long_rte(v.x), convert_long_rte(v.y), convert_long_rte(v.z), convert_long_rte(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(ushort4 v) {
+  return (long4)(convert_long_rtz(v.x), convert_long_rtz(v.y), convert_long_rtz(v.z), convert_long_rtz(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(ushort4 v) {
+  return (long4)(convert_long_rtp(v.x), convert_long_rtp(v.y), convert_long_rtp(v.z), convert_long_rtp(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(ushort4 v) {
+  return (long4)(convert_long_rtn(v.x), convert_long_rtn(v.y), convert_long_rtn(v.z), convert_long_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(ushort4 v) {
+  return (ulong4)(convert_ulong_rte(v.x), convert_ulong_rte(v.y), convert_ulong_rte(v.z), convert_ulong_rte(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(ushort4 v) {
+  return (ulong4)(convert_ulong_rtz(v.x), convert_ulong_rtz(v.y), convert_ulong_rtz(v.z), convert_ulong_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(ushort4 v) {
+  return (ulong4)(convert_ulong_rtp(v.x), convert_ulong_rtp(v.y), convert_ulong_rtp(v.z), convert_ulong_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(ushort4 v) {
+  return (ulong4)(convert_ulong_rtn(v.x), convert_ulong_rtn(v.y), convert_ulong_rtn(v.z), convert_ulong_rtn(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(ushort4 v) {
+  return (int4)(convert_int_rte(v.x), convert_int_rte(v.y), convert_int_rte(v.z), convert_int_rte(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(ushort4 v) {
+  return (int4)(convert_int_rtz(v.x), convert_int_rtz(v.y), convert_int_rtz(v.z), convert_int_rtz(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(ushort4 v) {
+  return (int4)(convert_int_rtp(v.x), convert_int_rtp(v.y), convert_int_rtp(v.z), convert_int_rtp(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(ushort4 v) {
+  return (int4)(convert_int_rtn(v.x), convert_int_rtn(v.y), convert_int_rtn(v.z), convert_int_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(ushort4 v) {
+  return (uint4)(convert_uint_rte(v.x), convert_uint_rte(v.y), convert_uint_rte(v.z), convert_uint_rte(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(ushort4 v) {
+  return (uint4)(convert_uint_rtz(v.x), convert_uint_rtz(v.y), convert_uint_rtz(v.z), convert_uint_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(ushort4 v) {
+  return (uint4)(convert_uint_rtp(v.x), convert_uint_rtp(v.y), convert_uint_rtp(v.z), convert_uint_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(ushort4 v) {
+  return (uint4)(convert_uint_rtn(v.x), convert_uint_rtn(v.y), convert_uint_rtn(v.z), convert_uint_rtn(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(ushort4 v) {
+  return (short4)(convert_short_rte(v.x), convert_short_rte(v.y), convert_short_rte(v.z), convert_short_rte(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(ushort4 v) {
+  return (short4)(convert_short_rtz(v.x), convert_short_rtz(v.y), convert_short_rtz(v.z), convert_short_rtz(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(ushort4 v) {
+  return (short4)(convert_short_rtp(v.x), convert_short_rtp(v.y), convert_short_rtp(v.z), convert_short_rtp(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(ushort4 v) {
+  return (short4)(convert_short_rtn(v.x), convert_short_rtn(v.y), convert_short_rtn(v.z), convert_short_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(ushort4 v) {
+  return (ushort4)(convert_ushort_rte(v.x), convert_ushort_rte(v.y), convert_ushort_rte(v.z), convert_ushort_rte(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(ushort4 v) {
+  return (ushort4)(convert_ushort_rtz(v.x), convert_ushort_rtz(v.y), convert_ushort_rtz(v.z), convert_ushort_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(ushort4 v) {
+  return (ushort4)(convert_ushort_rtp(v.x), convert_ushort_rtp(v.y), convert_ushort_rtp(v.z), convert_ushort_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(ushort4 v) {
+  return (ushort4)(convert_ushort_rtn(v.x), convert_ushort_rtn(v.y), convert_ushort_rtn(v.z), convert_ushort_rtn(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(ushort4 v) {
+  return (char4)(convert_char_rte(v.x), convert_char_rte(v.y), convert_char_rte(v.z), convert_char_rte(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(ushort4 v) {
+  return (char4)(convert_char_rtz(v.x), convert_char_rtz(v.y), convert_char_rtz(v.z), convert_char_rtz(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(ushort4 v) {
+  return (char4)(convert_char_rtp(v.x), convert_char_rtp(v.y), convert_char_rtp(v.z), convert_char_rtp(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(ushort4 v) {
+  return (char4)(convert_char_rtn(v.x), convert_char_rtn(v.y), convert_char_rtn(v.z), convert_char_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(ushort4 v) {
+  return (uchar4)(convert_uchar_rte(v.x), convert_uchar_rte(v.y), convert_uchar_rte(v.z), convert_uchar_rte(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(ushort4 v) {
+  return (uchar4)(convert_uchar_rtz(v.x), convert_uchar_rtz(v.y), convert_uchar_rtz(v.z), convert_uchar_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(ushort4 v) {
+  return (uchar4)(convert_uchar_rtp(v.x), convert_uchar_rtp(v.y), convert_uchar_rtp(v.z), convert_uchar_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(ushort4 v) {
+  return (uchar4)(convert_uchar_rtn(v.x), convert_uchar_rtn(v.y), convert_uchar_rtn(v.z), convert_uchar_rtn(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(ushort4 v) {
+  return (float4)(convert_float_rte(v.x), convert_float_rte(v.y), convert_float_rte(v.z), convert_float_rte(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(ushort4 v) {
+  return (float4)(convert_float_rtz(v.x), convert_float_rtz(v.y), convert_float_rtz(v.z), convert_float_rtz(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(ushort4 v) {
+  return (float4)(convert_float_rtp(v.x), convert_float_rtp(v.y), convert_float_rtp(v.z), convert_float_rtp(v.w));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(ushort4 v) {
+  return (float4)(convert_float_rtn(v.x), convert_float_rtn(v.y), convert_float_rtn(v.z), convert_float_rtn(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(char4 v) {
+  return (long4)(convert_long_rte(v.x), convert_long_rte(v.y), convert_long_rte(v.z), convert_long_rte(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(char4 v) {
+  return (long4)(convert_long_rtz(v.x), convert_long_rtz(v.y), convert_long_rtz(v.z), convert_long_rtz(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(char4 v) {
+  return (long4)(convert_long_rtp(v.x), convert_long_rtp(v.y), convert_long_rtp(v.z), convert_long_rtp(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(char4 v) {
+  return (long4)(convert_long_rtn(v.x), convert_long_rtn(v.y), convert_long_rtn(v.z), convert_long_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(char4 v) {
+  return (ulong4)(convert_ulong_rte(v.x), convert_ulong_rte(v.y), convert_ulong_rte(v.z), convert_ulong_rte(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(char4 v) {
+  return (ulong4)(convert_ulong_rtz(v.x), convert_ulong_rtz(v.y), convert_ulong_rtz(v.z), convert_ulong_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(char4 v) {
+  return (ulong4)(convert_ulong_rtp(v.x), convert_ulong_rtp(v.y), convert_ulong_rtp(v.z), convert_ulong_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(char4 v) {
+  return (ulong4)(convert_ulong_rtn(v.x), convert_ulong_rtn(v.y), convert_ulong_rtn(v.z), convert_ulong_rtn(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(char4 v) {
+  return (int4)(convert_int_rte(v.x), convert_int_rte(v.y), convert_int_rte(v.z), convert_int_rte(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(char4 v) {
+  return (int4)(convert_int_rtz(v.x), convert_int_rtz(v.y), convert_int_rtz(v.z), convert_int_rtz(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(char4 v) {
+  return (int4)(convert_int_rtp(v.x), convert_int_rtp(v.y), convert_int_rtp(v.z), convert_int_rtp(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(char4 v) {
+  return (int4)(convert_int_rtn(v.x), convert_int_rtn(v.y), convert_int_rtn(v.z), convert_int_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(char4 v) {
+  return (uint4)(convert_uint_rte(v.x), convert_uint_rte(v.y), convert_uint_rte(v.z), convert_uint_rte(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(char4 v) {
+  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(char4 v) {
+  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(char4 v) {
+  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(char4 v) {
+  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(char4 v) {
+  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(char4 v) {
+  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(char4 v) {
+  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(char4 v) {
+  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(char4 v) {
+  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(char4 v) {
+  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(char4 v) {
+  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(char4 v) {
+  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(char4 v) {
+  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(char4 v) {
+  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(char4 v) {
+  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(char4 v) {
+  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(char4 v) {
+  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(char4 v) {
+  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(char4 v) {
+  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(char4 v) {
+  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(char4 v) {
+  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(char4 v) {
+  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(char4 v) {
+  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+/*
+ * uchar4 -> <dst>4 conversions carrying an explicit rounding-mode suffix.
+ *
+ * CONVERT_RM_FROM_UCHAR4(DST, RM) expands to
+ *   INLINE OVERLOADABLE DST4 convert_DST4_RM(uchar4 v)
+ * whose body applies the scalar convert_DST_RM() overload to v.s0..v.s3 and
+ * packs the four results into a DST4 vector literal.  The rounding itself is
+ * whatever the scalar overload (defined elsewhere) implements.
+ *
+ * Both macro arguments are only ever token-pasted, so they are never
+ * macro-expanded themselves.
+ */
+#define CONVERT_RM_FROM_UCHAR4(DST, RM) \
+INLINE OVERLOADABLE DST##4 convert_##DST##4_##RM(uchar4 v) { \
+  return (DST##4)(convert_##DST##_##RM(v.s0), convert_##DST##_##RM(v.s1), \
+                  convert_##DST##_##RM(v.s2), convert_##DST##_##RM(v.s3)); \
+}
+
+CONVERT_RM_FROM_UCHAR4(long, rte)
+CONVERT_RM_FROM_UCHAR4(long, rtz)
+CONVERT_RM_FROM_UCHAR4(long, rtp)
+CONVERT_RM_FROM_UCHAR4(long, rtn)
+
+CONVERT_RM_FROM_UCHAR4(ulong, rte)
+CONVERT_RM_FROM_UCHAR4(ulong, rtz)
+CONVERT_RM_FROM_UCHAR4(ulong, rtp)
+CONVERT_RM_FROM_UCHAR4(ulong, rtn)
+
+CONVERT_RM_FROM_UCHAR4(int, rte)
+CONVERT_RM_FROM_UCHAR4(int, rtz)
+CONVERT_RM_FROM_UCHAR4(int, rtp)
+CONVERT_RM_FROM_UCHAR4(int, rtn)
+
+CONVERT_RM_FROM_UCHAR4(uint, rte)
+CONVERT_RM_FROM_UCHAR4(uint, rtz)
+CONVERT_RM_FROM_UCHAR4(uint, rtp)
+CONVERT_RM_FROM_UCHAR4(uint, rtn)
+
+CONVERT_RM_FROM_UCHAR4(short, rte)
+CONVERT_RM_FROM_UCHAR4(short, rtz)
+CONVERT_RM_FROM_UCHAR4(short, rtp)
+CONVERT_RM_FROM_UCHAR4(short, rtn)
+
+CONVERT_RM_FROM_UCHAR4(ushort, rte)
+CONVERT_RM_FROM_UCHAR4(ushort, rtz)
+CONVERT_RM_FROM_UCHAR4(ushort, rtp)
+CONVERT_RM_FROM_UCHAR4(ushort, rtn)
+
+CONVERT_RM_FROM_UCHAR4(char, rte)
+CONVERT_RM_FROM_UCHAR4(char, rtz)
+CONVERT_RM_FROM_UCHAR4(char, rtp)
+CONVERT_RM_FROM_UCHAR4(char, rtn)
+
+CONVERT_RM_FROM_UCHAR4(uchar, rte)
+CONVERT_RM_FROM_UCHAR4(uchar, rtz)
+CONVERT_RM_FROM_UCHAR4(uchar, rtp)
+CONVERT_RM_FROM_UCHAR4(uchar, rtn)
+
+CONVERT_RM_FROM_UCHAR4(float, rte)
+CONVERT_RM_FROM_UCHAR4(float, rtz)
+CONVERT_RM_FROM_UCHAR4(float, rtp)
+CONVERT_RM_FROM_UCHAR4(float, rtn)
+
+#undef CONVERT_RM_FROM_UCHAR4
+
+/*
+ * float4 -> <dst>4 conversions carrying an explicit rounding-mode suffix.
+ *
+ * CONVERT_RM_FROM_FLOAT4(DST, RM) expands to
+ *   INLINE OVERLOADABLE DST4 convert_DST4_RM(float4 v)
+ * whose body applies the scalar convert_DST_RM() overload to v.s0..v.s3 and
+ * packs the four results into a DST4 vector literal.  The rounding itself is
+ * whatever the scalar overload (defined elsewhere) implements.
+ *
+ * Both macro arguments are only ever token-pasted, so they are never
+ * macro-expanded themselves.
+ */
+#define CONVERT_RM_FROM_FLOAT4(DST, RM) \
+INLINE OVERLOADABLE DST##4 convert_##DST##4_##RM(float4 v) { \
+  return (DST##4)(convert_##DST##_##RM(v.s0), convert_##DST##_##RM(v.s1), \
+                  convert_##DST##_##RM(v.s2), convert_##DST##_##RM(v.s3)); \
+}
+
+CONVERT_RM_FROM_FLOAT4(long, rte)
+CONVERT_RM_FROM_FLOAT4(long, rtz)
+CONVERT_RM_FROM_FLOAT4(long, rtp)
+CONVERT_RM_FROM_FLOAT4(long, rtn)
+
+CONVERT_RM_FROM_FLOAT4(ulong, rte)
+CONVERT_RM_FROM_FLOAT4(ulong, rtz)
+CONVERT_RM_FROM_FLOAT4(ulong, rtp)
+CONVERT_RM_FROM_FLOAT4(ulong, rtn)
+
+CONVERT_RM_FROM_FLOAT4(int, rte)
+CONVERT_RM_FROM_FLOAT4(int, rtz)
+CONVERT_RM_FROM_FLOAT4(int, rtp)
+CONVERT_RM_FROM_FLOAT4(int, rtn)
+
+CONVERT_RM_FROM_FLOAT4(uint, rte)
+CONVERT_RM_FROM_FLOAT4(uint, rtz)
+CONVERT_RM_FROM_FLOAT4(uint, rtp)
+CONVERT_RM_FROM_FLOAT4(uint, rtn)
+
+CONVERT_RM_FROM_FLOAT4(short, rte)
+CONVERT_RM_FROM_FLOAT4(short, rtz)
+CONVERT_RM_FROM_FLOAT4(short, rtp)
+CONVERT_RM_FROM_FLOAT4(short, rtn)
+
+CONVERT_RM_FROM_FLOAT4(ushort, rte)
+CONVERT_RM_FROM_FLOAT4(ushort, rtz)
+CONVERT_RM_FROM_FLOAT4(ushort, rtp)
+CONVERT_RM_FROM_FLOAT4(ushort, rtn)
+
+CONVERT_RM_FROM_FLOAT4(char, rte)
+CONVERT_RM_FROM_FLOAT4(char, rtz)
+CONVERT_RM_FROM_FLOAT4(char, rtp)
+CONVERT_RM_FROM_FLOAT4(char, rtn)
+
+CONVERT_RM_FROM_FLOAT4(uchar, rte)
+CONVERT_RM_FROM_FLOAT4(uchar, rtz)
+CONVERT_RM_FROM_FLOAT4(uchar, rtp)
+CONVERT_RM_FROM_FLOAT4(uchar, rtn)
+
+CONVERT_RM_FROM_FLOAT4(float, rte)
+CONVERT_RM_FROM_FLOAT4(float, rtz)
+CONVERT_RM_FROM_FLOAT4(float, rtp)
+CONVERT_RM_FROM_FLOAT4(float, rtn)
+
+#undef CONVERT_RM_FROM_FLOAT4
+
+/*
+ * long8 -> <dst>8 conversions carrying an explicit rounding-mode suffix.
+ *
+ * CONVERT_RM_FROM_LONG8(DST, RM) expands to
+ *   INLINE OVERLOADABLE DST8 convert_DST8_RM(long8 v)
+ * whose body applies the scalar convert_DST_RM() overload to v.s0..v.s7 and
+ * packs the eight results into a DST8 vector literal.  The rounding itself
+ * is whatever the scalar overload (defined elsewhere) implements.
+ *
+ * Both macro arguments are only ever token-pasted, so they are never
+ * macro-expanded themselves.
+ */
+#define CONVERT_RM_FROM_LONG8(DST, RM) \
+INLINE OVERLOADABLE DST##8 convert_##DST##8_##RM(long8 v) { \
+  return (DST##8)(convert_##DST##_##RM(v.s0), convert_##DST##_##RM(v.s1), \
+                  convert_##DST##_##RM(v.s2), convert_##DST##_##RM(v.s3), \
+                  convert_##DST##_##RM(v.s4), convert_##DST##_##RM(v.s5), \
+                  convert_##DST##_##RM(v.s6), convert_##DST##_##RM(v.s7)); \
+}
+
+CONVERT_RM_FROM_LONG8(long, rte)
+CONVERT_RM_FROM_LONG8(long, rtz)
+CONVERT_RM_FROM_LONG8(long, rtp)
+CONVERT_RM_FROM_LONG8(long, rtn)
+
+CONVERT_RM_FROM_LONG8(ulong, rte)
+CONVERT_RM_FROM_LONG8(ulong, rtz)
+CONVERT_RM_FROM_LONG8(ulong, rtp)
+CONVERT_RM_FROM_LONG8(ulong, rtn)
+
+CONVERT_RM_FROM_LONG8(int, rte)
+CONVERT_RM_FROM_LONG8(int, rtz)
+CONVERT_RM_FROM_LONG8(int, rtp)
+CONVERT_RM_FROM_LONG8(int, rtn)
+
+CONVERT_RM_FROM_LONG8(uint, rte)
+CONVERT_RM_FROM_LONG8(uint, rtz)
+CONVERT_RM_FROM_LONG8(uint, rtp)
+CONVERT_RM_FROM_LONG8(uint, rtn)
+
+CONVERT_RM_FROM_LONG8(short, rte)
+CONVERT_RM_FROM_LONG8(short, rtz)
+CONVERT_RM_FROM_LONG8(short, rtp)
+CONVERT_RM_FROM_LONG8(short, rtn)
+
+CONVERT_RM_FROM_LONG8(ushort, rte)
+CONVERT_RM_FROM_LONG8(ushort, rtz)
+CONVERT_RM_FROM_LONG8(ushort, rtp)
+CONVERT_RM_FROM_LONG8(ushort, rtn)
+
+CONVERT_RM_FROM_LONG8(char, rte)
+CONVERT_RM_FROM_LONG8(char, rtz)
+CONVERT_RM_FROM_LONG8(char, rtp)
+CONVERT_RM_FROM_LONG8(char, rtn)
+
+CONVERT_RM_FROM_LONG8(uchar, rte)
+CONVERT_RM_FROM_LONG8(uchar, rtz)
+CONVERT_RM_FROM_LONG8(uchar, rtp)
+CONVERT_RM_FROM_LONG8(uchar, rtn)
+
+CONVERT_RM_FROM_LONG8(float, rte)
+CONVERT_RM_FROM_LONG8(float, rtz)
+CONVERT_RM_FROM_LONG8(float, rtp)
+CONVERT_RM_FROM_LONG8(float, rtn)
+
+#undef CONVERT_RM_FROM_LONG8
+
+INLINE OVERLOADABLE long8 convert_long8_rte(ulong8 v) {
+  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(ulong8 v) {
+  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(ulong8 v) {
+  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(ulong8 v) {
+  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(ulong8 v) {
+  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(ulong8 v) {
+  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(ulong8 v) {
+  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(ulong8 v) {
+  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(ulong8 v) {
+  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(ulong8 v) {
+  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(ulong8 v) {
+  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(ulong8 v) {
+  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(ulong8 v) {
+  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(ulong8 v) {
+  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(ulong8 v) {
+  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(ulong8 v) {
+  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(ulong8 v) {
+  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(ulong8 v) { /* ulong8 -> short8, lane by lane: result.sN = convert_short_rtz(v.sN) for N = 0..7. */
+  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(ulong8 v) { /* ulong8 -> short8, lane by lane: result.sN = convert_short_rtp(v.sN) for N = 0..7. */
+  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(ulong8 v) { /* ulong8 -> short8, lane by lane: result.sN = convert_short_rtn(v.sN) for N = 0..7. */
+  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(ulong8 v) { /* ulong8 -> ushort8, lane by lane: result.sN = convert_ushort_rte(v.sN) for N = 0..7. */
+  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(ulong8 v) { /* ulong8 -> ushort8, lane by lane: result.sN = convert_ushort_rtz(v.sN) for N = 0..7. */
+  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(ulong8 v) { /* ulong8 -> ushort8, lane by lane: result.sN = convert_ushort_rtp(v.sN) for N = 0..7. */
+  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(ulong8 v) { /* ulong8 -> ushort8, lane by lane: result.sN = convert_ushort_rtn(v.sN) for N = 0..7. */
+  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(ulong8 v) { /* ulong8 -> char8, lane by lane: result.sN = convert_char_rte(v.sN) for N = 0..7. */
+  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(ulong8 v) { /* ulong8 -> char8, lane by lane: result.sN = convert_char_rtz(v.sN) for N = 0..7. */
+  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(ulong8 v) { /* ulong8 -> char8, lane by lane: result.sN = convert_char_rtp(v.sN) for N = 0..7. */
+  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(ulong8 v) { /* ulong8 -> char8, lane by lane: result.sN = convert_char_rtn(v.sN) for N = 0..7. */
+  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(ulong8 v) { /* ulong8 -> uchar8, lane by lane: result.sN = convert_uchar_rte(v.sN) for N = 0..7. */
+  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(ulong8 v) { /* ulong8 -> uchar8, lane by lane: result.sN = convert_uchar_rtz(v.sN) for N = 0..7. */
+  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(ulong8 v) { /* ulong8 -> uchar8, lane by lane: result.sN = convert_uchar_rtp(v.sN) for N = 0..7. */
+  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(ulong8 v) { /* ulong8 -> uchar8, lane by lane: result.sN = convert_uchar_rtn(v.sN) for N = 0..7. */
+  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(ulong8 v) { /* ulong8 -> float8, lane by lane: result.sN = convert_float_rte(v.sN) for N = 0..7. */
+  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(ulong8 v) { /* ulong8 -> float8, lane by lane: result.sN = convert_float_rtz(v.sN) for N = 0..7. */
+  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(ulong8 v) { /* ulong8 -> float8, lane by lane: result.sN = convert_float_rtp(v.sN) for N = 0..7. */
+  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(ulong8 v) { /* ulong8 -> float8, lane by lane: result.sN = convert_float_rtn(v.sN) for N = 0..7. */
+  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(int8 v) { /* int8 -> long8, lane by lane: result.sN = convert_long_rte(v.sN) for N = 0..7. */
+  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(int8 v) { /* int8 -> long8, lane by lane: result.sN = convert_long_rtz(v.sN) for N = 0..7. */
+  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(int8 v) { /* int8 -> long8, lane by lane: result.sN = convert_long_rtp(v.sN) for N = 0..7. */
+  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(int8 v) { /* int8 -> long8, lane by lane: result.sN = convert_long_rtn(v.sN) for N = 0..7. */
+  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(int8 v) { /* int8 -> ulong8, lane by lane: result.sN = convert_ulong_rte(v.sN) for N = 0..7. */
+  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(int8 v) { /* int8 -> ulong8, lane by lane: result.sN = convert_ulong_rtz(v.sN) for N = 0..7. */
+  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(int8 v) { /* int8 -> ulong8, lane by lane: result.sN = convert_ulong_rtp(v.sN) for N = 0..7. */
+  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(int8 v) { /* int8 -> ulong8, lane by lane: result.sN = convert_ulong_rtn(v.sN) for N = 0..7. */
+  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(int8 v) { /* int8 -> int8, lane by lane: result.sN = convert_int_rte(v.sN) for N = 0..7. */
+  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(int8 v) { /* int8 -> int8, lane by lane: result.sN = convert_int_rtz(v.sN) for N = 0..7. */
+  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(int8 v) { /* int8 -> int8, lane by lane: result.sN = convert_int_rtp(v.sN) for N = 0..7. */
+  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(int8 v) { /* int8 -> int8, lane by lane: result.sN = convert_int_rtn(v.sN) for N = 0..7. */
+  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(int8 v) { /* int8 -> uint8, lane by lane: result.sN = convert_uint_rte(v.sN) for N = 0..7. */
+  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(int8 v) { /* int8 -> uint8, lane by lane: result.sN = convert_uint_rtz(v.sN) for N = 0..7. */
+  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(int8 v) { /* int8 -> uint8, lane by lane: result.sN = convert_uint_rtp(v.sN) for N = 0..7. */
+  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(int8 v) { /* int8 -> uint8, lane by lane: result.sN = convert_uint_rtn(v.sN) for N = 0..7. */
+  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(int8 v) { /* int8 -> short8, lane by lane: result.sN = convert_short_rte(v.sN) for N = 0..7. */
+  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(int8 v) { /* int8 -> short8, lane by lane: result.sN = convert_short_rtz(v.sN) for N = 0..7. */
+  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(int8 v) { /* int8 -> short8, lane by lane: result.sN = convert_short_rtp(v.sN) for N = 0..7. */
+  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(int8 v) { /* int8 -> short8, lane by lane: result.sN = convert_short_rtn(v.sN) for N = 0..7. */
+  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(int8 v) { /* int8 -> ushort8, lane by lane: result.sN = convert_ushort_rte(v.sN) for N = 0..7. */
+  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(int8 v) { /* int8 -> ushort8, lane by lane: result.sN = convert_ushort_rtz(v.sN) for N = 0..7. */
+  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(int8 v) { /* int8 -> ushort8, lane by lane: result.sN = convert_ushort_rtp(v.sN) for N = 0..7. */
+  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(int8 v) { /* int8 -> ushort8, lane by lane: result.sN = convert_ushort_rtn(v.sN) for N = 0..7. */
+  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(int8 v) { /* int8 -> char8, lane by lane: result.sN = convert_char_rte(v.sN) for N = 0..7. */
+  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(int8 v) { /* int8 -> char8, lane by lane: result.sN = convert_char_rtz(v.sN) for N = 0..7. */
+  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(int8 v) { /* int8 -> char8, lane by lane: result.sN = convert_char_rtp(v.sN) for N = 0..7. */
+  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(int8 v) { /* int8 -> char8, lane by lane: result.sN = convert_char_rtn(v.sN) for N = 0..7. */
+  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(int8 v) { /* int8 -> uchar8, lane by lane: result.sN = convert_uchar_rte(v.sN) for N = 0..7. */
+  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(int8 v) { /* int8 -> uchar8, lane by lane: result.sN = convert_uchar_rtz(v.sN) for N = 0..7. */
+  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(int8 v) { /* int8 -> uchar8, lane by lane: result.sN = convert_uchar_rtp(v.sN) for N = 0..7. */
+  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(int8 v) { /* int8 -> uchar8, lane by lane: result.sN = convert_uchar_rtn(v.sN) for N = 0..7. */
+  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(int8 v) { /* int8 -> float8, lane by lane: result.sN = convert_float_rte(v.sN) for N = 0..7. */
+  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(int8 v) { /* int8 -> float8, lane by lane: result.sN = convert_float_rtz(v.sN) for N = 0..7. */
+  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(int8 v) { /* int8 -> float8, lane by lane: result.sN = convert_float_rtp(v.sN) for N = 0..7. */
+  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(int8 v) { /* int8 -> float8, lane by lane: result.sN = convert_float_rtn(v.sN) for N = 0..7. */
+  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(uint8 v) { /* uint8 -> long8, lane by lane: result.sN = convert_long_rte(v.sN) for N = 0..7. */
+  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(uint8 v) { /* uint8 -> long8, lane by lane: result.sN = convert_long_rtz(v.sN) for N = 0..7. */
+  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(uint8 v) { /* uint8 -> long8, lane by lane: result.sN = convert_long_rtp(v.sN) for N = 0..7. */
+  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(uint8 v) { /* uint8 -> long8, lane by lane: result.sN = convert_long_rtn(v.sN) for N = 0..7. */
+  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(uint8 v) { /* uint8 -> ulong8, lane by lane: result.sN = convert_ulong_rte(v.sN) for N = 0..7. */
+  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(uint8 v) { /* uint8 -> ulong8, lane by lane: result.sN = convert_ulong_rtz(v.sN) for N = 0..7. */
+  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(uint8 v) { /* uint8 -> ulong8, lane by lane: result.sN = convert_ulong_rtp(v.sN) for N = 0..7. */
+  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(uint8 v) { /* uint8 -> ulong8, lane by lane: result.sN = convert_ulong_rtn(v.sN) for N = 0..7. */
+  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(uint8 v) { /* uint8 -> int8, lane by lane: result.sN = convert_int_rte(v.sN) for N = 0..7. */
+  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(uint8 v) { /* uint8 -> int8, lane by lane: result.sN = convert_int_rtz(v.sN) for N = 0..7. */
+  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(uint8 v) { /* uint8 -> int8, lane by lane: result.sN = convert_int_rtp(v.sN) for N = 0..7. */
+  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(uint8 v) { /* uint8 -> int8, lane by lane: result.sN = convert_int_rtn(v.sN) for N = 0..7. */
+  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(uint8 v) { /* uint8 -> uint8, lane by lane: result.sN = convert_uint_rte(v.sN) for N = 0..7. */
+  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(uint8 v) { /* uint8 -> uint8, lane by lane: result.sN = convert_uint_rtz(v.sN) for N = 0..7. */
+  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(uint8 v) { /* uint8 -> uint8, lane by lane: result.sN = convert_uint_rtp(v.sN) for N = 0..7. */
+  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(uint8 v) { /* uint8 -> uint8, lane by lane: result.sN = convert_uint_rtn(v.sN) for N = 0..7. */
+  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(uint8 v) { /* uint8 -> short8, lane by lane: result.sN = convert_short_rte(v.sN) for N = 0..7. */
+  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(uint8 v) { /* uint8 -> short8, lane by lane: result.sN = convert_short_rtz(v.sN) for N = 0..7. */
+  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(uint8 v) { /* uint8 -> short8, lane by lane: result.sN = convert_short_rtp(v.sN) for N = 0..7. */
+  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(uint8 v) { /* uint8 -> short8, lane by lane: result.sN = convert_short_rtn(v.sN) for N = 0..7. */
+  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(uint8 v) { /* uint8 -> ushort8, lane by lane: result.sN = convert_ushort_rte(v.sN) for N = 0..7. */
+  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(uint8 v) { /* uint8 -> ushort8, lane by lane: result.sN = convert_ushort_rtz(v.sN) for N = 0..7. */
+  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(uint8 v) { /* uint8 -> ushort8, lane by lane: result.sN = convert_ushort_rtp(v.sN) for N = 0..7. */
+  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(uint8 v) { /* uint8 -> ushort8, lane by lane: result.sN = convert_ushort_rtn(v.sN) for N = 0..7. */
+  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(uint8 v) { /* uint8 -> char8, lane by lane: result.sN = convert_char_rte(v.sN) for N = 0..7. */
+  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(uint8 v) { /* uint8 -> char8, lane by lane: result.sN = convert_char_rtz(v.sN) for N = 0..7. */
+  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(uint8 v) { /* uint8 -> char8, lane by lane: result.sN = convert_char_rtp(v.sN) for N = 0..7. */
+  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(uint8 v) { /* uint8 -> char8, lane by lane: result.sN = convert_char_rtn(v.sN) for N = 0..7. */
+  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(uint8 v) { /* uint8 -> uchar8, lane by lane: result.sN = convert_uchar_rte(v.sN) for N = 0..7. */
+  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(uint8 v) { /* uint8 -> uchar8, lane by lane: result.sN = convert_uchar_rtz(v.sN) for N = 0..7. */
+  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(uint8 v) { /* uint8 -> uchar8, lane by lane: result.sN = convert_uchar_rtp(v.sN) for N = 0..7. */
+  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(uint8 v) { /* uint8 -> uchar8, lane by lane: result.sN = convert_uchar_rtn(v.sN) for N = 0..7. */
+  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(uint8 v) { /* uint8 -> float8, lane by lane: result.sN = convert_float_rte(v.sN) for N = 0..7. */
+  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(uint8 v) { /* uint8 -> float8, lane by lane: result.sN = convert_float_rtz(v.sN) for N = 0..7. */
+  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(uint8 v) { /* uint8 -> float8, lane by lane: result.sN = convert_float_rtp(v.sN) for N = 0..7. */
+  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(uint8 v) { /* uint8 -> float8, lane by lane: result.sN = convert_float_rtn(v.sN) for N = 0..7. */
+  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(short8 v) { /* short8 -> long8, lane by lane: result.sN = convert_long_rte(v.sN) for N = 0..7. */
+  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(short8 v) { /* short8 -> long8, lane by lane: result.sN = convert_long_rtz(v.sN) for N = 0..7. */
+  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(short8 v) { /* short8 -> long8, lane by lane: result.sN = convert_long_rtp(v.sN) for N = 0..7. */
+  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(short8 v) { /* short8 -> long8, lane by lane: result.sN = convert_long_rtn(v.sN) for N = 0..7. */
+  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(short8 v) { /* short8 -> ulong8, lane by lane: result.sN = convert_ulong_rte(v.sN) for N = 0..7. */
+  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(short8 v) { /* short8 -> ulong8, lane by lane: result.sN = convert_ulong_rtz(v.sN) for N = 0..7. */
+  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(short8 v) { /* short8 -> ulong8, lane by lane: result.sN = convert_ulong_rtp(v.sN) for N = 0..7. */
+  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(short8 v) { /* short8 -> ulong8, lane by lane: result.sN = convert_ulong_rtn(v.sN) for N = 0..7. */
+  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(short8 v) { /* short8 -> int8, lane by lane: result.sN = convert_int_rte(v.sN) for N = 0..7. */
+  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(short8 v) { /* short8 -> int8, lane by lane: result.sN = convert_int_rtz(v.sN) for N = 0..7. */
+  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(short8 v) { /* short8 -> int8, lane by lane: result.sN = convert_int_rtp(v.sN) for N = 0..7. */
+  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(short8 v) { /* short8 -> int8, lane by lane: result.sN = convert_int_rtn(v.sN) for N = 0..7. */
+  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(short8 v) { /* short8 -> uint8, lane by lane: result.sN = convert_uint_rte(v.sN) for N = 0..7. */
+  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(short8 v) { /* short8 -> uint8, lane by lane: result.sN = convert_uint_rtz(v.sN) for N = 0..7. */
+  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(short8 v) { /* short8 -> uint8, lane by lane: result.sN = convert_uint_rtp(v.sN) for N = 0..7. */
+  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(short8 v) { /* short8 -> uint8, lane by lane: result.sN = convert_uint_rtn(v.sN) for N = 0..7. */
+  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(short8 v) { /* short8 -> short8, lane by lane: result.sN = convert_short_rte(v.sN) for N = 0..7. */
+  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(short8 v) { /* short8 -> short8, lane by lane: result.sN = convert_short_rtz(v.sN) for N = 0..7. */
+  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(short8 v) { /* short8 -> short8, lane by lane: result.sN = convert_short_rtp(v.sN) for N = 0..7. */
+  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(short8 v) { /* short8 -> short8, lane by lane: result.sN = convert_short_rtn(v.sN) for N = 0..7. */
+  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(short8 v) { /* short8 -> ushort8, lane by lane: result.sN = convert_ushort_rte(v.sN) for N = 0..7. */
+  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(short8 v) { /* short8 -> ushort8, lane by lane: result.sN = convert_ushort_rtz(v.sN) for N = 0..7. */
+  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(short8 v) { /* short8 -> ushort8, lane by lane: result.sN = convert_ushort_rtp(v.sN) for N = 0..7. */
+  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(short8 v) { /* short8 -> ushort8, lane by lane: result.sN = convert_ushort_rtn(v.sN) for N = 0..7. */
+  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(short8 v) { /* short8 -> char8, lane by lane: result.sN = convert_char_rte(v.sN) for N = 0..7. */
+  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(short8 v) {
+  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(short8 v) {
+  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(short8 v) {
+  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(short8 v) {
+  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(short8 v) {
+  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(short8 v) {
+  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(short8 v) {
+  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(short8 v) {
+  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(short8 v) {
+  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(short8 v) {
+  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(short8 v) {
+  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+/*
+ * 8-wide conversions whose source vector is ushort8.
+ *
+ * Each function generated below takes a ushort8, passes every lane
+ * (v.s0 .. v.s7) through the per-lane function named by SCALAR and packs the
+ * eight results into a DST8 vector.  The _rte/_rtz/_rtp/_rtn suffix is simply
+ * carried through to the per-lane call, so the rounding behaviour is whatever
+ * that per-lane function implements.
+ *
+ *   DST8   - destination vector type (e.g. long8)
+ *   NAME8  - name of the vector overload being defined
+ *   SCALAR - conversion applied to each individual lane
+ */
+#define DEF_CONVERT8_FROM_USHORT8(DST8, NAME8, SCALAR) \
+INLINE OVERLOADABLE DST8 NAME8(ushort8 v) { \
+  return (DST8)(SCALAR(v.s0), SCALAR(v.s1), SCALAR(v.s2), SCALAR(v.s3), \
+                SCALAR(v.s4), SCALAR(v.s5), SCALAR(v.s6), SCALAR(v.s7)); \
+}
+
+DEF_CONVERT8_FROM_USHORT8(long8, convert_long8_rte, convert_long_rte)
+DEF_CONVERT8_FROM_USHORT8(long8, convert_long8_rtz, convert_long_rtz)
+DEF_CONVERT8_FROM_USHORT8(long8, convert_long8_rtp, convert_long_rtp)
+DEF_CONVERT8_FROM_USHORT8(long8, convert_long8_rtn, convert_long_rtn)
+
+DEF_CONVERT8_FROM_USHORT8(ulong8, convert_ulong8_rte, convert_ulong_rte)
+DEF_CONVERT8_FROM_USHORT8(ulong8, convert_ulong8_rtz, convert_ulong_rtz)
+DEF_CONVERT8_FROM_USHORT8(ulong8, convert_ulong8_rtp, convert_ulong_rtp)
+DEF_CONVERT8_FROM_USHORT8(ulong8, convert_ulong8_rtn, convert_ulong_rtn)
+
+DEF_CONVERT8_FROM_USHORT8(int8, convert_int8_rte, convert_int_rte)
+DEF_CONVERT8_FROM_USHORT8(int8, convert_int8_rtz, convert_int_rtz)
+DEF_CONVERT8_FROM_USHORT8(int8, convert_int8_rtp, convert_int_rtp)
+DEF_CONVERT8_FROM_USHORT8(int8, convert_int8_rtn, convert_int_rtn)
+
+DEF_CONVERT8_FROM_USHORT8(uint8, convert_uint8_rte, convert_uint_rte)
+DEF_CONVERT8_FROM_USHORT8(uint8, convert_uint8_rtz, convert_uint_rtz)
+DEF_CONVERT8_FROM_USHORT8(uint8, convert_uint8_rtp, convert_uint_rtp)
+DEF_CONVERT8_FROM_USHORT8(uint8, convert_uint8_rtn, convert_uint_rtn)
+
+DEF_CONVERT8_FROM_USHORT8(short8, convert_short8_rte, convert_short_rte)
+DEF_CONVERT8_FROM_USHORT8(short8, convert_short8_rtz, convert_short_rtz)
+DEF_CONVERT8_FROM_USHORT8(short8, convert_short8_rtp, convert_short_rtp)
+DEF_CONVERT8_FROM_USHORT8(short8, convert_short8_rtn, convert_short_rtn)
+
+DEF_CONVERT8_FROM_USHORT8(ushort8, convert_ushort8_rte, convert_ushort_rte)
+DEF_CONVERT8_FROM_USHORT8(ushort8, convert_ushort8_rtz, convert_ushort_rtz)
+DEF_CONVERT8_FROM_USHORT8(ushort8, convert_ushort8_rtp, convert_ushort_rtp)
+DEF_CONVERT8_FROM_USHORT8(ushort8, convert_ushort8_rtn, convert_ushort_rtn)
+
+DEF_CONVERT8_FROM_USHORT8(char8, convert_char8_rte, convert_char_rte)
+DEF_CONVERT8_FROM_USHORT8(char8, convert_char8_rtz, convert_char_rtz)
+DEF_CONVERT8_FROM_USHORT8(char8, convert_char8_rtp, convert_char_rtp)
+DEF_CONVERT8_FROM_USHORT8(char8, convert_char8_rtn, convert_char_rtn)
+
+DEF_CONVERT8_FROM_USHORT8(uchar8, convert_uchar8_rte, convert_uchar_rte)
+DEF_CONVERT8_FROM_USHORT8(uchar8, convert_uchar8_rtz, convert_uchar_rtz)
+DEF_CONVERT8_FROM_USHORT8(uchar8, convert_uchar8_rtp, convert_uchar_rtp)
+DEF_CONVERT8_FROM_USHORT8(uchar8, convert_uchar8_rtn, convert_uchar_rtn)
+
+DEF_CONVERT8_FROM_USHORT8(float8, convert_float8_rte, convert_float_rte)
+DEF_CONVERT8_FROM_USHORT8(float8, convert_float8_rtz, convert_float_rtz)
+DEF_CONVERT8_FROM_USHORT8(float8, convert_float8_rtp, convert_float_rtp)
+DEF_CONVERT8_FROM_USHORT8(float8, convert_float8_rtn, convert_float_rtn)
+
+#undef DEF_CONVERT8_FROM_USHORT8
+
+/*
+ * 8-wide conversions whose source vector is char8.
+ *
+ * Each function generated below takes a char8, passes every lane
+ * (v.s0 .. v.s7) through the per-lane function named by SCALAR and packs the
+ * eight results into a DST8 vector.  The _rte/_rtz/_rtp/_rtn suffix is simply
+ * carried through to the per-lane call, so the rounding behaviour is whatever
+ * that per-lane function implements.
+ *
+ *   DST8   - destination vector type (e.g. long8)
+ *   NAME8  - name of the vector overload being defined
+ *   SCALAR - conversion applied to each individual lane
+ */
+#define DEF_CONVERT8_FROM_CHAR8(DST8, NAME8, SCALAR) \
+INLINE OVERLOADABLE DST8 NAME8(char8 v) { \
+  return (DST8)(SCALAR(v.s0), SCALAR(v.s1), SCALAR(v.s2), SCALAR(v.s3), \
+                SCALAR(v.s4), SCALAR(v.s5), SCALAR(v.s6), SCALAR(v.s7)); \
+}
+
+DEF_CONVERT8_FROM_CHAR8(long8, convert_long8_rte, convert_long_rte)
+DEF_CONVERT8_FROM_CHAR8(long8, convert_long8_rtz, convert_long_rtz)
+DEF_CONVERT8_FROM_CHAR8(long8, convert_long8_rtp, convert_long_rtp)
+DEF_CONVERT8_FROM_CHAR8(long8, convert_long8_rtn, convert_long_rtn)
+
+DEF_CONVERT8_FROM_CHAR8(ulong8, convert_ulong8_rte, convert_ulong_rte)
+DEF_CONVERT8_FROM_CHAR8(ulong8, convert_ulong8_rtz, convert_ulong_rtz)
+DEF_CONVERT8_FROM_CHAR8(ulong8, convert_ulong8_rtp, convert_ulong_rtp)
+DEF_CONVERT8_FROM_CHAR8(ulong8, convert_ulong8_rtn, convert_ulong_rtn)
+
+DEF_CONVERT8_FROM_CHAR8(int8, convert_int8_rte, convert_int_rte)
+DEF_CONVERT8_FROM_CHAR8(int8, convert_int8_rtz, convert_int_rtz)
+DEF_CONVERT8_FROM_CHAR8(int8, convert_int8_rtp, convert_int_rtp)
+DEF_CONVERT8_FROM_CHAR8(int8, convert_int8_rtn, convert_int_rtn)
+
+DEF_CONVERT8_FROM_CHAR8(uint8, convert_uint8_rte, convert_uint_rte)
+DEF_CONVERT8_FROM_CHAR8(uint8, convert_uint8_rtz, convert_uint_rtz)
+DEF_CONVERT8_FROM_CHAR8(uint8, convert_uint8_rtp, convert_uint_rtp)
+DEF_CONVERT8_FROM_CHAR8(uint8, convert_uint8_rtn, convert_uint_rtn)
+
+DEF_CONVERT8_FROM_CHAR8(short8, convert_short8_rte, convert_short_rte)
+DEF_CONVERT8_FROM_CHAR8(short8, convert_short8_rtz, convert_short_rtz)
+DEF_CONVERT8_FROM_CHAR8(short8, convert_short8_rtp, convert_short_rtp)
+DEF_CONVERT8_FROM_CHAR8(short8, convert_short8_rtn, convert_short_rtn)
+
+DEF_CONVERT8_FROM_CHAR8(ushort8, convert_ushort8_rte, convert_ushort_rte)
+DEF_CONVERT8_FROM_CHAR8(ushort8, convert_ushort8_rtz, convert_ushort_rtz)
+DEF_CONVERT8_FROM_CHAR8(ushort8, convert_ushort8_rtp, convert_ushort_rtp)
+DEF_CONVERT8_FROM_CHAR8(ushort8, convert_ushort8_rtn, convert_ushort_rtn)
+
+DEF_CONVERT8_FROM_CHAR8(char8, convert_char8_rte, convert_char_rte)
+DEF_CONVERT8_FROM_CHAR8(char8, convert_char8_rtz, convert_char_rtz)
+DEF_CONVERT8_FROM_CHAR8(char8, convert_char8_rtp, convert_char_rtp)
+DEF_CONVERT8_FROM_CHAR8(char8, convert_char8_rtn, convert_char_rtn)
+
+DEF_CONVERT8_FROM_CHAR8(uchar8, convert_uchar8_rte, convert_uchar_rte)
+DEF_CONVERT8_FROM_CHAR8(uchar8, convert_uchar8_rtz, convert_uchar_rtz)
+DEF_CONVERT8_FROM_CHAR8(uchar8, convert_uchar8_rtp, convert_uchar_rtp)
+DEF_CONVERT8_FROM_CHAR8(uchar8, convert_uchar8_rtn, convert_uchar_rtn)
+
+DEF_CONVERT8_FROM_CHAR8(float8, convert_float8_rte, convert_float_rte)
+DEF_CONVERT8_FROM_CHAR8(float8, convert_float8_rtz, convert_float_rtz)
+DEF_CONVERT8_FROM_CHAR8(float8, convert_float8_rtp, convert_float_rtp)
+DEF_CONVERT8_FROM_CHAR8(float8, convert_float8_rtn, convert_float_rtn)
+
+#undef DEF_CONVERT8_FROM_CHAR8
+
+INLINE OVERLOADABLE long8 convert_long8_rte(uchar8 v) {
+  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(uchar8 v) {
+  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(uchar8 v) {
+  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(uchar8 v) {
+  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(uchar8 v) {
+  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(uchar8 v) {
+  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(uchar8 v) {
+  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(uchar8 v) {
+  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(uchar8 v) {
+  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(uchar8 v) {
+  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(uchar8 v) {
+  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(uchar8 v) {
+  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(uchar8 v) {
+  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(uchar8 v) {
+  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(uchar8 v) {
+  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(uchar8 v) {
+  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(uchar8 v) {
+  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(uchar8 v) {
+  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(uchar8 v) {
+  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(uchar8 v) {
+  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(uchar8 v) {
+  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(uchar8 v) {
+  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(uchar8 v) {
+  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(uchar8 v) {
+  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(uchar8 v) {
+  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(uchar8 v) {
+  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(uchar8 v) {
+  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(uchar8 v) {
+  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(uchar8 v) {
+  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(uchar8 v) {
+  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(uchar8 v) {
+  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(uchar8 v) { /* uchar8 -> uchar8 (same element type), lane by lane: result.sN = convert_uchar_rtn(v.sN) for s0..s7; no shortcut is taken here, the scalar helper decides the result. */
+  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(uchar8 v) { /* uchar8 -> float8, lane by lane: result.sN = convert_float_rte(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(uchar8 v) { /* uchar8 -> float8, lane by lane: result.sN = convert_float_rtz(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(uchar8 v) { /* uchar8 -> float8, lane by lane: result.sN = convert_float_rtp(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(uchar8 v) { /* uchar8 -> float8, lane by lane: result.sN = convert_float_rtn(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(float8 v) { /* float8 -> long8, lane by lane: result.sN = convert_long_rte(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(float8 v) { /* float8 -> long8, lane by lane: result.sN = convert_long_rtz(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(float8 v) { /* float8 -> long8, lane by lane: result.sN = convert_long_rtp(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(float8 v) { /* float8 -> long8, lane by lane: result.sN = convert_long_rtn(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(float8 v) { /* float8 -> ulong8, lane by lane: result.sN = convert_ulong_rte(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(float8 v) { /* float8 -> ulong8, lane by lane: result.sN = convert_ulong_rtz(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(float8 v) { /* float8 -> ulong8, lane by lane: result.sN = convert_ulong_rtp(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(float8 v) { /* float8 -> ulong8, lane by lane: result.sN = convert_ulong_rtn(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(float8 v) { /* float8 -> int8, lane by lane: result.sN = convert_int_rte(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(float8 v) { /* float8 -> int8, lane by lane: result.sN = convert_int_rtz(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(float8 v) { /* float8 -> int8, lane by lane: result.sN = convert_int_rtp(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(float8 v) { /* float8 -> int8, lane by lane: result.sN = convert_int_rtn(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(float8 v) { /* float8 -> uint8, lane by lane: result.sN = convert_uint_rte(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(float8 v) { /* float8 -> uint8, lane by lane: result.sN = convert_uint_rtz(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(float8 v) { /* float8 -> uint8, lane by lane: result.sN = convert_uint_rtp(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(float8 v) { /* float8 -> uint8, lane by lane: result.sN = convert_uint_rtn(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(float8 v) { /* float8 -> short8, lane by lane: result.sN = convert_short_rte(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(float8 v) { /* float8 -> short8, lane by lane: result.sN = convert_short_rtz(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(float8 v) { /* float8 -> short8, lane by lane: result.sN = convert_short_rtp(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(float8 v) { /* float8 -> short8, lane by lane: result.sN = convert_short_rtn(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(float8 v) { /* float8 -> ushort8, lane by lane: result.sN = convert_ushort_rte(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(float8 v) { /* float8 -> ushort8, lane by lane: result.sN = convert_ushort_rtz(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(float8 v) { /* float8 -> ushort8, lane by lane: result.sN = convert_ushort_rtp(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(float8 v) { /* float8 -> ushort8, lane by lane: result.sN = convert_ushort_rtn(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(float8 v) { /* float8 -> char8, lane by lane: result.sN = convert_char_rte(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(float8 v) { /* float8 -> char8, lane by lane: result.sN = convert_char_rtz(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(float8 v) { /* float8 -> char8, lane by lane: result.sN = convert_char_rtp(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(float8 v) { /* float8 -> char8, lane by lane: result.sN = convert_char_rtn(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(float8 v) { /* float8 -> uchar8, lane by lane: result.sN = convert_uchar_rte(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(float8 v) { /* float8 -> uchar8, lane by lane: result.sN = convert_uchar_rtz(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(float8 v) { /* float8 -> uchar8, lane by lane: result.sN = convert_uchar_rtp(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(float8 v) { /* float8 -> uchar8, lane by lane: result.sN = convert_uchar_rtn(v.sN) for s0..s7; all conversion semantics live in that scalar helper. */
+  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(float8 v) { /* float8 -> float8 (same element type), lane by lane: result.sN = convert_float_rte(v.sN) for s0..s7; no shortcut is taken here, the scalar helper decides the result. */
+  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(float8 v) { /* float8 -> float8 (same element type), lane by lane: result.sN = convert_float_rtz(v.sN) for s0..s7; no shortcut is taken here, the scalar helper decides the result. */
+  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(float8 v) { /* float8 -> float8 (same element type), lane by lane: result.sN = convert_float_rtp(v.sN) for s0..s7; no shortcut is taken here, the scalar helper decides the result. */
+  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(float8 v) { /* float8 -> float8 (same element type), lane by lane: result.sN = convert_float_rtn(v.sN) for s0..s7; no shortcut is taken here, the scalar helper decides the result. */
+  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rte(long16 v) { /* long16 -> long16 (same element type), lane by lane: result.sN = convert_long_rte(v.sN) for s0..sF; no shortcut is taken here, the scalar helper decides the result. */
+  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(long16 v) { /* long16 -> long16 (same element type), lane by lane: result.sN = convert_long_rtz(v.sN) for s0..sF; no shortcut is taken here, the scalar helper decides the result. */
+  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(long16 v) { /* long16 -> long16 (same element type), lane by lane: result.sN = convert_long_rtp(v.sN) for s0..sF; no shortcut is taken here, the scalar helper decides the result. */
+  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(long16 v) { /* long16 -> long16 (same element type), lane by lane: result.sN = convert_long_rtn(v.sN) for s0..sF; no shortcut is taken here, the scalar helper decides the result. */
+  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(long16 v) { /* long16 -> ulong16, lane by lane: result.sN = convert_ulong_rte(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(long16 v) { /* long16 -> ulong16, lane by lane: result.sN = convert_ulong_rtz(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(long16 v) { /* long16 -> ulong16, lane by lane: result.sN = convert_ulong_rtp(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(long16 v) { /* long16 -> ulong16, lane by lane: result.sN = convert_ulong_rtn(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(long16 v) { /* long16 -> int16, lane by lane: result.sN = convert_int_rte(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(long16 v) { /* long16 -> int16, lane by lane: result.sN = convert_int_rtz(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(long16 v) { /* long16 -> int16, lane by lane: result.sN = convert_int_rtp(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(long16 v) { /* long16 -> int16, lane by lane: result.sN = convert_int_rtn(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(long16 v) { /* long16 -> uint16, lane by lane: result.sN = convert_uint_rte(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(long16 v) { /* long16 -> uint16, lane by lane: result.sN = convert_uint_rtz(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(long16 v) { /* long16 -> uint16, lane by lane: result.sN = convert_uint_rtp(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(long16 v) { /* long16 -> uint16, lane by lane: result.sN = convert_uint_rtn(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(long16 v) { /* long16 -> short16, lane by lane: result.sN = convert_short_rte(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(long16 v) { /* long16 -> short16, lane by lane: result.sN = convert_short_rtz(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(long16 v) { /* long16 -> short16, lane by lane: result.sN = convert_short_rtp(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(long16 v) { /* long16 -> short16, lane by lane: result.sN = convert_short_rtn(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(long16 v) { /* long16 -> ushort16, lane by lane: result.sN = convert_ushort_rte(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(long16 v) { /* long16 -> ushort16, lane by lane: result.sN = convert_ushort_rtz(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(long16 v) { /* long16 -> ushort16, lane by lane: result.sN = convert_ushort_rtp(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(long16 v) { /* long16 -> ushort16, lane by lane: result.sN = convert_ushort_rtn(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(long16 v) { /* long16 -> char16, lane by lane: result.sN = convert_char_rte(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(long16 v) { /* long16 -> char16, lane by lane: result.sN = convert_char_rtz(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(long16 v) { /* long16 -> char16, lane by lane: result.sN = convert_char_rtp(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(long16 v) { /* long16 -> char16, lane by lane: result.sN = convert_char_rtn(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(long16 v) { /* long16 -> uchar16, lane by lane: result.sN = convert_uchar_rte(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(long16 v) { /* long16 -> uchar16, lane by lane: result.sN = convert_uchar_rtz(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(long16 v) { /* long16 -> uchar16, lane by lane: result.sN = convert_uchar_rtp(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(long16 v) { /* long16 -> uchar16, lane by lane: result.sN = convert_uchar_rtn(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(long16 v) { /* long16 -> float16, lane by lane: result.sN = convert_float_rte(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(long16 v) { /* long16 -> float16, lane by lane: result.sN = convert_float_rtz(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(long16 v) { /* long16 -> float16, lane by lane: result.sN = convert_float_rtp(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(long16 v) { /* long16 -> float16, lane by lane: result.sN = convert_float_rtn(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rte(ulong16 v) { /* ulong16 -> long16, lane by lane: result.sN = convert_long_rte(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(ulong16 v) { /* ulong16 -> long16, lane by lane: result.sN = convert_long_rtz(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(ulong16 v) { /* ulong16 -> long16, lane by lane: result.sN = convert_long_rtp(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(ulong16 v) { /* ulong16 -> long16, lane by lane: result.sN = convert_long_rtn(v.sN) for s0..sF; all conversion semantics live in that scalar helper. */
+  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(ulong16 v) { /* ulong16 -> ulong16 (same element type), lane by lane: result.sN = convert_ulong_rte(v.sN) for s0..sF; no shortcut is taken here, the scalar helper decides the result. */
+  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(ulong16 v) { /* ulong16 -> ulong16 (same element type), lane by lane: result.sN = convert_ulong_rtz(v.sN) for s0..sF; no shortcut is taken here, the scalar helper decides the result. */
+  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(ulong16 v) { /* ulong16 -> ulong16 (same element type), lane by lane: result.sN = convert_ulong_rtp(v.sN) for s0..sF; no shortcut is taken here, the scalar helper decides the result. */
+  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(ulong16 v) { /* ulong16 -> ulong16 (same element type), lane by lane: result.sN = convert_ulong_rtn(v.sN) for s0..sF; no shortcut is taken here, the scalar helper decides the result. */
+  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(ulong16 v) { /* Lane-wise ulong16 -> int16: runs the scalar convert_int_rte() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(ulong16 v) { /* Lane-wise ulong16 -> int16: runs the scalar convert_int_rtz() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(ulong16 v) { /* Lane-wise ulong16 -> int16: runs the scalar convert_int_rtp() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(ulong16 v) { /* Lane-wise ulong16 -> int16: runs the scalar convert_int_rtn() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(ulong16 v) { /* Lane-wise ulong16 -> uint16: runs the scalar convert_uint_rte() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(ulong16 v) { /* Lane-wise ulong16 -> uint16: runs the scalar convert_uint_rtz() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(ulong16 v) { /* Lane-wise ulong16 -> uint16: runs the scalar convert_uint_rtp() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(ulong16 v) { /* Lane-wise ulong16 -> uint16: runs the scalar convert_uint_rtn() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(ulong16 v) { /* Lane-wise ulong16 -> short16: runs the scalar convert_short_rte() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(ulong16 v) { /* Lane-wise ulong16 -> short16: runs the scalar convert_short_rtz() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(ulong16 v) { /* Lane-wise ulong16 -> short16: runs the scalar convert_short_rtp() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(ulong16 v) { /* Lane-wise ulong16 -> short16: runs the scalar convert_short_rtn() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(ulong16 v) { /* Lane-wise ulong16 -> ushort16: runs the scalar convert_ushort_rte() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(ulong16 v) { /* Lane-wise ulong16 -> ushort16: runs the scalar convert_ushort_rtz() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(ulong16 v) { /* Lane-wise ulong16 -> ushort16: runs the scalar convert_ushort_rtp() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(ulong16 v) { /* Lane-wise ulong16 -> ushort16: runs the scalar convert_ushort_rtn() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(ulong16 v) { /* Lane-wise ulong16 -> char16: runs the scalar convert_char_rte() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(ulong16 v) { /* Lane-wise ulong16 -> char16: runs the scalar convert_char_rtz() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(ulong16 v) { /* Lane-wise ulong16 -> char16: runs the scalar convert_char_rtp() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(ulong16 v) { /* Lane-wise ulong16 -> char16: runs the scalar convert_char_rtn() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(ulong16 v) { /* Lane-wise ulong16 -> uchar16: runs the scalar convert_uchar_rte() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(ulong16 v) { /* Lane-wise ulong16 -> uchar16: runs the scalar convert_uchar_rtz() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(ulong16 v) { /* Lane-wise ulong16 -> uchar16: runs the scalar convert_uchar_rtp() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(ulong16 v) { /* Lane-wise ulong16 -> uchar16: runs the scalar convert_uchar_rtn() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(ulong16 v) { /* Lane-wise ulong16 -> float16: runs the scalar convert_float_rte() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding is left entirely to that scalar overload. */
+  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(ulong16 v) { /* Lane-wise ulong16 -> float16: runs the scalar convert_float_rtz() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding is left entirely to that scalar overload. */
+  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(ulong16 v) { /* Lane-wise ulong16 -> float16: runs the scalar convert_float_rtp() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding is left entirely to that scalar overload. */
+  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(ulong16 v) { /* Lane-wise ulong16 -> float16: runs the scalar convert_float_rtn() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding is left entirely to that scalar overload. */
+  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rte(int16 v) { /* Lane-wise int16 -> long16: runs the scalar convert_long_rte() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(int16 v) { /* Lane-wise int16 -> long16: runs the scalar convert_long_rtz() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(int16 v) { /* Lane-wise int16 -> long16: runs the scalar convert_long_rtp() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(int16 v) { /* Lane-wise int16 -> long16: runs the scalar convert_long_rtn() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(int16 v) { /* Lane-wise int16 -> ulong16: runs the scalar convert_ulong_rte() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(int16 v) { /* Lane-wise int16 -> ulong16: runs the scalar convert_ulong_rtz() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(int16 v) { /* Lane-wise int16 -> ulong16: runs the scalar convert_ulong_rtp() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(int16 v) { /* Lane-wise int16 -> ulong16: runs the scalar convert_ulong_rtn() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(int16 v) { /* Lane-wise int16 -> int16: runs the scalar convert_int_rte() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding is left entirely to that scalar overload. */
+  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(int16 v) { /* Lane-wise int16 -> int16: runs the scalar convert_int_rtz() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding is left entirely to that scalar overload. */
+  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(int16 v) { /* Lane-wise int16 -> int16: runs the scalar convert_int_rtp() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding is left entirely to that scalar overload. */
+  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(int16 v) { /* Lane-wise int16 -> int16: runs the scalar convert_int_rtn() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding is left entirely to that scalar overload. */
+  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(int16 v) { /* Lane-wise int16 -> uint16: runs the scalar convert_uint_rte() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(int16 v) { /* Lane-wise int16 -> uint16: runs the scalar convert_uint_rtz() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(int16 v) { /* Lane-wise int16 -> uint16: runs the scalar convert_uint_rtp() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(int16 v) { /* Lane-wise int16 -> uint16: runs the scalar convert_uint_rtn() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(int16 v) { /* Lane-wise int16 -> short16: runs the scalar convert_short_rte() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(int16 v) { /* Lane-wise int16 -> short16: runs the scalar convert_short_rtz() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(int16 v) { /* Lane-wise int16 -> short16: runs the scalar convert_short_rtp() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(int16 v) { /* Lane-wise int16 -> short16: runs the scalar convert_short_rtn() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(int16 v) { /* Lane-wise int16 -> ushort16: runs the scalar convert_ushort_rte() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(int16 v) { /* Lane-wise int16 -> ushort16: runs the scalar convert_ushort_rtz() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(int16 v) { /* Lane-wise int16 -> ushort16: runs the scalar convert_ushort_rtp() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(int16 v) { /* Lane-wise int16 -> ushort16: runs the scalar convert_ushort_rtn() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(int16 v) { /* Lane-wise int16 -> char16: runs the scalar convert_char_rte() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(int16 v) { /* Lane-wise int16 -> char16: runs the scalar convert_char_rtz() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(int16 v) { /* Lane-wise int16 -> char16: runs the scalar convert_char_rtp() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(int16 v) { /* Lane-wise int16 -> char16: runs the scalar convert_char_rtn() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(int16 v) { /* Lane-wise int16 -> uchar16: runs the scalar convert_uchar_rte() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(int16 v) { /* Lane-wise int16 -> uchar16: runs the scalar convert_uchar_rtz() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(int16 v) { /* Lane-wise int16 -> uchar16: runs the scalar convert_uchar_rtp() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(int16 v) { /* Lane-wise int16 -> uchar16: runs the scalar convert_uchar_rtn() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(int16 v) { /* Lane-wise int16 -> float16: runs the scalar convert_float_rte() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding is left entirely to that scalar overload. */
+  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(int16 v) { /* Lane-wise int16 -> float16: runs the scalar convert_float_rtz() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding is left entirely to that scalar overload. */
+  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(int16 v) { /* Lane-wise int16 -> float16: runs the scalar convert_float_rtp() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding is left entirely to that scalar overload. */
+  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(int16 v) { /* Lane-wise int16 -> float16: runs the scalar convert_float_rtn() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding is left entirely to that scalar overload. */
+  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rte(uint16 v) { /* Lane-wise uint16 -> long16: runs the scalar convert_long_rte() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(uint16 v) { /* Lane-wise uint16 -> long16: runs the scalar convert_long_rtz() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(uint16 v) { /* Lane-wise uint16 -> long16: runs the scalar convert_long_rtp() (defined outside this view) on v.s0..v.sF and repacks the 16 results; rounding/range handling is left entirely to that scalar overload. */
+  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(uint16 v) { /* Applies the scalar convert_long_rtn overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a long16. */
+  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(uint16 v) { /* Applies the scalar convert_ulong_rte overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a ulong16. */
+  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(uint16 v) { /* Applies the scalar convert_ulong_rtz overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a ulong16. */
+  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(uint16 v) { /* Applies the scalar convert_ulong_rtp overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a ulong16. */
+  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(uint16 v) { /* Applies the scalar convert_ulong_rtn overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a ulong16. */
+  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(uint16 v) { /* Applies the scalar convert_int_rte overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into an int16. */
+  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(uint16 v) { /* Applies the scalar convert_int_rtz overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into an int16. */
+  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(uint16 v) { /* Applies the scalar convert_int_rtp overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into an int16. */
+  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(uint16 v) { /* Applies the scalar convert_int_rtn overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into an int16. */
+  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(uint16 v) { /* Applies the scalar convert_uint_rte overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a uint16. */
+  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(uint16 v) { /* Applies the scalar convert_uint_rtz overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a uint16. */
+  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(uint16 v) { /* Applies the scalar convert_uint_rtp overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a uint16. */
+  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(uint16 v) { /* Applies the scalar convert_uint_rtn overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a uint16. */
+  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(uint16 v) { /* Applies the scalar convert_short_rte overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a short16. */
+  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(uint16 v) { /* Applies the scalar convert_short_rtz overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a short16. */
+  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(uint16 v) { /* Applies the scalar convert_short_rtp overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a short16. */
+  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(uint16 v) { /* Applies the scalar convert_short_rtn overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a short16. */
+  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(uint16 v) { /* Applies the scalar convert_ushort_rte overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a ushort16. */
+  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(uint16 v) { /* Applies the scalar convert_ushort_rtz overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a ushort16. */
+  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(uint16 v) { /* Applies the scalar convert_ushort_rtp overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a ushort16. */
+  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(uint16 v) { /* Applies the scalar convert_ushort_rtn overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a ushort16. */
+  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(uint16 v) { /* Applies the scalar convert_char_rte overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a char16. */
+  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(uint16 v) { /* Applies the scalar convert_char_rtz overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a char16. */
+  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(uint16 v) { /* Applies the scalar convert_char_rtp overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a char16. */
+  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(uint16 v) { /* Applies the scalar convert_char_rtn overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a char16. */
+  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(uint16 v) { /* Applies the scalar convert_uchar_rte overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a uchar16. */
+  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(uint16 v) { /* Applies the scalar convert_uchar_rtz overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a uchar16. */
+  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(uint16 v) { /* Applies the scalar convert_uchar_rtp overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a uchar16. */
+  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(uint16 v) { /* Applies the scalar convert_uchar_rtn overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a uchar16. */
+  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(uint16 v) { /* Applies the scalar convert_float_rte overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a float16. */
+  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(uint16 v) { /* Applies the scalar convert_float_rtz overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a float16. */
+  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(uint16 v) { /* Applies the scalar convert_float_rtp overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a float16. */
+  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(uint16 v) { /* Applies the scalar convert_float_rtn overload to each of the 16 uint components of v (s0..sF) and packs the results, in order, into a float16. */
+  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rte(short16 v) { /* Applies the scalar convert_long_rte overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a long16. */
+  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(short16 v) { /* Applies the scalar convert_long_rtz overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a long16. */
+  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(short16 v) { /* Applies the scalar convert_long_rtp overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a long16. */
+  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(short16 v) { /* Applies the scalar convert_long_rtn overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a long16. */
+  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(short16 v) { /* Applies the scalar convert_ulong_rte overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a ulong16. */
+  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(short16 v) { /* Applies the scalar convert_ulong_rtz overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a ulong16. */
+  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(short16 v) { /* Applies the scalar convert_ulong_rtp overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a ulong16. */
+  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(short16 v) { /* Applies the scalar convert_ulong_rtn overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a ulong16. */
+  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(short16 v) { /* Applies the scalar convert_int_rte overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into an int16. */
+  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(short16 v) { /* Applies the scalar convert_int_rtz overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into an int16. */
+  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(short16 v) { /* Applies the scalar convert_int_rtp overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into an int16. */
+  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(short16 v) { /* Applies the scalar convert_int_rtn overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into an int16. */
+  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(short16 v) { /* Applies the scalar convert_uint_rte overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a uint16. */
+  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(short16 v) { /* Applies the scalar convert_uint_rtz overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a uint16. */
+  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(short16 v) { /* Applies the scalar convert_uint_rtp overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a uint16. */
+  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(short16 v) { /* Applies the scalar convert_uint_rtn overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a uint16. */
+  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(short16 v) { /* Applies the scalar convert_short_rte overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a short16. */
+  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(short16 v) { /* Applies the scalar convert_short_rtz overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a short16. */
+  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(short16 v) { /* Applies the scalar convert_short_rtp overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a short16. */
+  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(short16 v) { /* Applies the scalar convert_short_rtn overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a short16. */
+  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(short16 v) { /* Applies the scalar convert_ushort_rte overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a ushort16. */
+  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(short16 v) { /* Applies the scalar convert_ushort_rtz overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a ushort16. */
+  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(short16 v) { /* Applies the scalar convert_ushort_rtp overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a ushort16. */
+  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(short16 v) { /* Applies the scalar convert_ushort_rtn overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a ushort16. */
+  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(short16 v) { /* Applies the scalar convert_char_rte overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a char16. */
+  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(short16 v) { /* Applies the scalar convert_char_rtz overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a char16. */
+  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(short16 v) { /* Applies the scalar convert_char_rtp overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a char16. */
+  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(short16 v) { /* Applies the scalar convert_char_rtn overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a char16. */
+  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(short16 v) { /* Applies the scalar convert_uchar_rte overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a uchar16. */
+  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(short16 v) { /* Applies the scalar convert_uchar_rtz overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a uchar16. */
+  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(short16 v) { /* Applies the scalar convert_uchar_rtp overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a uchar16. */
+  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(short16 v) { /* Applies the scalar convert_uchar_rtn overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a uchar16. */
+  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(short16 v) { /* Applies the scalar convert_float_rte overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a float16. */
+  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(short16 v) { /* Applies the scalar convert_float_rtz overload to each of the 16 short components of v (s0..sF) and packs the results, in order, into a float16. */
+  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+/* short16 -> float16: applies scalar convert_float_rtp() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE float16 convert_float16_rtp(short16 v) {
+  const float16 out = (float16)(
+    convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3),
+    convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7),
+    convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB),
+    convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+  return out;
+}
+
+/* short16 -> float16: applies scalar convert_float_rtn() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE float16 convert_float16_rtn(short16 v) {
+  const float16 out = (float16)(
+    convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3),
+    convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7),
+    convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB),
+    convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+  return out;
+}
+
+/* ushort16 -> long16: applies scalar convert_long_rte() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE long16 convert_long16_rte(ushort16 v) {
+  const long16 out = (long16)(
+    convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3),
+    convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7),
+    convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB),
+    convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+  return out;
+}
+
+/* ushort16 -> long16: applies scalar convert_long_rtz() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE long16 convert_long16_rtz(ushort16 v) {
+  const long16 out = (long16)(
+    convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3),
+    convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7),
+    convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB),
+    convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+  return out;
+}
+
+/* ushort16 -> long16: applies scalar convert_long_rtp() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE long16 convert_long16_rtp(ushort16 v) {
+  const long16 out = (long16)(
+    convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3),
+    convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7),
+    convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB),
+    convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+  return out;
+}
+
+/* ushort16 -> long16: applies scalar convert_long_rtn() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE long16 convert_long16_rtn(ushort16 v) {
+  const long16 out = (long16)(
+    convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3),
+    convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7),
+    convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB),
+    convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+  return out;
+}
+
+/* ushort16 -> ulong16: applies scalar convert_ulong_rte() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(ushort16 v) {
+  const ulong16 out = (ulong16)(
+    convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3),
+    convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7),
+    convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB),
+    convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+  return out;
+}
+
+/* ushort16 -> ulong16: applies scalar convert_ulong_rtz() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(ushort16 v) {
+  const ulong16 out = (ulong16)(
+    convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3),
+    convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7),
+    convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB),
+    convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+  return out;
+}
+
+/* ushort16 -> ulong16: applies scalar convert_ulong_rtp() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(ushort16 v) {
+  const ulong16 out = (ulong16)(
+    convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3),
+    convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7),
+    convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB),
+    convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+  return out;
+}
+
+/* ushort16 -> ulong16: applies scalar convert_ulong_rtn() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(ushort16 v) {
+  const ulong16 out = (ulong16)(
+    convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3),
+    convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7),
+    convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB),
+    convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+  return out;
+}
+
+/* ushort16 -> int16: applies scalar convert_int_rte() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE int16 convert_int16_rte(ushort16 v) {
+  const int16 out = (int16)(
+    convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3),
+    convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7),
+    convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB),
+    convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+  return out;
+}
+
+/* ushort16 -> int16: applies scalar convert_int_rtz() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE int16 convert_int16_rtz(ushort16 v) {
+  const int16 out = (int16)(
+    convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3),
+    convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7),
+    convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB),
+    convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+  return out;
+}
+
+/* ushort16 -> int16: applies scalar convert_int_rtp() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE int16 convert_int16_rtp(ushort16 v) {
+  const int16 out = (int16)(
+    convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3),
+    convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7),
+    convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB),
+    convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+  return out;
+}
+
+/* ushort16 -> int16: applies scalar convert_int_rtn() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE int16 convert_int16_rtn(ushort16 v) {
+  const int16 out = (int16)(
+    convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3),
+    convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7),
+    convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB),
+    convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+  return out;
+}
+
+/* ushort16 -> uint16: applies scalar convert_uint_rte() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE uint16 convert_uint16_rte(ushort16 v) {
+  const uint16 out = (uint16)(
+    convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3),
+    convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7),
+    convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB),
+    convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+  return out;
+}
+
+/* ushort16 -> uint16: applies scalar convert_uint_rtz() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(ushort16 v) {
+  const uint16 out = (uint16)(
+    convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3),
+    convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7),
+    convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB),
+    convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+  return out;
+}
+
+/* ushort16 -> uint16: applies scalar convert_uint_rtp() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(ushort16 v) {
+  const uint16 out = (uint16)(
+    convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3),
+    convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7),
+    convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB),
+    convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+  return out;
+}
+
+/* ushort16 -> uint16: applies scalar convert_uint_rtn() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(ushort16 v) {
+  const uint16 out = (uint16)(
+    convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3),
+    convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7),
+    convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB),
+    convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+  return out;
+}
+
+/* ushort16 -> short16: applies scalar convert_short_rte() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE short16 convert_short16_rte(ushort16 v) {
+  const short16 out = (short16)(
+    convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3),
+    convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7),
+    convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB),
+    convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+  return out;
+}
+
+/* ushort16 -> short16: applies scalar convert_short_rtz() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE short16 convert_short16_rtz(ushort16 v) {
+  const short16 out = (short16)(
+    convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3),
+    convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7),
+    convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB),
+    convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+  return out;
+}
+
+/* ushort16 -> short16: applies scalar convert_short_rtp() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE short16 convert_short16_rtp(ushort16 v) {
+  const short16 out = (short16)(
+    convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3),
+    convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7),
+    convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB),
+    convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+  return out;
+}
+
+/* ushort16 -> short16: applies scalar convert_short_rtn() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE short16 convert_short16_rtn(ushort16 v) {
+  const short16 out = (short16)(
+    convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3),
+    convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7),
+    convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB),
+    convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+  return out;
+}
+
+/* ushort16 -> ushort16: applies scalar convert_ushort_rte() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(ushort16 v) {
+  const ushort16 out = (ushort16)(
+    convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3),
+    convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7),
+    convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB),
+    convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+  return out;
+}
+
+/* ushort16 -> ushort16: applies scalar convert_ushort_rtz() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(ushort16 v) {
+  const ushort16 out = (ushort16)(
+    convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3),
+    convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7),
+    convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB),
+    convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+  return out;
+}
+
+/* ushort16 -> ushort16: applies scalar convert_ushort_rtp() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(ushort16 v) {
+  const ushort16 out = (ushort16)(
+    convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3),
+    convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7),
+    convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB),
+    convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+  return out;
+}
+
+/* ushort16 -> ushort16: applies scalar convert_ushort_rtn() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(ushort16 v) {
+  const ushort16 out = (ushort16)(
+    convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3),
+    convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7),
+    convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB),
+    convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+  return out;
+}
+
+/* ushort16 -> char16: applies scalar convert_char_rte() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE char16 convert_char16_rte(ushort16 v) {
+  const char16 out = (char16)(
+    convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3),
+    convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7),
+    convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB),
+    convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+  return out;
+}
+
+/* ushort16 -> char16: applies scalar convert_char_rtz() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE char16 convert_char16_rtz(ushort16 v) {
+  const char16 out = (char16)(
+    convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3),
+    convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7),
+    convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB),
+    convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+  return out;
+}
+
+/* ushort16 -> char16: applies scalar convert_char_rtp() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE char16 convert_char16_rtp(ushort16 v) {
+  const char16 out = (char16)(
+    convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3),
+    convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7),
+    convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB),
+    convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+  return out;
+}
+
+/* ushort16 -> char16: applies scalar convert_char_rtn() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE char16 convert_char16_rtn(ushort16 v) {
+  const char16 out = (char16)(
+    convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3),
+    convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7),
+    convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB),
+    convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+  return out;
+}
+
+/* ushort16 -> uchar16: applies scalar convert_uchar_rte() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(ushort16 v) {
+  const uchar16 out = (uchar16)(
+    convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3),
+    convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7),
+    convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB),
+    convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+  return out;
+}
+
+/* ushort16 -> uchar16: applies scalar convert_uchar_rtz() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(ushort16 v) {
+  const uchar16 out = (uchar16)(
+    convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3),
+    convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7),
+    convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB),
+    convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+  return out;
+}
+
+/* ushort16 -> uchar16: applies scalar convert_uchar_rtp() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(ushort16 v) {
+  const uchar16 out = (uchar16)(
+    convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3),
+    convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7),
+    convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB),
+    convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+  return out;
+}
+
+/* ushort16 -> uchar16: applies scalar convert_uchar_rtn() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(ushort16 v) {
+  const uchar16 out = (uchar16)(
+    convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3),
+    convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7),
+    convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB),
+    convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+  return out;
+}
+
+/* ushort16 -> float16: applies scalar convert_float_rte() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE float16 convert_float16_rte(ushort16 v) {
+  const float16 out = (float16)(
+    convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3),
+    convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7),
+    convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB),
+    convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+  return out;
+}
+
+/* ushort16 -> float16: applies scalar convert_float_rtz() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE float16 convert_float16_rtz(ushort16 v) {
+  const float16 out = (float16)(
+    convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3),
+    convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7),
+    convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB),
+    convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+  return out;
+}
+
+/* ushort16 -> float16: applies scalar convert_float_rtp() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE float16 convert_float16_rtp(ushort16 v) {
+  const float16 out = (float16)(
+    convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3),
+    convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7),
+    convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB),
+    convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+  return out;
+}
+
+/* ushort16 -> float16: applies scalar convert_float_rtn() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE float16 convert_float16_rtn(ushort16 v) {
+  const float16 out = (float16)(
+    convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3),
+    convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7),
+    convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB),
+    convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+  return out;
+}
+
+/* char16 -> long16: applies scalar convert_long_rte() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE long16 convert_long16_rte(char16 v) {
+  const long16 out = (long16)(
+    convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3),
+    convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7),
+    convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB),
+    convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+  return out;
+}
+
+/* char16 -> long16: applies scalar convert_long_rtz() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE long16 convert_long16_rtz(char16 v) {
+  const long16 out = (long16)(
+    convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3),
+    convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7),
+    convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB),
+    convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+  return out;
+}
+
+/* char16 -> long16: applies scalar convert_long_rtp() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE long16 convert_long16_rtp(char16 v) {
+  const long16 out = (long16)(
+    convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3),
+    convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7),
+    convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB),
+    convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+  return out;
+}
+
+/* char16 -> long16: applies scalar convert_long_rtn() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE long16 convert_long16_rtn(char16 v) {
+  const long16 out = (long16)(
+    convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3),
+    convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7),
+    convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB),
+    convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+  return out;
+}
+
+/* char16 -> ulong16: applies scalar convert_ulong_rte() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(char16 v) {
+  const ulong16 out = (ulong16)(
+    convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3),
+    convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7),
+    convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB),
+    convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+  return out;
+}
+
+/* char16 -> ulong16: applies scalar convert_ulong_rtz() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(char16 v) {
+  const ulong16 out = (ulong16)(
+    convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3),
+    convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7),
+    convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB),
+    convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+  return out;
+}
+
+/* char16 -> ulong16: applies scalar convert_ulong_rtp() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(char16 v) {
+  const ulong16 out = (ulong16)(
+    convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3),
+    convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7),
+    convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB),
+    convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+  return out;
+}
+
+/* char16 -> ulong16: applies scalar convert_ulong_rtn() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(char16 v) {
+  const ulong16 out = (ulong16)(
+    convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3),
+    convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7),
+    convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB),
+    convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+  return out;
+}
+
+/* char16 -> int16: applies scalar convert_int_rte() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE int16 convert_int16_rte(char16 v) {
+  const int16 out = (int16)(
+    convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3),
+    convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7),
+    convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB),
+    convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+  return out;
+}
+
+/* char16 -> int16: applies scalar convert_int_rtz() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE int16 convert_int16_rtz(char16 v) {
+  const int16 out = (int16)(
+    convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3),
+    convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7),
+    convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB),
+    convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+  return out;
+}
+
+/* char16 -> int16: applies scalar convert_int_rtp() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE int16 convert_int16_rtp(char16 v) {
+  const int16 out = (int16)(
+    convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3),
+    convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7),
+    convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB),
+    convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+  return out;
+}
+
+/* char16 -> int16: applies scalar convert_int_rtn() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE int16 convert_int16_rtn(char16 v) {
+  const int16 out = (int16)(
+    convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3),
+    convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7),
+    convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB),
+    convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+  return out;
+}
+
+/* char16 -> uint16: applies scalar convert_uint_rte() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE uint16 convert_uint16_rte(char16 v) {
+  const uint16 out = (uint16)(
+    convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3),
+    convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7),
+    convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB),
+    convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+  return out;
+}
+
+/* char16 -> uint16: applies scalar convert_uint_rtz() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(char16 v) {
+  const uint16 out = (uint16)(
+    convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3),
+    convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7),
+    convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB),
+    convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+  return out;
+}
+
+/* char16 -> uint16: applies scalar convert_uint_rtp() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(char16 v) {
+  const uint16 out = (uint16)(
+    convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3),
+    convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7),
+    convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB),
+    convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+  return out;
+}
+
+/* char16 -> uint16: applies scalar convert_uint_rtn() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(char16 v) {
+  const uint16 out = (uint16)(
+    convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3),
+    convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7),
+    convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB),
+    convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+  return out;
+}
+
+/* char16 -> short16: applies scalar convert_short_rte() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE short16 convert_short16_rte(char16 v) {
+  const short16 out = (short16)(
+    convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3),
+    convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7),
+    convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB),
+    convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+  return out;
+}
+
+/* char16 -> short16: applies scalar convert_short_rtz() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE short16 convert_short16_rtz(char16 v) {
+  const short16 out = (short16)(
+    convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3),
+    convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7),
+    convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB),
+    convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+  return out;
+}
+
+/* char16 -> short16: applies scalar convert_short_rtp() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE short16 convert_short16_rtp(char16 v) {
+  const short16 out = (short16)(
+    convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3),
+    convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7),
+    convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB),
+    convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+  return out;
+}
+
+/* char16 -> short16: applies scalar convert_short_rtn() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE short16 convert_short16_rtn(char16 v) {
+  const short16 out = (short16)(
+    convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3),
+    convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7),
+    convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB),
+    convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+  return out;
+}
+
+/* char16 -> ushort16: applies scalar convert_ushort_rte() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(char16 v) {
+  const ushort16 out = (ushort16)(
+    convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3),
+    convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7),
+    convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB),
+    convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+  return out;
+}
+
+/* char16 -> ushort16: applies scalar convert_ushort_rtz() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(char16 v) {
+  const ushort16 out = (ushort16)(
+    convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3),
+    convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7),
+    convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB),
+    convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+  return out;
+}
+
+/* char16 -> ushort16: applies scalar convert_ushort_rtp() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(char16 v) {
+  const ushort16 out = (ushort16)(
+    convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3),
+    convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7),
+    convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB),
+    convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+  return out;
+}
+
+/* char16 -> ushort16: applies scalar convert_ushort_rtn() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(char16 v) {
+  const ushort16 out = (ushort16)(
+    convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3),
+    convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7),
+    convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB),
+    convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+  return out;
+}
+
+/* char16 -> char16: applies scalar convert_char_rte() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE char16 convert_char16_rte(char16 v) {
+  const char16 out = (char16)(
+    convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3),
+    convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7),
+    convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB),
+    convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+  return out;
+}
+
+/* char16 -> char16: applies scalar convert_char_rtz() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE char16 convert_char16_rtz(char16 v) {
+  const char16 out = (char16)(
+    convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3),
+    convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7),
+    convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB),
+    convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+  return out;
+}
+
+/* char16 -> char16: applies scalar convert_char_rtp() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE char16 convert_char16_rtp(char16 v) {
+  const char16 out = (char16)(
+    convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3),
+    convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7),
+    convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB),
+    convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+  return out;
+}
+
+/* char16 -> char16: applies scalar convert_char_rtn() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE char16 convert_char16_rtn(char16 v) {
+  const char16 out = (char16)(
+    convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3),
+    convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7),
+    convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB),
+    convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+  return out;
+}
+
+/* char16 -> uchar16: applies scalar convert_uchar_rte() to each of the 16 lanes (s0..sF). */
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(char16 v) {
+  const uchar16 out = (uchar16)(
+    convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3),
+    convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7),
+    convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB),
+    convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+  return out;
+}
+
+/* Applies convert_uchar_rtz to each of the 16 lanes of v (char16 in, uchar16 out). */
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(char16 v) {
+#define CVT_LANE(i) convert_uchar_rtz(v.s##i)
+  return (uchar16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_uchar_rtp to each of the 16 lanes of v (char16 in, uchar16 out). */
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(char16 v) {
+#define CVT_LANE(i) convert_uchar_rtp(v.s##i)
+  return (uchar16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_uchar_rtn to each of the 16 lanes of v (char16 in, uchar16 out). */
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(char16 v) {
+#define CVT_LANE(i) convert_uchar_rtn(v.s##i)
+  return (uchar16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_float_rte to each of the 16 lanes of v (char16 in, float16 out). */
+INLINE OVERLOADABLE float16 convert_float16_rte(char16 v) {
+#define CVT_LANE(i) convert_float_rte(v.s##i)
+  return (float16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_float_rtz to each of the 16 lanes of v (char16 in, float16 out). */
+INLINE OVERLOADABLE float16 convert_float16_rtz(char16 v) {
+#define CVT_LANE(i) convert_float_rtz(v.s##i)
+  return (float16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_float_rtp to each of the 16 lanes of v (char16 in, float16 out). */
+INLINE OVERLOADABLE float16 convert_float16_rtp(char16 v) {
+#define CVT_LANE(i) convert_float_rtp(v.s##i)
+  return (float16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_float_rtn to each of the 16 lanes of v (char16 in, float16 out). */
+INLINE OVERLOADABLE float16 convert_float16_rtn(char16 v) {
+#define CVT_LANE(i) convert_float_rtn(v.s##i)
+  return (float16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_long_rte to each of the 16 lanes of v (uchar16 in, long16 out). */
+INLINE OVERLOADABLE long16 convert_long16_rte(uchar16 v) {
+#define CVT_LANE(i) convert_long_rte(v.s##i)
+  return (long16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_long_rtz to each of the 16 lanes of v (uchar16 in, long16 out). */
+INLINE OVERLOADABLE long16 convert_long16_rtz(uchar16 v) {
+#define CVT_LANE(i) convert_long_rtz(v.s##i)
+  return (long16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_long_rtp to each of the 16 lanes of v (uchar16 in, long16 out). */
+INLINE OVERLOADABLE long16 convert_long16_rtp(uchar16 v) {
+#define CVT_LANE(i) convert_long_rtp(v.s##i)
+  return (long16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_long_rtn to each of the 16 lanes of v (uchar16 in, long16 out). */
+INLINE OVERLOADABLE long16 convert_long16_rtn(uchar16 v) {
+#define CVT_LANE(i) convert_long_rtn(v.s##i)
+  return (long16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_ulong_rte to each of the 16 lanes of v (uchar16 in, ulong16 out). */
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(uchar16 v) {
+#define CVT_LANE(i) convert_ulong_rte(v.s##i)
+  return (ulong16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_ulong_rtz to each of the 16 lanes of v (uchar16 in, ulong16 out). */
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(uchar16 v) {
+#define CVT_LANE(i) convert_ulong_rtz(v.s##i)
+  return (ulong16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_ulong_rtp to each of the 16 lanes of v (uchar16 in, ulong16 out). */
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(uchar16 v) {
+#define CVT_LANE(i) convert_ulong_rtp(v.s##i)
+  return (ulong16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_ulong_rtn to each of the 16 lanes of v (uchar16 in, ulong16 out). */
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(uchar16 v) {
+#define CVT_LANE(i) convert_ulong_rtn(v.s##i)
+  return (ulong16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_int_rte to each of the 16 lanes of v (uchar16 in, int16 out). */
+INLINE OVERLOADABLE int16 convert_int16_rte(uchar16 v) {
+#define CVT_LANE(i) convert_int_rte(v.s##i)
+  return (int16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_int_rtz to each of the 16 lanes of v (uchar16 in, int16 out). */
+INLINE OVERLOADABLE int16 convert_int16_rtz(uchar16 v) {
+#define CVT_LANE(i) convert_int_rtz(v.s##i)
+  return (int16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_int_rtp to each of the 16 lanes of v (uchar16 in, int16 out). */
+INLINE OVERLOADABLE int16 convert_int16_rtp(uchar16 v) {
+#define CVT_LANE(i) convert_int_rtp(v.s##i)
+  return (int16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_int_rtn to each of the 16 lanes of v (uchar16 in, int16 out). */
+INLINE OVERLOADABLE int16 convert_int16_rtn(uchar16 v) {
+#define CVT_LANE(i) convert_int_rtn(v.s##i)
+  return (int16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_uint_rte to each of the 16 lanes of v (uchar16 in, uint16 out). */
+INLINE OVERLOADABLE uint16 convert_uint16_rte(uchar16 v) {
+#define CVT_LANE(i) convert_uint_rte(v.s##i)
+  return (uint16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_uint_rtz to each of the 16 lanes of v (uchar16 in, uint16 out). */
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(uchar16 v) {
+#define CVT_LANE(i) convert_uint_rtz(v.s##i)
+  return (uint16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_uint_rtp to each of the 16 lanes of v (uchar16 in, uint16 out). */
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(uchar16 v) {
+#define CVT_LANE(i) convert_uint_rtp(v.s##i)
+  return (uint16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_uint_rtn to each of the 16 lanes of v (uchar16 in, uint16 out). */
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(uchar16 v) {
+#define CVT_LANE(i) convert_uint_rtn(v.s##i)
+  return (uint16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_short_rte to each of the 16 lanes of v (uchar16 in, short16 out). */
+INLINE OVERLOADABLE short16 convert_short16_rte(uchar16 v) {
+#define CVT_LANE(i) convert_short_rte(v.s##i)
+  return (short16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_short_rtz to each of the 16 lanes of v (uchar16 in, short16 out). */
+INLINE OVERLOADABLE short16 convert_short16_rtz(uchar16 v) {
+#define CVT_LANE(i) convert_short_rtz(v.s##i)
+  return (short16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_short_rtp to each of the 16 lanes of v (uchar16 in, short16 out). */
+INLINE OVERLOADABLE short16 convert_short16_rtp(uchar16 v) {
+#define CVT_LANE(i) convert_short_rtp(v.s##i)
+  return (short16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_short_rtn to each of the 16 lanes of v (uchar16 in, short16 out). */
+INLINE OVERLOADABLE short16 convert_short16_rtn(uchar16 v) {
+#define CVT_LANE(i) convert_short_rtn(v.s##i)
+  return (short16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_ushort_rte to each of the 16 lanes of v (uchar16 in, ushort16 out). */
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(uchar16 v) {
+#define CVT_LANE(i) convert_ushort_rte(v.s##i)
+  return (ushort16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_ushort_rtz to each of the 16 lanes of v (uchar16 in, ushort16 out). */
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(uchar16 v) {
+#define CVT_LANE(i) convert_ushort_rtz(v.s##i)
+  return (ushort16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_ushort_rtp to each of the 16 lanes of v (uchar16 in, ushort16 out). */
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(uchar16 v) {
+#define CVT_LANE(i) convert_ushort_rtp(v.s##i)
+  return (ushort16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_ushort_rtn to each of the 16 lanes of v (uchar16 in, ushort16 out). */
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(uchar16 v) {
+#define CVT_LANE(i) convert_ushort_rtn(v.s##i)
+  return (ushort16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_char_rte to each of the 16 lanes of v (uchar16 in, char16 out). */
+INLINE OVERLOADABLE char16 convert_char16_rte(uchar16 v) {
+#define CVT_LANE(i) convert_char_rte(v.s##i)
+  return (char16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_char_rtz to each of the 16 lanes of v (uchar16 in, char16 out). */
+INLINE OVERLOADABLE char16 convert_char16_rtz(uchar16 v) {
+#define CVT_LANE(i) convert_char_rtz(v.s##i)
+  return (char16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_char_rtp to each of the 16 lanes of v (uchar16 in, char16 out). */
+INLINE OVERLOADABLE char16 convert_char16_rtp(uchar16 v) {
+#define CVT_LANE(i) convert_char_rtp(v.s##i)
+  return (char16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_char_rtn to each of the 16 lanes of v (uchar16 in, char16 out). */
+INLINE OVERLOADABLE char16 convert_char16_rtn(uchar16 v) {
+#define CVT_LANE(i) convert_char_rtn(v.s##i)
+  return (char16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_uchar_rte to each of the 16 lanes of v (uchar16 in, uchar16 out). */
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(uchar16 v) {
+#define CVT_LANE(i) convert_uchar_rte(v.s##i)
+  return (uchar16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_uchar_rtz to each of the 16 lanes of v (uchar16 in, uchar16 out). */
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(uchar16 v) {
+#define CVT_LANE(i) convert_uchar_rtz(v.s##i)
+  return (uchar16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_uchar_rtp to each of the 16 lanes of v (uchar16 in, uchar16 out). */
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(uchar16 v) {
+#define CVT_LANE(i) convert_uchar_rtp(v.s##i)
+  return (uchar16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_uchar_rtn to each of the 16 lanes of v (uchar16 in, uchar16 out). */
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(uchar16 v) {
+#define CVT_LANE(i) convert_uchar_rtn(v.s##i)
+  return (uchar16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_float_rte to each of the 16 lanes of v (uchar16 in, float16 out). */
+INLINE OVERLOADABLE float16 convert_float16_rte(uchar16 v) {
+#define CVT_LANE(i) convert_float_rte(v.s##i)
+  return (float16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_float_rtz to each of the 16 lanes of v (uchar16 in, float16 out). */
+INLINE OVERLOADABLE float16 convert_float16_rtz(uchar16 v) {
+#define CVT_LANE(i) convert_float_rtz(v.s##i)
+  return (float16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_float_rtp to each of the 16 lanes of v (uchar16 in, float16 out). */
+INLINE OVERLOADABLE float16 convert_float16_rtp(uchar16 v) {
+#define CVT_LANE(i) convert_float_rtp(v.s##i)
+  return (float16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_float_rtn to each of the 16 lanes of v (uchar16 in, float16 out). */
+INLINE OVERLOADABLE float16 convert_float16_rtn(uchar16 v) {
+#define CVT_LANE(i) convert_float_rtn(v.s##i)
+  return (float16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_long_rte to each of the 16 lanes of v (float16 in, long16 out). */
+INLINE OVERLOADABLE long16 convert_long16_rte(float16 v) {
+#define CVT_LANE(i) convert_long_rte(v.s##i)
+  return (long16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_long_rtz to each of the 16 lanes of v (float16 in, long16 out). */
+INLINE OVERLOADABLE long16 convert_long16_rtz(float16 v) {
+#define CVT_LANE(i) convert_long_rtz(v.s##i)
+  return (long16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_long_rtp to each of the 16 lanes of v (float16 in, long16 out). */
+INLINE OVERLOADABLE long16 convert_long16_rtp(float16 v) {
+#define CVT_LANE(i) convert_long_rtp(v.s##i)
+  return (long16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_long_rtn to each of the 16 lanes of v (float16 in, long16 out). */
+INLINE OVERLOADABLE long16 convert_long16_rtn(float16 v) {
+#define CVT_LANE(i) convert_long_rtn(v.s##i)
+  return (long16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_ulong_rte to each of the 16 lanes of v (float16 in, ulong16 out). */
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(float16 v) {
+#define CVT_LANE(i) convert_ulong_rte(v.s##i)
+  return (ulong16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_ulong_rtz to each of the 16 lanes of v (float16 in, ulong16 out). */
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(float16 v) {
+#define CVT_LANE(i) convert_ulong_rtz(v.s##i)
+  return (ulong16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_ulong_rtp to each of the 16 lanes of v (float16 in, ulong16 out). */
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(float16 v) {
+#define CVT_LANE(i) convert_ulong_rtp(v.s##i)
+  return (ulong16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_ulong_rtn to each of the 16 lanes of v (float16 in, ulong16 out). */
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(float16 v) {
+#define CVT_LANE(i) convert_ulong_rtn(v.s##i)
+  return (ulong16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_int_rte to each of the 16 lanes of v (float16 in, int16 out). */
+INLINE OVERLOADABLE int16 convert_int16_rte(float16 v) {
+#define CVT_LANE(i) convert_int_rte(v.s##i)
+  return (int16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_int_rtz to each of the 16 lanes of v (float16 in, int16 out). */
+INLINE OVERLOADABLE int16 convert_int16_rtz(float16 v) {
+#define CVT_LANE(i) convert_int_rtz(v.s##i)
+  return (int16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_int_rtp to each of the 16 lanes of v (float16 in, int16 out). */
+INLINE OVERLOADABLE int16 convert_int16_rtp(float16 v) {
+#define CVT_LANE(i) convert_int_rtp(v.s##i)
+  return (int16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_int_rtn to each of the 16 lanes of v (float16 in, int16 out). */
+INLINE OVERLOADABLE int16 convert_int16_rtn(float16 v) {
+#define CVT_LANE(i) convert_int_rtn(v.s##i)
+  return (int16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_uint_rte to each of the 16 lanes of v (float16 in, uint16 out). */
+INLINE OVERLOADABLE uint16 convert_uint16_rte(float16 v) {
+#define CVT_LANE(i) convert_uint_rte(v.s##i)
+  return (uint16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_uint_rtz to each of the 16 lanes of v (float16 in, uint16 out). */
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(float16 v) {
+#define CVT_LANE(i) convert_uint_rtz(v.s##i)
+  return (uint16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_uint_rtp to each of the 16 lanes of v (float16 in, uint16 out). */
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(float16 v) {
+#define CVT_LANE(i) convert_uint_rtp(v.s##i)
+  return (uint16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_uint_rtn to each of the 16 lanes of v (float16 in, uint16 out). */
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(float16 v) {
+#define CVT_LANE(i) convert_uint_rtn(v.s##i)
+  return (uint16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_short_rte to each of the 16 lanes of v (float16 in, short16 out). */
+INLINE OVERLOADABLE short16 convert_short16_rte(float16 v) {
+#define CVT_LANE(i) convert_short_rte(v.s##i)
+  return (short16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_short_rtz to each of the 16 lanes of v (float16 in, short16 out). */
+INLINE OVERLOADABLE short16 convert_short16_rtz(float16 v) {
+#define CVT_LANE(i) convert_short_rtz(v.s##i)
+  return (short16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_short_rtp to each of the 16 lanes of v (float16 in, short16 out). */
+INLINE OVERLOADABLE short16 convert_short16_rtp(float16 v) {
+#define CVT_LANE(i) convert_short_rtp(v.s##i)
+  return (short16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_short_rtn to each of the 16 lanes of v (float16 in, short16 out). */
+INLINE OVERLOADABLE short16 convert_short16_rtn(float16 v) {
+#define CVT_LANE(i) convert_short_rtn(v.s##i)
+  return (short16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_ushort_rte to each of the 16 lanes of v (float16 in, ushort16 out). */
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(float16 v) {
+#define CVT_LANE(i) convert_ushort_rte(v.s##i)
+  return (ushort16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_ushort_rtz to each of the 16 lanes of v (float16 in, ushort16 out). */
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(float16 v) {
+#define CVT_LANE(i) convert_ushort_rtz(v.s##i)
+  return (ushort16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_ushort_rtp to each of the 16 lanes of v (float16 in, ushort16 out). */
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(float16 v) {
+#define CVT_LANE(i) convert_ushort_rtp(v.s##i)
+  return (ushort16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+/* Applies convert_ushort_rtn to each of the 16 lanes of v (float16 in, ushort16 out). */
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(float16 v) {
+#define CVT_LANE(i) convert_ushort_rtn(v.s##i)
+  return (ushort16)(CVT_LANE(0), CVT_LANE(1), CVT_LANE(2), CVT_LANE(3),
+    CVT_LANE(4), CVT_LANE(5), CVT_LANE(6), CVT_LANE(7),
+    CVT_LANE(8), CVT_LANE(9), CVT_LANE(A), CVT_LANE(B),
+    CVT_LANE(C), CVT_LANE(D), CVT_LANE(E), CVT_LANE(F));
+#undef CVT_LANE
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(float16 v) { /* Lane-wise float16 -> char16: every component s0..sF goes through scalar convert_char_rte. */
+  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(float16 v) { /* Lane-wise float16 -> char16: every component s0..sF goes through scalar convert_char_rtz. */
+  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(float16 v) { /* Lane-wise float16 -> char16: every component s0..sF goes through scalar convert_char_rtp. */
+  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(float16 v) { /* Lane-wise float16 -> char16: every component s0..sF goes through scalar convert_char_rtn. */
+  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(float16 v) { /* Lane-wise float16 -> uchar16: every component s0..sF goes through scalar convert_uchar_rte. */
+  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(float16 v) { /* Lane-wise float16 -> uchar16: every component s0..sF goes through scalar convert_uchar_rtz. */
+  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(float16 v) { /* Lane-wise float16 -> uchar16: every component s0..sF goes through scalar convert_uchar_rtp. */
+  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(float16 v) { /* Lane-wise float16 -> uchar16: every component s0..sF goes through scalar convert_uchar_rtn. */
+  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(float16 v) { /* Lane-wise float16 -> float16: every component s0..sF goes through scalar convert_float_rte. */
+  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(float16 v) { /* Lane-wise float16 -> float16: every component s0..sF goes through scalar convert_float_rtz. */
+  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(float16 v) { /* Lane-wise float16 -> float16: every component s0..sF goes through scalar convert_float_rtp. */
+  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(float16 v) { /* Lane-wise float16 -> float16: every component s0..sF goes through scalar convert_float_rtn. */
+  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
+INLINE_OVERLOADABLE long convert_long_sat_rte(long x) /* long -> long: the rounding suffix changes nothing here; all four variants pass x straight to convert_long_sat (declared elsewhere). */
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(long x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(long x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(long x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(long x) /* long -> ulong: the rounding suffix changes nothing here; all four variants pass x straight to convert_ulong_sat (declared elsewhere). */
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(long x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(long x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(long x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(long x) /* long -> int: the rounding suffix changes nothing here; all four variants pass x straight to convert_int_sat (declared elsewhere). */
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(long x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(long x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(long x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(long x) /* long -> uint: the rounding suffix changes nothing here; all four variants pass x straight to convert_uint_sat (declared elsewhere). */
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(long x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(long x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(long x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(long x) /* long -> short: the rounding suffix changes nothing here; all four variants pass x straight to convert_short_sat (declared elsewhere). */
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(long x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(long x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(long x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(long x) /* long -> ushort: the rounding suffix changes nothing here; all four variants pass x straight to convert_ushort_sat (declared elsewhere). */
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(long x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(long x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(long x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(long x) /* long -> char: the rounding suffix changes nothing here; all four variants pass x straight to convert_char_sat (declared elsewhere). */
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(long x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(long x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(long x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(long x) /* long -> uchar: the rounding suffix changes nothing here; all four variants pass x straight to convert_uchar_sat (declared elsewhere). */
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(long x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(long x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(long x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(ulong x) /* ulong -> long: the rounding suffix changes nothing here; all four variants pass x straight to convert_long_sat (declared elsewhere). */
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(ulong x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(ulong x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(ulong x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(ulong x) /* ulong -> ulong: the rounding suffix changes nothing here; all four variants pass x straight to convert_ulong_sat (declared elsewhere). */
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(ulong x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(ulong x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(ulong x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(ulong x) /* ulong -> int: the rounding suffix changes nothing here; all four variants pass x straight to convert_int_sat (declared elsewhere). */
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(ulong x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(ulong x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(ulong x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(ulong x) /* ulong -> uint: the rounding suffix changes nothing here; all four variants pass x straight to convert_uint_sat (declared elsewhere). */
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(ulong x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(ulong x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(ulong x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(ulong x) /* ulong -> short: the rounding suffix changes nothing here; all four variants pass x straight to convert_short_sat (declared elsewhere). */
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(ulong x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(ulong x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(ulong x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(ulong x) /* ulong -> ushort: the rounding suffix changes nothing here; all four variants pass x straight to convert_ushort_sat (declared elsewhere). */
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(ulong x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(ulong x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(ulong x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(ulong x) /* ulong -> char: the rounding suffix changes nothing here; all four variants pass x straight to convert_char_sat (declared elsewhere). */
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(ulong x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(ulong x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(ulong x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(ulong x) /* ulong -> uchar: the rounding suffix changes nothing here; all four variants pass x straight to convert_uchar_sat (declared elsewhere). */
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(ulong x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(ulong x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(ulong x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(int x) /* int -> long: the rounding suffix changes nothing here; all four variants pass x straight to convert_long_sat (declared elsewhere). */
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(int x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(int x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(int x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(int x) /* int -> ulong: the rounding suffix changes nothing here; all four variants pass x straight to convert_ulong_sat (declared elsewhere). */
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(int x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(int x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(int x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(int x) /* int -> int: the rounding suffix changes nothing here; all four variants pass x straight to convert_int_sat (declared elsewhere). */
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(int x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(int x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(int x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(int x) /* int -> uint: the rounding suffix changes nothing here; all four variants pass x straight to convert_uint_sat (declared elsewhere). */
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(int x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(int x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(int x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(int x) /* int -> short: the rounding suffix changes nothing here; all four variants pass x straight to convert_short_sat (declared elsewhere). */
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(int x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(int x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(int x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(int x) /* int -> ushort: the rounding suffix changes nothing here; all four variants pass x straight to convert_ushort_sat (declared elsewhere). */
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(int x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(int x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(int x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(int x) /* int -> char: the rounding suffix changes nothing here; all four variants pass x straight to convert_char_sat (declared elsewhere). */
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(int x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(int x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(int x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(int x) /* int -> uchar: the rounding suffix changes nothing here; all four variants pass x straight to convert_uchar_sat (declared elsewhere). */
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(int x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(int x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(int x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(uint x) /* uint -> long: the rounding suffix changes nothing here; all four variants pass x straight to convert_long_sat (declared elsewhere). */
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(uint x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(uint x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(uint x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(uint x) /* uint -> ulong: the rounding suffix changes nothing here; all four variants pass x straight to convert_ulong_sat (declared elsewhere). */
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(uint x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(uint x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(uint x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(uint x) /* uint -> int: the rounding suffix changes nothing here; all four variants pass x straight to convert_int_sat (declared elsewhere). */
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(uint x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(uint x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(uint x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(uint x) /* uint -> uint: the rounding suffix changes nothing here; all four variants pass x straight to convert_uint_sat (declared elsewhere). */
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(uint x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(uint x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(uint x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(uint x) /* uint -> short: the rounding suffix changes nothing here; all four variants pass x straight to convert_short_sat (declared elsewhere). */
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(uint x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(uint x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(uint x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(uint x) /* uint -> ushort: the rounding suffix changes nothing here; all four variants pass x straight to convert_ushort_sat (declared elsewhere). */
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(uint x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(uint x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(uint x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(uint x) /* uint -> char: the rounding suffix changes nothing here; all four variants pass x straight to convert_char_sat (declared elsewhere). */
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(uint x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(uint x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(uint x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(uint x) /* uint -> uchar: the rounding suffix changes nothing here; all four variants pass x straight to convert_uchar_sat (declared elsewhere). */
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(uint x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(uint x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(uint x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(short x) /* short -> long: the rounding suffix changes nothing here; all four variants pass x straight to convert_long_sat (declared elsewhere). */
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(short x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(short x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(short x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(short x) /* short -> ulong: the rounding suffix changes nothing here; all four variants pass x straight to convert_ulong_sat (declared elsewhere). */
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(short x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(short x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(short x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(short x) /* short -> int: the rounding suffix changes nothing here; all four variants pass x straight to convert_int_sat (declared elsewhere). */
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(short x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(short x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(short x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(short x) /* short -> uint: the rounding suffix changes nothing here; all four variants pass x straight to convert_uint_sat (declared elsewhere). */
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(short x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(short x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(short x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(short x) /* short -> short: the rounding suffix changes nothing here; all four variants pass x straight to convert_short_sat (declared elsewhere). */
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(short x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(short x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(short x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(short x) /* short -> ushort: the rounding suffix changes nothing here; all four variants pass x straight to convert_ushort_sat (declared elsewhere). */
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(short x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(short x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(short x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(short x) /* short -> char: the rounding suffix changes nothing here; all four variants pass x straight to convert_char_sat (declared elsewhere). */
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(short x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(short x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(short x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(short x) /* short -> uchar: the rounding suffix changes nothing here; all four variants pass x straight to convert_uchar_sat (declared elsewhere). */
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(short x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(short x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(short x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(ushort x) /* ushort -> long: the rounding suffix changes nothing here; all four variants pass x straight to convert_long_sat (declared elsewhere). */
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(ushort x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(ushort x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(ushort x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(ushort x) /* ushort -> ulong: the rounding suffix changes nothing here; all four variants pass x straight to convert_ulong_sat (declared elsewhere). */
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(ushort x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(ushort x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(ushort x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(ushort x) /* ushort -> int: the rounding suffix changes nothing here; all four variants pass x straight to convert_int_sat (declared elsewhere). */
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(ushort x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(ushort x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(ushort x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(ushort x) /* ushort -> uint: the rounding suffix changes nothing here; all four variants pass x straight to convert_uint_sat (declared elsewhere). */
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(ushort x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(ushort x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(ushort x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(ushort x) /* ushort -> short: the rounding suffix changes nothing here; all four variants pass x straight to convert_short_sat (declared elsewhere). */
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(ushort x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(ushort x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(ushort x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(ushort x) /* ushort -> ushort: the rounding suffix changes nothing here; all four variants pass x straight to convert_ushort_sat (declared elsewhere). */
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(ushort x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(ushort x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(ushort x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(ushort x) /* ushort -> char: the rounding suffix changes nothing here; all four variants pass x straight to convert_char_sat (declared elsewhere). */
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(ushort x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(ushort x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(ushort x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(ushort x) /* ushort -> uchar: the rounding suffix changes nothing here; all four variants pass x straight to convert_uchar_sat (declared elsewhere). */
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(ushort x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(ushort x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(ushort x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(char x) /* char -> long: the rounding suffix changes nothing here; all four variants pass x straight to convert_long_sat (declared elsewhere). */
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(char x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(char x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(char x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(char x) /* char -> ulong: the rounding suffix changes nothing here; all four variants pass x straight to convert_ulong_sat (declared elsewhere). */
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(char x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(char x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(char x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(char x) /* char -> int: the rounding suffix changes nothing here; all four variants pass x straight to convert_int_sat (declared elsewhere). */
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(char x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(char x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(char x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(char x) /* char -> uint: the rounding suffix changes nothing here; all four variants pass x straight to convert_uint_sat (declared elsewhere). */
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(char x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(char x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(char x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(char x) /* char -> short: the rounding suffix changes nothing here; all four variants pass x straight to convert_short_sat (declared elsewhere). */
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(char x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(char x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(char x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(char x) /* char -> ushort: the rounding suffix changes nothing here; all four variants pass x straight to convert_ushort_sat (declared elsewhere). */
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(char x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(char x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(char x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(char x) /* char -> char: the rounding suffix changes nothing here; all four variants pass x straight to convert_char_sat (declared elsewhere). */
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(char x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(char x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(char x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(char x) /* char -> uchar: the rounding suffix changes nothing here; all four variants pass x straight to convert_uchar_sat (declared elsewhere). */
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(char x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(char x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(char x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(uchar x) /* uchar -> long: the rounding suffix changes nothing here; all four variants pass x straight to convert_long_sat (declared elsewhere). */
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(uchar x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(uchar x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(uchar x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(uchar x) /* uchar -> ulong: the rounding suffix changes nothing here; all four variants pass x straight to convert_ulong_sat (declared elsewhere). */
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(uchar x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(uchar x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(uchar x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(uchar x) /* uchar -> int: the rounding suffix changes nothing here; all four variants pass x straight to convert_int_sat (declared elsewhere). */
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(uchar x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(uchar x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(uchar x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(uchar x) /* uchar -> uint: the rounding suffix changes nothing here; all four variants pass x straight to convert_uint_sat (declared elsewhere). */
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(uchar x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(uchar x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(uchar x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(uchar x) /* uchar -> short: the rounding suffix changes nothing here; all four variants pass x straight to convert_short_sat (declared elsewhere). */
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(uchar x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(uchar x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(uchar x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(uchar x) /* uchar -> ushort: the rounding suffix changes nothing here; all four variants pass x straight to convert_ushort_sat (declared elsewhere). */
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(uchar x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(uchar x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(uchar x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(uchar x) /* uchar -> char: the rounding suffix changes nothing here; all four variants pass x straight to convert_char_sat (declared elsewhere). */
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(uchar x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(uchar x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(uchar x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(uchar x) /* uchar -> uchar: the rounding suffix changes nothing here; all four variants pass x straight to convert_uchar_sat (declared elsewhere). */
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(uchar x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(uchar x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(uchar x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(float x) /* float -> long: each variant first passes x through its own helper (__gen_ocl_rnde / rndz / rndu / rndd, declared elsewhere; presumably the rounding step - confirm), then calls convert_long_sat. */
+{ return convert_long_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(float x)
+{ return convert_long_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(float x)
+{ return convert_long_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(float x)
+{ return convert_long_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(float x) /* float -> ulong: each variant first passes x through its own helper (__gen_ocl_rnde / rndz / rndu / rndd, declared elsewhere; presumably the rounding step - confirm), then calls convert_ulong_sat. */
+{ return convert_ulong_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(float x)
+{ return convert_ulong_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(float x)
+{ return convert_ulong_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(float x)
+{ return convert_ulong_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(float x) /* float -> int: each variant first passes x through its own helper (__gen_ocl_rnde / rndz / rndu / rndd, declared elsewhere; presumably the rounding step - confirm), then calls convert_int_sat. */
+{ return convert_int_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(float x)
+{ return convert_int_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(float x)
+{ return convert_int_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(float x)
+{ return convert_int_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(float x) /* float -> uint: each variant first passes x through its own helper (__gen_ocl_rnde / rndz / rndu / rndd, declared elsewhere; presumably the rounding step - confirm), then calls convert_uint_sat. */
+{ return convert_uint_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(float x)
+{ return convert_uint_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(float x)
+{ return convert_uint_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(float x)
+{ return convert_uint_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(float x) /* float -> short: each variant first passes x through its own helper (__gen_ocl_rnde / rndz / rndu / rndd, declared elsewhere; presumably the rounding step - confirm), then calls convert_short_sat. */
+{ return convert_short_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(float x)
+{ return convert_short_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(float x)
+{ return convert_short_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(float x)
+{ return convert_short_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(float x) /* float -> ushort: each variant first passes x through its own helper (__gen_ocl_rnde / rndz / rndu / rndd, declared elsewhere; presumably the rounding step - confirm), then calls convert_ushort_sat. */
+{ return convert_ushort_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(float x)
+{ return convert_ushort_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(float x)
+{ return convert_ushort_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(float x)
+{ return convert_ushort_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(float x) /* float -> char: each variant first passes x through its own helper (__gen_ocl_rnde / rndz / rndu / rndd, declared elsewhere; presumably the rounding step - confirm), then calls convert_char_sat. */
+{ return convert_char_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(float x)
+{ return convert_char_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(float x)
+{ return convert_char_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(float x)
+{ return convert_char_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(float x) /* float -> uchar: each variant first passes x through its own helper (__gen_ocl_rnde / rndz / rndu / rndd, declared elsewhere; presumably the rounding step - confirm), then calls convert_uchar_sat. */
+{ return convert_uchar_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(float x)
+{ return convert_uchar_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(float x)
+{ return convert_uchar_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(float x)
+{ return convert_uchar_sat(__gen_ocl_rndd(x)); }
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(long2 v) { /* long2 -> long2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (long2)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(long2 v) {
+  return (long2)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(long2 v) {
+  return (long2)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(long2 v) {
+  return (long2)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(long2 v) { /* long2 -> ulong2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (ulong2)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(long2 v) {
+  return (ulong2)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(long2 v) {
+  return (ulong2)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(long2 v) {
+  return (ulong2)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(long2 v) { /* long2 -> int2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (int2)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(long2 v) {
+  return (int2)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(long2 v) {
+  return (int2)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(long2 v) {
+  return (int2)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(long2 v) { /* long2 -> uint2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (uint2)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(long2 v) {
+  return (uint2)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(long2 v) {
+  return (uint2)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(long2 v) {
+  return (uint2)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(long2 v) { /* long2 -> short2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (short2)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(long2 v) {
+  return (short2)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(long2 v) {
+  return (short2)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(long2 v) {
+  return (short2)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(long2 v) { /* long2 -> ushort2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (ushort2)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(long2 v) {
+  return (ushort2)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(long2 v) {
+  return (ushort2)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(long2 v) {
+  return (ushort2)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(long2 v) { /* long2 -> char2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (char2)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(long2 v) {
+  return (char2)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(long2 v) {
+  return (char2)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(long2 v) {
+  return (char2)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(long2 v) { /* long2 -> uchar2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (uchar2)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(long2 v) {
+  return (uchar2)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(long2 v) {
+  return (uchar2)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(long2 v) {
+  return (uchar2)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(ulong2 v) { /* ulong2 -> long2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (long2)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(ulong2 v) {
+  return (long2)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(ulong2 v) {
+  return (long2)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(ulong2 v) {
+  return (long2)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(ulong2 v) { /* ulong2 -> ulong2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (ulong2)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(ulong2 v) {
+  return (ulong2)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(ulong2 v) {
+  return (ulong2)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(ulong2 v) {
+  return (ulong2)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(ulong2 v) { /* ulong2 -> int2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (int2)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(ulong2 v) {
+  return (int2)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(ulong2 v) {
+  return (int2)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(ulong2 v) {
+  return (int2)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(ulong2 v) { /* ulong2 -> uint2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (uint2)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(ulong2 v) {
+  return (uint2)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(ulong2 v) {
+  return (uint2)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(ulong2 v) {
+  return (uint2)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(ulong2 v) { /* ulong2 -> short2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (short2)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(ulong2 v) {
+  return (short2)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(ulong2 v) {
+  return (short2)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(ulong2 v) {
+  return (short2)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(ulong2 v) { /* ulong2 -> ushort2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (ushort2)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(ulong2 v) {
+  return (ushort2)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(ulong2 v) {
+  return (ushort2)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(ulong2 v) {
+  return (ushort2)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(ulong2 v) { /* ulong2 -> char2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (char2)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(ulong2 v) {
+  return (char2)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(ulong2 v) {
+  return (char2)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(ulong2 v) {
+  return (char2)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(ulong2 v) { /* ulong2 -> uchar2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (uchar2)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(ulong2 v) {
+  return (uchar2)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(ulong2 v) {
+  return (uchar2)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(ulong2 v) {
+  return (uchar2)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(int2 v) { /* int2 -> long2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (long2)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(int2 v) {
+  return (long2)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(int2 v) {
+  return (long2)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(int2 v) {
+  return (long2)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(int2 v) { /* int2 -> ulong2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (ulong2)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(int2 v) {
+  return (ulong2)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(int2 v) {
+  return (ulong2)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(int2 v) {
+  return (ulong2)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(int2 v) { /* int2 -> int2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (int2)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(int2 v) {
+  return (int2)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(int2 v) {
+  return (int2)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(int2 v) {
+  return (int2)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(int2 v) { /* int2 -> uint2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (uint2)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(int2 v) {
+  return (uint2)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(int2 v) {
+  return (uint2)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(int2 v) {
+  return (uint2)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(int2 v) { /* int2 -> short2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (short2)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(int2 v) {
+  return (short2)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(int2 v) {
+  return (short2)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(int2 v) {
+  return (short2)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(int2 v) { /* int2 -> ushort2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (ushort2)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(int2 v) {
+  return (ushort2)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(int2 v) {
+  return (ushort2)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(int2 v) {
+  return (ushort2)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(int2 v) { /* int2 -> char2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (char2)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(int2 v) {
+  return (char2)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(int2 v) {
+  return (char2)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(int2 v) {
+  return (char2)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(int2 v) { /* int2 -> uchar2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (uchar2)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(int2 v) {
+  return (uchar2)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(int2 v) {
+  return (uchar2)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(int2 v) {
+  return (uchar2)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(uint2 v) { /* uint2 -> long2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (long2)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(uint2 v) {
+  return (long2)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(uint2 v) {
+  return (long2)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(uint2 v) {
+  return (long2)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(uint2 v) { /* uint2 -> ulong2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (ulong2)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(uint2 v) {
+  return (ulong2)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(uint2 v) {
+  return (ulong2)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(uint2 v) {
+  return (ulong2)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(uint2 v) { /* uint2 -> int2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (int2)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(uint2 v) {
+  return (int2)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(uint2 v) {
+  return (int2)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(uint2 v) {
+  return (int2)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(uint2 v) { /* uint2 -> uint2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (uint2)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(uint2 v) {
+  return (uint2)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(uint2 v) {
+  return (uint2)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(uint2 v) {
+  return (uint2)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(uint2 v) { /* uint2 -> short2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (short2)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(uint2 v) {
+  return (short2)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(uint2 v) {
+  return (short2)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(uint2 v) {
+  return (short2)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(uint2 v) { /* uint2 -> ushort2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (ushort2)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(uint2 v) {
+  return (ushort2)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(uint2 v) {
+  return (ushort2)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(uint2 v) {
+  return (ushort2)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(uint2 v) { /* uint2 -> char2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (char2)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(uint2 v) {
+  return (char2)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(uint2 v) {
+  return (char2)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(uint2 v) {
+  return (char2)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(uint2 v) { /* uint2 -> uchar2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (uchar2)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(uint2 v) {
+  return (uchar2)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(uint2 v) {
+  return (uchar2)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(uint2 v) {
+  return (uchar2)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(short2 v) { /* short2 -> long2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (long2)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(short2 v) {
+  return (long2)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(short2 v) {
+  return (long2)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(short2 v) {
+  return (long2)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(short2 v) { /* short2 -> ulong2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (ulong2)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(short2 v) {
+  return (ulong2)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(short2 v) {
+  return (ulong2)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(short2 v) {
+  return (ulong2)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(short2 v) { /* short2 -> int2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (int2)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(short2 v) {
+  return (int2)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(short2 v) {
+  return (int2)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(short2 v) {
+  return (int2)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(short2 v) { /* short2 -> uint2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (uint2)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(short2 v) {
+  return (uint2)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(short2 v) {
+  return (uint2)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(short2 v) {
+  return (uint2)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(short2 v) { /* short2 -> short2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (short2)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(short2 v) {
+  return (short2)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(short2 v) {
+  return (short2)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(short2 v) {
+  return (short2)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(short2 v) { /* short2 -> ushort2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (ushort2)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(short2 v) {
+  return (ushort2)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(short2 v) {
+  return (ushort2)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(short2 v) {
+  return (ushort2)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(short2 v) { /* short2 -> char2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (char2)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(short2 v) {
+  return (char2)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(short2 v) {
+  return (char2)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(short2 v) {
+  return (char2)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(short2 v) { /* short2 -> uchar2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (uchar2)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(short2 v) {
+  return (uchar2)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(short2 v) {
+  return (uchar2)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(short2 v) {
+  return (uchar2)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(ushort2 v) { /* ushort2 -> long2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (long2)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(ushort2 v) {
+  return (long2)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(ushort2 v) {
+  return (long2)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(ushort2 v) {
+  return (long2)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(ushort2 v) { /* ushort2 -> ulong2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (ulong2)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(ushort2 v) {
+  return (ulong2)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(ushort2 v) {
+  return (ulong2)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(ushort2 v) {
+  return (ulong2)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(ushort2 v) { /* ushort2 -> int2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (int2)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(ushort2 v) {
+  return (int2)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(ushort2 v) {
+  return (int2)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(ushort2 v) {
+  return (int2)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(ushort2 v) { /* ushort2 -> uint2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (uint2)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(ushort2 v) {
+  return (uint2)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(ushort2 v) {
+  return (uint2)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(ushort2 v) {
+  return (uint2)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(ushort2 v) { /* ushort2 -> short2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (short2)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(ushort2 v) {
+  return (short2)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(ushort2 v) {
+  return (short2)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(ushort2 v) {
+  return (short2)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(ushort2 v) { /* ushort2 -> ushort2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (ushort2)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(ushort2 v) {
+  return (ushort2)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(ushort2 v) {
+  return (ushort2)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(ushort2 v) {
+  return (ushort2)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(ushort2 v) { /* ushort2 -> char2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (char2)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(ushort2 v) {
+  return (char2)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(ushort2 v) {
+  return (char2)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(ushort2 v) {
+  return (char2)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(ushort2 v) { /* ushort2 -> uchar2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (uchar2)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(ushort2 v) {
+  return (uchar2)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(ushort2 v) {
+  return (uchar2)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(ushort2 v) {
+  return (uchar2)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(char2 v) { /* char2 -> long2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (long2)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(char2 v) {
+  return (long2)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(char2 v) {
+  return (long2)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(char2 v) {
+  return (long2)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(char2 v) { /* char2 -> ulong2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (ulong2)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(char2 v) {
+  return (ulong2)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(char2 v) {
+  return (ulong2)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(char2 v) {
+  return (ulong2)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(char2 v) { /* char2 -> int2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (int2)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(char2 v) {
+  return (int2)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(char2 v) {
+  return (int2)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(char2 v) {
+  return (int2)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(char2 v) { /* char2 -> uint2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (uint2)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(char2 v) {
+  return (uint2)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(char2 v) {
+  return (uint2)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(char2 v) {
+  return (uint2)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(char2 v) { /* char2 -> short2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (short2)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(char2 v) {
+  return (short2)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(char2 v) {
+  return (short2)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(char2 v) {
+  return (short2)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(char2 v) { /* char2 -> ushort2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (ushort2)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(char2 v) {
+  return (ushort2)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(char2 v) {
+  return (ushort2)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(char2 v) {
+  return (ushort2)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(char2 v) { /* char2 -> char2, saturating; each rounding variant forwards both lanes to its scalar twin */
+  return (char2)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(char2 v) {
+  return (char2)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(char2 v) {
+  return (char2)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(char2 v) {
+  return (char2)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(char2 v) {
+  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(char2 v) {
+  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(char2 v) { /* per-component scalar convert_uchar_sat_rtp */
+  return (uchar2)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(char2 v) { /* per-component scalar convert_uchar_sat_rtn */
+  return (uchar2)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(uchar2 v) { /* per-component scalar convert_long_sat_rte */
+  return (long2)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(uchar2 v) { /* per-component scalar convert_long_sat_rtz */
+  return (long2)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(uchar2 v) { /* per-component scalar convert_long_sat_rtp */
+  return (long2)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(uchar2 v) { /* per-component scalar convert_long_sat_rtn */
+  return (long2)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(uchar2 v) { /* per-component scalar convert_ulong_sat_rte */
+  return (ulong2)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(uchar2 v) { /* per-component scalar convert_ulong_sat_rtz */
+  return (ulong2)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(uchar2 v) { /* per-component scalar convert_ulong_sat_rtp */
+  return (ulong2)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(uchar2 v) { /* per-component scalar convert_ulong_sat_rtn */
+  return (ulong2)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(uchar2 v) { /* per-component scalar convert_int_sat_rte */
+  return (int2)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(uchar2 v) { /* per-component scalar convert_int_sat_rtz */
+  return (int2)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(uchar2 v) { /* per-component scalar convert_int_sat_rtp */
+  return (int2)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(uchar2 v) { /* per-component scalar convert_int_sat_rtn */
+  return (int2)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(uchar2 v) { /* per-component scalar convert_uint_sat_rte */
+  return (uint2)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(uchar2 v) { /* per-component scalar convert_uint_sat_rtz */
+  return (uint2)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(uchar2 v) { /* per-component scalar convert_uint_sat_rtp */
+  return (uint2)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(uchar2 v) { /* per-component scalar convert_uint_sat_rtn */
+  return (uint2)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(uchar2 v) { /* per-component scalar convert_short_sat_rte */
+  return (short2)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(uchar2 v) { /* per-component scalar convert_short_sat_rtz */
+  return (short2)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(uchar2 v) { /* per-component scalar convert_short_sat_rtp */
+  return (short2)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(uchar2 v) { /* per-component scalar convert_short_sat_rtn */
+  return (short2)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(uchar2 v) { /* per-component scalar convert_ushort_sat_rte */
+  return (ushort2)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(uchar2 v) { /* per-component scalar convert_ushort_sat_rtz */
+  return (ushort2)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(uchar2 v) { /* per-component scalar convert_ushort_sat_rtp */
+  return (ushort2)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(uchar2 v) { /* per-component scalar convert_ushort_sat_rtn */
+  return (ushort2)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(uchar2 v) { /* per-component scalar convert_char_sat_rte */
+  return (char2)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(uchar2 v) { /* per-component scalar convert_char_sat_rtz */
+  return (char2)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(uchar2 v) { /* per-component scalar convert_char_sat_rtp */
+  return (char2)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(uchar2 v) { /* per-component scalar convert_char_sat_rtn */
+  return (char2)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(uchar2 v) { /* per-component scalar convert_uchar_sat_rte */
+  return (uchar2)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(uchar2 v) { /* per-component scalar convert_uchar_sat_rtz */
+  return (uchar2)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(uchar2 v) { /* per-component scalar convert_uchar_sat_rtp */
+  return (uchar2)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(uchar2 v) { /* per-component scalar convert_uchar_sat_rtn */
+  return (uchar2)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(float2 v) { /* per-component scalar convert_long_sat_rte */
+  return (long2)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(float2 v) { /* per-component scalar convert_long_sat_rtz */
+  return (long2)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(float2 v) { /* per-component scalar convert_long_sat_rtp */
+  return (long2)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(float2 v) { /* per-component scalar convert_long_sat_rtn */
+  return (long2)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(float2 v) { /* per-component scalar convert_ulong_sat_rte */
+  return (ulong2)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(float2 v) { /* per-component scalar convert_ulong_sat_rtz */
+  return (ulong2)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(float2 v) { /* per-component scalar convert_ulong_sat_rtp */
+  return (ulong2)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(float2 v) { /* per-component scalar convert_ulong_sat_rtn */
+  return (ulong2)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(float2 v) { /* per-component scalar convert_int_sat_rte */
+  return (int2)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(float2 v) { /* per-component scalar convert_int_sat_rtz */
+  return (int2)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(float2 v) { /* per-component scalar convert_int_sat_rtp */
+  return (int2)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(float2 v) { /* per-component scalar convert_int_sat_rtn */
+  return (int2)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(float2 v) { /* per-component scalar convert_uint_sat_rte */
+  return (uint2)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(float2 v) { /* per-component scalar convert_uint_sat_rtz */
+  return (uint2)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(float2 v) { /* per-component scalar convert_uint_sat_rtp */
+  return (uint2)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(float2 v) { /* per-component scalar convert_uint_sat_rtn */
+  return (uint2)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(float2 v) { /* per-component scalar convert_short_sat_rte */
+  return (short2)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(float2 v) { /* per-component scalar convert_short_sat_rtz */
+  return (short2)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(float2 v) { /* per-component scalar convert_short_sat_rtp */
+  return (short2)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(float2 v) { /* per-component scalar convert_short_sat_rtn */
+  return (short2)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(float2 v) { /* per-component scalar convert_ushort_sat_rte */
+  return (ushort2)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(float2 v) { /* per-component scalar convert_ushort_sat_rtz */
+  return (ushort2)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(float2 v) { /* per-component scalar convert_ushort_sat_rtp */
+  return (ushort2)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(float2 v) { /* per-component scalar convert_ushort_sat_rtn */
+  return (ushort2)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(float2 v) { /* per-component scalar convert_char_sat_rte */
+  return (char2)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(float2 v) { /* per-component scalar convert_char_sat_rtz */
+  return (char2)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(float2 v) { /* per-component scalar convert_char_sat_rtp */
+  return (char2)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(float2 v) { /* per-component scalar convert_char_sat_rtn */
+  return (char2)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(float2 v) { /* per-component scalar convert_uchar_sat_rte */
+  return (uchar2)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(float2 v) { /* per-component scalar convert_uchar_sat_rtz */
+  return (uchar2)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(float2 v) { /* per-component scalar convert_uchar_sat_rtp */
+  return (uchar2)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(float2 v) { /* per-component scalar convert_uchar_sat_rtn */
+  return (uchar2)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(long3 v) { /* per-component scalar convert_long_sat_rte */
+  return (long3)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y), convert_long_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(long3 v) { /* per-component scalar convert_long_sat_rtz */
+  return (long3)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y), convert_long_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(long3 v) { /* per-component scalar convert_long_sat_rtp */
+  return (long3)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y), convert_long_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(long3 v) { /* per-component scalar convert_long_sat_rtn */
+  return (long3)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y), convert_long_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(long3 v) { /* per-component scalar convert_ulong_sat_rte */
+  return (ulong3)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y), convert_ulong_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(long3 v) { /* per-component scalar convert_ulong_sat_rtz */
+  return (ulong3)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y), convert_ulong_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(long3 v) { /* per-component scalar convert_ulong_sat_rtp */
+  return (ulong3)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y), convert_ulong_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(long3 v) { /* per-component scalar convert_ulong_sat_rtn */
+  return (ulong3)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y), convert_ulong_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(long3 v) { /* per-component scalar convert_int_sat_rte */
+  return (int3)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y), convert_int_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(long3 v) { /* per-component scalar convert_int_sat_rtz */
+  return (int3)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y), convert_int_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(long3 v) { /* per-component scalar convert_int_sat_rtp */
+  return (int3)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y), convert_int_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(long3 v) { /* per-component scalar convert_int_sat_rtn */
+  return (int3)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y), convert_int_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(long3 v) { /* per-component scalar convert_uint_sat_rte */
+  return (uint3)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y), convert_uint_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(long3 v) { /* per-component scalar convert_uint_sat_rtz */
+  return (uint3)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y), convert_uint_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(long3 v) { /* per-component scalar convert_uint_sat_rtp */
+  return (uint3)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y), convert_uint_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(long3 v) { /* per-component scalar convert_uint_sat_rtn */
+  return (uint3)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y), convert_uint_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(long3 v) { /* per-component scalar convert_short_sat_rte */
+  return (short3)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y), convert_short_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(long3 v) { /* per-component scalar convert_short_sat_rtz */
+  return (short3)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y), convert_short_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(long3 v) { /* per-component scalar convert_short_sat_rtp */
+  return (short3)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y), convert_short_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(long3 v) { /* per-component scalar convert_short_sat_rtn */
+  return (short3)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y), convert_short_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(long3 v) { /* per-component scalar convert_ushort_sat_rte */
+  return (ushort3)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y), convert_ushort_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(long3 v) { /* per-component scalar convert_ushort_sat_rtz */
+  return (ushort3)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y), convert_ushort_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(long3 v) { /* per-component scalar convert_ushort_sat_rtp */
+  return (ushort3)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y), convert_ushort_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(long3 v) { /* per-component scalar convert_ushort_sat_rtn */
+  return (ushort3)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y), convert_ushort_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(long3 v) { /* per-component scalar convert_char_sat_rte */
+  return (char3)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y), convert_char_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(long3 v) { /* per-component scalar convert_char_sat_rtz */
+  return (char3)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y), convert_char_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(long3 v) { /* per-component scalar convert_char_sat_rtp */
+  return (char3)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y), convert_char_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(long3 v) { /* per-component scalar convert_char_sat_rtn */
+  return (char3)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y), convert_char_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(long3 v) { /* per-component scalar convert_uchar_sat_rte */
+  return (uchar3)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y), convert_uchar_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(long3 v) { /* per-component scalar convert_uchar_sat_rtz */
+  return (uchar3)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y), convert_uchar_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(long3 v) { /* per-component scalar convert_uchar_sat_rtp */
+  return (uchar3)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y), convert_uchar_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(long3 v) { /* per-component scalar convert_uchar_sat_rtn */
+  return (uchar3)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y), convert_uchar_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(ulong3 v) { /* per-component scalar convert_long_sat_rte */
+  return (long3)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y), convert_long_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(ulong3 v) { /* per-component scalar convert_long_sat_rtz */
+  return (long3)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y), convert_long_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(ulong3 v) { /* per-component scalar convert_long_sat_rtp */
+  return (long3)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y), convert_long_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(ulong3 v) { /* per-component scalar convert_long_sat_rtn */
+  return (long3)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y), convert_long_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(ulong3 v) { /* per-component scalar convert_ulong_sat_rte */
+  return (ulong3)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y), convert_ulong_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(ulong3 v) { /* per-component scalar convert_ulong_sat_rtz */
+  return (ulong3)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y), convert_ulong_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(ulong3 v) { /* per-component scalar convert_ulong_sat_rtp */
+  return (ulong3)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y), convert_ulong_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(ulong3 v) { /* per-component scalar convert_ulong_sat_rtn */
+  return (ulong3)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y), convert_ulong_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(ulong3 v) { /* per-component scalar convert_int_sat_rte */
+  return (int3)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y), convert_int_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(ulong3 v) { /* per-component scalar convert_int_sat_rtz */
+  return (int3)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y), convert_int_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(ulong3 v) { /* per-component scalar convert_int_sat_rtp */
+  return (int3)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y), convert_int_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(ulong3 v) { /* per-component scalar convert_int_sat_rtn */
+  return (int3)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y), convert_int_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(ulong3 v) { /* per-component scalar convert_uint_sat_rte */
+  return (uint3)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y), convert_uint_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(ulong3 v) { /* per-component scalar convert_uint_sat_rtz */
+  return (uint3)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y), convert_uint_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(ulong3 v) { /* per-component scalar convert_uint_sat_rtp */
+  return (uint3)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y), convert_uint_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(ulong3 v) { /* per-component scalar convert_uint_sat_rtn */
+  return (uint3)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y), convert_uint_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(ulong3 v) { /* per-component scalar convert_short_sat_rte */
+  return (short3)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y), convert_short_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(ulong3 v) { /* per-component scalar convert_short_sat_rtz */
+  return (short3)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y), convert_short_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(ulong3 v) { /* per-component scalar convert_short_sat_rtp */
+  return (short3)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y), convert_short_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(ulong3 v) { /* per-component scalar convert_short_sat_rtn */
+  return (short3)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y), convert_short_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(ulong3 v) { /* per-component scalar convert_ushort_sat_rte */
+  return (ushort3)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y), convert_ushort_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(ulong3 v) { /* per-component scalar convert_ushort_sat_rtz */
+  return (ushort3)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y), convert_ushort_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(ulong3 v) { /* per-component scalar convert_ushort_sat_rtp */
+  return (ushort3)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y), convert_ushort_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(ulong3 v) { /* per-component scalar convert_ushort_sat_rtn */
+  return (ushort3)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y), convert_ushort_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(ulong3 v) { /* per-component scalar convert_char_sat_rte */
+  return (char3)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y), convert_char_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(ulong3 v) { /* per-component scalar convert_char_sat_rtz */
+  return (char3)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y), convert_char_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(ulong3 v) { /* per-component scalar convert_char_sat_rtp */
+  return (char3)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y), convert_char_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(ulong3 v) { /* per-component scalar convert_char_sat_rtn */
+  return (char3)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y), convert_char_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(ulong3 v) { /* per-component scalar convert_uchar_sat_rte */
+  return (uchar3)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y), convert_uchar_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(ulong3 v) { /* per-component scalar convert_uchar_sat_rtz */
+  return (uchar3)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y), convert_uchar_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(ulong3 v) { /* per-component scalar convert_uchar_sat_rtp */
+  return (uchar3)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y), convert_uchar_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(ulong3 v) { /* per-component scalar convert_uchar_sat_rtn */
+  return (uchar3)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y), convert_uchar_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(int3 v) { /* per-component scalar convert_long_sat_rte */
+  return (long3)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y), convert_long_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(int3 v) { /* per-component scalar convert_long_sat_rtz */
+  return (long3)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y), convert_long_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(int3 v) { /* per-component scalar convert_long_sat_rtp */
+  return (long3)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y), convert_long_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(int3 v) { /* per-component scalar convert_long_sat_rtn */
+  return (long3)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y), convert_long_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(int3 v) { /* per-component scalar convert_ulong_sat_rte */
+  return (ulong3)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y), convert_ulong_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(int3 v) { /* per-component scalar convert_ulong_sat_rtz */
+  return (ulong3)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y), convert_ulong_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(int3 v) { /* per-component scalar convert_ulong_sat_rtp */
+  return (ulong3)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y), convert_ulong_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(int3 v) { /* per-component scalar convert_ulong_sat_rtn */
+  return (ulong3)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y), convert_ulong_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(int3 v) { /* per-component scalar convert_int_sat_rte */
+  return (int3)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y), convert_int_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(int3 v) { /* per-component scalar convert_int_sat_rtz */
+  return (int3)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y), convert_int_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(int3 v) { /* per-component scalar convert_int_sat_rtp */
+  return (int3)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y), convert_int_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(int3 v) { /* per-component scalar convert_int_sat_rtn */
+  return (int3)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y), convert_int_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(int3 v) { /* per-component scalar convert_uint_sat_rte */
+  return (uint3)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y), convert_uint_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(int3 v) { /* per-component scalar convert_uint_sat_rtz */
+  return (uint3)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y), convert_uint_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(int3 v) { /* per-component scalar convert_uint_sat_rtp */
+  return (uint3)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y), convert_uint_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(int3 v) { /* per-component scalar convert_uint_sat_rtn */
+  return (uint3)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y), convert_uint_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(int3 v) { /* per-component scalar convert_short_sat_rte */
+  return (short3)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y), convert_short_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(int3 v) { /* per-component scalar convert_short_sat_rtz */
+  return (short3)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y), convert_short_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(int3 v) { /* per-component scalar convert_short_sat_rtp */
+  return (short3)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y), convert_short_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(int3 v) { /* per-component scalar convert_short_sat_rtn */
+  return (short3)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y), convert_short_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(int3 v) { /* per-component scalar convert_ushort_sat_rte */
+  return (ushort3)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y), convert_ushort_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(int3 v) { /* per-component scalar convert_ushort_sat_rtz */
+  return (ushort3)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y), convert_ushort_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(int3 v) { /* per-component scalar convert_ushort_sat_rtp */
+  return (ushort3)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y), convert_ushort_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(int3 v) { /* per-component scalar convert_ushort_sat_rtn */
+  return (ushort3)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y), convert_ushort_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(int3 v) { /* per-component scalar convert_char_sat_rte */
+  return (char3)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y), convert_char_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(int3 v) { /* per-component scalar convert_char_sat_rtz */
+  return (char3)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y), convert_char_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(int3 v) { /* per-component scalar convert_char_sat_rtp */
+  return (char3)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y), convert_char_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(int3 v) { /* per-component scalar convert_char_sat_rtn */
+  return (char3)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y), convert_char_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(int3 v) { /* per-component scalar convert_uchar_sat_rte */
+  return (uchar3)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y), convert_uchar_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(int3 v) { /* per-component scalar convert_uchar_sat_rtz */
+  return (uchar3)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y), convert_uchar_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(int3 v) { /* per-component scalar convert_uchar_sat_rtp */
+  return (uchar3)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y), convert_uchar_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(int3 v) { /* per-component scalar convert_uchar_sat_rtn */
+  return (uchar3)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y), convert_uchar_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(uint3 v) { /* per-component scalar convert_long_sat_rte */
+  return (long3)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y), convert_long_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(uint3 v) { /* per-component scalar convert_long_sat_rtz */
+  return (long3)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y), convert_long_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(uint3 v) { /* per-component scalar convert_long_sat_rtp */
+  return (long3)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y), convert_long_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(uint3 v) { /* per-component scalar convert_long_sat_rtn */
+  return (long3)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y), convert_long_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(uint3 v) { /* per-component scalar convert_ulong_sat_rte */
+  return (ulong3)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y), convert_ulong_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(uint3 v) { /* per-component scalar convert_ulong_sat_rtz */
+  return (ulong3)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y), convert_ulong_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(uint3 v) { /* per-component scalar convert_ulong_sat_rtp */
+  return (ulong3)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y), convert_ulong_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(uint3 v) { /* per-component scalar convert_ulong_sat_rtn */
+  return (ulong3)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y), convert_ulong_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(uint3 v) { /* per-component scalar convert_int_sat_rte */
+  return (int3)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y), convert_int_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(uint3 v) { /* per-component scalar convert_int_sat_rtz */
+  return (int3)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y), convert_int_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(uint3 v) { /* per-component scalar convert_int_sat_rtp */
+  return (int3)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y), convert_int_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(uint3 v) { /* per-component scalar convert_int_sat_rtn */
+  return (int3)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y), convert_int_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(uint3 v) { /* per-component scalar convert_uint_sat_rte */
+  return (uint3)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y), convert_uint_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(uint3 v) { /* per-component scalar convert_uint_sat_rtz */
+  return (uint3)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y), convert_uint_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(uint3 v) { /* per-component scalar convert_uint_sat_rtp */
+  return (uint3)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y), convert_uint_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(uint3 v) { /* per-component scalar convert_uint_sat_rtn */
+  return (uint3)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y), convert_uint_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(uint3 v) { /* per-component scalar convert_short_sat_rte */
+  return (short3)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y), convert_short_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(uint3 v) { /* per-component scalar convert_short_sat_rtz */
+  return (short3)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y), convert_short_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(uint3 v) { /* per-component scalar convert_short_sat_rtp */
+  return (short3)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y), convert_short_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(uint3 v) { /* per-component scalar convert_short_sat_rtn */
+  return (short3)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y), convert_short_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(uint3 v) { /* per-component scalar convert_ushort_sat_rte */
+  return (ushort3)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y), convert_ushort_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(uint3 v) { /* per-component scalar convert_ushort_sat_rtz */
+  return (ushort3)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y), convert_ushort_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(uint3 v) { /* per-component scalar convert_ushort_sat_rtp */
+  return (ushort3)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y), convert_ushort_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(uint3 v) { /* per-component scalar convert_ushort_sat_rtn */
+  return (ushort3)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y), convert_ushort_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(uint3 v) { /* per-component scalar convert_char_sat_rte */
+  return (char3)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y), convert_char_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(uint3 v) { /* per-component scalar convert_char_sat_rtz */
+  return (char3)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y), convert_char_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(uint3 v) { /* per-component scalar convert_char_sat_rtp */
+  return (char3)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y), convert_char_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(uint3 v) { /* per-component scalar convert_char_sat_rtn */
+  return (char3)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y), convert_char_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(uint3 v) { /* per-component scalar convert_uchar_sat_rte */
+  return (uchar3)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y), convert_uchar_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(uint3 v) { /* per-component scalar convert_uchar_sat_rtz */
+  return (uchar3)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y), convert_uchar_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(uint3 v) { /* per-component scalar convert_uchar_sat_rtp */
+  return (uchar3)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y), convert_uchar_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(uint3 v) { /* per-component scalar convert_uchar_sat_rtn */
+  return (uchar3)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y), convert_uchar_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(short3 v) { /* per-component scalar convert_long_sat_rte */
+  return (long3)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y), convert_long_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(short3 v) { /* per-component scalar convert_long_sat_rtz */
+  return (long3)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y), convert_long_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(short3 v) { /* per-component scalar convert_long_sat_rtp */
+  return (long3)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y), convert_long_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(short3 v) { /* per-component scalar convert_long_sat_rtn */
+  return (long3)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y), convert_long_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(short3 v) { /* per-component scalar convert_ulong_sat_rte */
+  return (ulong3)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y), convert_ulong_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(short3 v) { /* short3 -> ulong3, one component at a time via convert_ulong_sat_rtz */
+  return (ulong3)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y), convert_ulong_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(short3 v) { /* short3 -> ulong3, one component at a time via convert_ulong_sat_rtp */
+  return (ulong3)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y), convert_ulong_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(short3 v) { /* short3 -> ulong3, one component at a time via convert_ulong_sat_rtn */
+  return (ulong3)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y), convert_ulong_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(short3 v) { /* short3 -> int3, one component at a time via convert_int_sat_rte */
+  return (int3)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y), convert_int_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(short3 v) { /* short3 -> int3, one component at a time via convert_int_sat_rtz */
+  return (int3)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y), convert_int_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(short3 v) { /* short3 -> int3, one component at a time via convert_int_sat_rtp */
+  return (int3)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y), convert_int_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(short3 v) { /* short3 -> int3, one component at a time via convert_int_sat_rtn */
+  return (int3)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y), convert_int_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(short3 v) { /* short3 -> uint3, one component at a time via convert_uint_sat_rte */
+  return (uint3)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y), convert_uint_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(short3 v) { /* short3 -> uint3, one component at a time via convert_uint_sat_rtz */
+  return (uint3)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y), convert_uint_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(short3 v) { /* short3 -> uint3, one component at a time via convert_uint_sat_rtp */
+  return (uint3)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y), convert_uint_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(short3 v) { /* short3 -> uint3, one component at a time via convert_uint_sat_rtn */
+  return (uint3)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y), convert_uint_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(short3 v) { /* short3 -> short3, one component at a time via convert_short_sat_rte */
+  return (short3)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y), convert_short_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(short3 v) { /* short3 -> short3, one component at a time via convert_short_sat_rtz */
+  return (short3)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y), convert_short_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(short3 v) { /* short3 -> short3, one component at a time via convert_short_sat_rtp */
+  return (short3)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y), convert_short_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(short3 v) { /* short3 -> short3, one component at a time via convert_short_sat_rtn */
+  return (short3)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y), convert_short_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(short3 v) { /* short3 -> ushort3, one component at a time via convert_ushort_sat_rte */
+  return (ushort3)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y), convert_ushort_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(short3 v) { /* short3 -> ushort3, one component at a time via convert_ushort_sat_rtz */
+  return (ushort3)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y), convert_ushort_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(short3 v) { /* short3 -> ushort3, one component at a time via convert_ushort_sat_rtp */
+  return (ushort3)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y), convert_ushort_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(short3 v) { /* short3 -> ushort3, one component at a time via convert_ushort_sat_rtn */
+  return (ushort3)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y), convert_ushort_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(short3 v) { /* short3 -> char3, one component at a time via convert_char_sat_rte */
+  return (char3)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y), convert_char_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(short3 v) { /* short3 -> char3, one component at a time via convert_char_sat_rtz */
+  return (char3)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y), convert_char_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(short3 v) { /* short3 -> char3, one component at a time via convert_char_sat_rtp */
+  return (char3)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y), convert_char_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(short3 v) { /* short3 -> char3, one component at a time via convert_char_sat_rtn */
+  return (char3)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y), convert_char_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(short3 v) { /* short3 -> uchar3, one component at a time via convert_uchar_sat_rte */
+  return (uchar3)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y), convert_uchar_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(short3 v) { /* short3 -> uchar3, one component at a time via convert_uchar_sat_rtz */
+  return (uchar3)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y), convert_uchar_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(short3 v) { /* short3 -> uchar3, one component at a time via convert_uchar_sat_rtp */
+  return (uchar3)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y), convert_uchar_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(short3 v) { /* short3 -> uchar3, one component at a time via convert_uchar_sat_rtn */
+  return (uchar3)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y), convert_uchar_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(ushort3 v) { /* ushort3 -> long3, one component at a time via convert_long_sat_rte */
+  return (long3)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y), convert_long_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(ushort3 v) { /* ushort3 -> long3, one component at a time via convert_long_sat_rtz */
+  return (long3)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y), convert_long_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(ushort3 v) { /* ushort3 -> long3, one component at a time via convert_long_sat_rtp */
+  return (long3)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y), convert_long_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(ushort3 v) { /* ushort3 -> long3, one component at a time via convert_long_sat_rtn */
+  return (long3)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y), convert_long_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(ushort3 v) { /* ushort3 -> ulong3, one component at a time via convert_ulong_sat_rte */
+  return (ulong3)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y), convert_ulong_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(ushort3 v) { /* ushort3 -> ulong3, one component at a time via convert_ulong_sat_rtz */
+  return (ulong3)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y), convert_ulong_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(ushort3 v) { /* ushort3 -> ulong3, one component at a time via convert_ulong_sat_rtp */
+  return (ulong3)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y), convert_ulong_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(ushort3 v) { /* ushort3 -> ulong3, one component at a time via convert_ulong_sat_rtn */
+  return (ulong3)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y), convert_ulong_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(ushort3 v) { /* ushort3 -> int3, one component at a time via convert_int_sat_rte */
+  return (int3)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y), convert_int_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(ushort3 v) { /* ushort3 -> int3, one component at a time via convert_int_sat_rtz */
+  return (int3)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y), convert_int_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(ushort3 v) { /* ushort3 -> int3, one component at a time via convert_int_sat_rtp */
+  return (int3)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y), convert_int_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(ushort3 v) { /* ushort3 -> int3, one component at a time via convert_int_sat_rtn */
+  return (int3)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y), convert_int_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(ushort3 v) { /* ushort3 -> uint3, one component at a time via convert_uint_sat_rte */
+  return (uint3)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y), convert_uint_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(ushort3 v) { /* ushort3 -> uint3, one component at a time via convert_uint_sat_rtz */
+  return (uint3)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y), convert_uint_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(ushort3 v) { /* ushort3 -> uint3, one component at a time via convert_uint_sat_rtp */
+  return (uint3)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y), convert_uint_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(ushort3 v) { /* ushort3 -> uint3, one component at a time via convert_uint_sat_rtn */
+  return (uint3)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y), convert_uint_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(ushort3 v) { /* ushort3 -> short3, one component at a time via convert_short_sat_rte */
+  return (short3)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y), convert_short_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(ushort3 v) { /* ushort3 -> short3, one component at a time via convert_short_sat_rtz */
+  return (short3)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y), convert_short_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(ushort3 v) { /* ushort3 -> short3, one component at a time via convert_short_sat_rtp */
+  return (short3)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y), convert_short_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(ushort3 v) { /* ushort3 -> short3, one component at a time via convert_short_sat_rtn */
+  return (short3)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y), convert_short_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(ushort3 v) { /* ushort3 -> ushort3, one component at a time via convert_ushort_sat_rte */
+  return (ushort3)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y), convert_ushort_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(ushort3 v) { /* ushort3 -> ushort3, one component at a time via convert_ushort_sat_rtz */
+  return (ushort3)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y), convert_ushort_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(ushort3 v) { /* ushort3 -> ushort3, one component at a time via convert_ushort_sat_rtp */
+  return (ushort3)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y), convert_ushort_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(ushort3 v) { /* ushort3 -> ushort3, one component at a time via convert_ushort_sat_rtn */
+  return (ushort3)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y), convert_ushort_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(ushort3 v) { /* ushort3 -> char3, one component at a time via convert_char_sat_rte */
+  return (char3)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y), convert_char_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(ushort3 v) { /* ushort3 -> char3, one component at a time via convert_char_sat_rtz */
+  return (char3)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y), convert_char_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(ushort3 v) { /* ushort3 -> char3, one component at a time via convert_char_sat_rtp */
+  return (char3)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y), convert_char_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(ushort3 v) { /* ushort3 -> char3, one component at a time via convert_char_sat_rtn */
+  return (char3)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y), convert_char_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(ushort3 v) { /* ushort3 -> uchar3, one component at a time via convert_uchar_sat_rte */
+  return (uchar3)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y), convert_uchar_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(ushort3 v) { /* ushort3 -> uchar3, one component at a time via convert_uchar_sat_rtz */
+  return (uchar3)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y), convert_uchar_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(ushort3 v) { /* ushort3 -> uchar3, one component at a time via convert_uchar_sat_rtp */
+  return (uchar3)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y), convert_uchar_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(ushort3 v) { /* ushort3 -> uchar3, one component at a time via convert_uchar_sat_rtn */
+  return (uchar3)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y), convert_uchar_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(char3 v) { /* char3 -> long3, one component at a time via convert_long_sat_rte */
+  return (long3)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y), convert_long_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(char3 v) { /* char3 -> long3, one component at a time via convert_long_sat_rtz */
+  return (long3)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y), convert_long_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(char3 v) { /* char3 -> long3, one component at a time via convert_long_sat_rtp */
+  return (long3)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y), convert_long_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(char3 v) { /* char3 -> long3, one component at a time via convert_long_sat_rtn */
+  return (long3)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y), convert_long_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(char3 v) { /* char3 -> ulong3, one component at a time via convert_ulong_sat_rte */
+  return (ulong3)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y), convert_ulong_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(char3 v) { /* char3 -> ulong3, one component at a time via convert_ulong_sat_rtz */
+  return (ulong3)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y), convert_ulong_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(char3 v) { /* char3 -> ulong3, one component at a time via convert_ulong_sat_rtp */
+  return (ulong3)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y), convert_ulong_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(char3 v) { /* char3 -> ulong3, one component at a time via convert_ulong_sat_rtn */
+  return (ulong3)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y), convert_ulong_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(char3 v) { /* char3 -> int3, one component at a time via convert_int_sat_rte */
+  return (int3)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y), convert_int_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(char3 v) { /* char3 -> int3, one component at a time via convert_int_sat_rtz */
+  return (int3)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y), convert_int_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(char3 v) { /* char3 -> int3, one component at a time via convert_int_sat_rtp */
+  return (int3)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y), convert_int_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(char3 v) { /* char3 -> int3, one component at a time via convert_int_sat_rtn */
+  return (int3)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y), convert_int_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(char3 v) { /* char3 -> uint3, one component at a time via convert_uint_sat_rte */
+  return (uint3)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y), convert_uint_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(char3 v) { /* char3 -> uint3, one component at a time via convert_uint_sat_rtz */
+  return (uint3)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y), convert_uint_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(char3 v) { /* char3 -> uint3, one component at a time via convert_uint_sat_rtp */
+  return (uint3)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y), convert_uint_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(char3 v) { /* char3 -> uint3, one component at a time via convert_uint_sat_rtn */
+  return (uint3)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y), convert_uint_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(char3 v) { /* char3 -> short3, one component at a time via convert_short_sat_rte */
+  return (short3)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y), convert_short_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(char3 v) { /* char3 -> short3, one component at a time via convert_short_sat_rtz */
+  return (short3)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y), convert_short_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(char3 v) { /* char3 -> short3, one component at a time via convert_short_sat_rtp */
+  return (short3)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y), convert_short_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(char3 v) { /* char3 -> short3, one component at a time via convert_short_sat_rtn */
+  return (short3)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y), convert_short_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(char3 v) { /* char3 -> ushort3, one component at a time via convert_ushort_sat_rte */
+  return (ushort3)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y), convert_ushort_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(char3 v) { /* char3 -> ushort3, one component at a time via convert_ushort_sat_rtz */
+  return (ushort3)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y), convert_ushort_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(char3 v) { /* char3 -> ushort3, one component at a time via convert_ushort_sat_rtp */
+  return (ushort3)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y), convert_ushort_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(char3 v) { /* char3 -> ushort3, one component at a time via convert_ushort_sat_rtn */
+  return (ushort3)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y), convert_ushort_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(char3 v) { /* char3 -> char3, one component at a time via convert_char_sat_rte */
+  return (char3)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y), convert_char_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(char3 v) { /* char3 -> char3, one component at a time via convert_char_sat_rtz */
+  return (char3)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y), convert_char_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(char3 v) { /* char3 -> char3, one component at a time via convert_char_sat_rtp */
+  return (char3)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y), convert_char_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(char3 v) { /* char3 -> char3, one component at a time via convert_char_sat_rtn */
+  return (char3)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y), convert_char_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(char3 v) { /* char3 -> uchar3, one component at a time via convert_uchar_sat_rte */
+  return (uchar3)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y), convert_uchar_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(char3 v) { /* char3 -> uchar3, one component at a time via convert_uchar_sat_rtz */
+  return (uchar3)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y), convert_uchar_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(char3 v) { /* char3 -> uchar3, one component at a time via convert_uchar_sat_rtp */
+  return (uchar3)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y), convert_uchar_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(char3 v) { /* char3 -> uchar3, one component at a time via convert_uchar_sat_rtn */
+  return (uchar3)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y), convert_uchar_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(uchar3 v) { /* uchar3 -> long3, one component at a time via convert_long_sat_rte */
+  return (long3)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y), convert_long_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(uchar3 v) { /* uchar3 -> long3, one component at a time via convert_long_sat_rtz */
+  return (long3)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y), convert_long_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(uchar3 v) { /* uchar3 -> long3, one component at a time via convert_long_sat_rtp */
+  return (long3)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y), convert_long_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(uchar3 v) { /* uchar3 -> long3, one component at a time via convert_long_sat_rtn */
+  return (long3)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y), convert_long_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(uchar3 v) { /* uchar3 -> ulong3, one component at a time via convert_ulong_sat_rte */
+  return (ulong3)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y), convert_ulong_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(uchar3 v) { /* uchar3 -> ulong3, one component at a time via convert_ulong_sat_rtz */
+  return (ulong3)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y), convert_ulong_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(uchar3 v) { /* uchar3 -> ulong3, one component at a time via convert_ulong_sat_rtp */
+  return (ulong3)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y), convert_ulong_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(uchar3 v) { /* uchar3 -> ulong3, one component at a time via convert_ulong_sat_rtn */
+  return (ulong3)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y), convert_ulong_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(uchar3 v) { /* uchar3 -> int3, one component at a time via convert_int_sat_rte */
+  return (int3)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y), convert_int_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(uchar3 v) { /* uchar3 -> int3, one component at a time via convert_int_sat_rtz */
+  return (int3)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y), convert_int_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(uchar3 v) { /* uchar3 -> int3, one component at a time via convert_int_sat_rtp */
+  return (int3)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y), convert_int_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(uchar3 v) { /* uchar3 -> int3, one component at a time via convert_int_sat_rtn */
+  return (int3)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y), convert_int_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(uchar3 v) { /* uchar3 -> uint3, one component at a time via convert_uint_sat_rte */
+  return (uint3)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y), convert_uint_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(uchar3 v) { /* uchar3 -> uint3, one component at a time via convert_uint_sat_rtz */
+  return (uint3)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y), convert_uint_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(uchar3 v) { /* uchar3 -> uint3, one component at a time via convert_uint_sat_rtp */
+  return (uint3)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y), convert_uint_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(uchar3 v) { /* uchar3 -> uint3, one component at a time via convert_uint_sat_rtn */
+  return (uint3)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y), convert_uint_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(uchar3 v) { /* uchar3 -> short3, one component at a time via convert_short_sat_rte */
+  return (short3)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y), convert_short_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(uchar3 v) { /* uchar3 -> short3, one component at a time via convert_short_sat_rtz */
+  return (short3)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y), convert_short_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(uchar3 v) { /* uchar3 -> short3, one component at a time via convert_short_sat_rtp */
+  return (short3)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y), convert_short_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(uchar3 v) { /* uchar3 -> short3, one component at a time via convert_short_sat_rtn */
+  return (short3)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y), convert_short_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(uchar3 v) { /* uchar3 -> ushort3, one component at a time via convert_ushort_sat_rte */
+  return (ushort3)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y), convert_ushort_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(uchar3 v) { /* uchar3 -> ushort3, one component at a time via convert_ushort_sat_rtz */
+  return (ushort3)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y), convert_ushort_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(uchar3 v) { /* uchar3 -> ushort3, one component at a time via convert_ushort_sat_rtp */
+  return (ushort3)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y), convert_ushort_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(uchar3 v) { /* uchar3 -> ushort3, one component at a time via convert_ushort_sat_rtn */
+  return (ushort3)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y), convert_ushort_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(uchar3 v) { /* uchar3 -> char3, one component at a time via convert_char_sat_rte */
+  return (char3)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y), convert_char_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(uchar3 v) { /* uchar3 -> char3, one component at a time via convert_char_sat_rtz */
+  return (char3)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y), convert_char_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(uchar3 v) { /* uchar3 -> char3, one component at a time via convert_char_sat_rtp */
+  return (char3)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y), convert_char_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(uchar3 v) { /* uchar3 -> char3, one component at a time via convert_char_sat_rtn */
+  return (char3)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y), convert_char_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(uchar3 v) { /* uchar3 -> uchar3, one component at a time via convert_uchar_sat_rte */
+  return (uchar3)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y), convert_uchar_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(uchar3 v) { /* uchar3 -> uchar3, one component at a time via convert_uchar_sat_rtz */
+  return (uchar3)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y), convert_uchar_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(uchar3 v) { /* uchar3 -> uchar3, one component at a time via convert_uchar_sat_rtp */
+  return (uchar3)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y), convert_uchar_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(uchar3 v) { /* uchar3 -> uchar3, one component at a time via convert_uchar_sat_rtn */
+  return (uchar3)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y), convert_uchar_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(float3 v) { /* float3 -> long3, one component at a time via convert_long_sat_rte */
+  return (long3)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y), convert_long_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(float3 v) { /* float3 -> long3, one component at a time via convert_long_sat_rtz */
+  return (long3)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y), convert_long_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(float3 v) { /* float3 -> long3, one component at a time via convert_long_sat_rtp */
+  return (long3)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y), convert_long_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(float3 v) { /* float3 -> long3, one component at a time via convert_long_sat_rtn */
+  return (long3)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y), convert_long_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(float3 v) { /* float3 -> ulong3, one component at a time via convert_ulong_sat_rte */
+  return (ulong3)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y), convert_ulong_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(float3 v) { /* float3 -> ulong3, one component at a time via convert_ulong_sat_rtz */
+  return (ulong3)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y), convert_ulong_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(float3 v) { /* float3 -> ulong3, one component at a time via convert_ulong_sat_rtp */
+  return (ulong3)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y), convert_ulong_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(float3 v) { /* float3 -> ulong3, one component at a time via convert_ulong_sat_rtn */
+  return (ulong3)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y), convert_ulong_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(float3 v) { /* float3 -> int3, one component at a time via convert_int_sat_rte */
+  return (int3)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y), convert_int_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(float3 v) { /* float3 -> int3, one component at a time via convert_int_sat_rtz */
+  return (int3)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y), convert_int_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(float3 v) { /* float3 -> int3, one component at a time via convert_int_sat_rtp */
+  return (int3)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y), convert_int_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(float3 v) { /* float3 -> int3, one component at a time via convert_int_sat_rtn */
+  return (int3)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y), convert_int_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(float3 v) { /* float3 -> uint3, one component at a time via convert_uint_sat_rte */
+  return (uint3)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y), convert_uint_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(float3 v) { /* float3 -> uint3, one component at a time via convert_uint_sat_rtz */
+  return (uint3)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y), convert_uint_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(float3 v) { /* float3 -> uint3, one component at a time via convert_uint_sat_rtp */
+  return (uint3)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y), convert_uint_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(float3 v) { /* float3 -> uint3, one component at a time via convert_uint_sat_rtn */
+  return (uint3)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y), convert_uint_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(float3 v) { /* float3 -> short3, one component at a time via convert_short_sat_rte */
+  return (short3)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y), convert_short_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(float3 v) { /* float3 -> short3, one component at a time via convert_short_sat_rtz */
+  return (short3)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y), convert_short_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(float3 v) { /* float3 -> short3, one component at a time via convert_short_sat_rtp */
+  return (short3)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y), convert_short_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(float3 v) { /* float3 -> short3, one component at a time via convert_short_sat_rtn */
+  return (short3)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y), convert_short_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(float3 v) { /* float3 -> ushort3, one component at a time via convert_ushort_sat_rte */
+  return (ushort3)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y), convert_ushort_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(float3 v) { /* float3 -> ushort3, one component at a time via convert_ushort_sat_rtz */
+  return (ushort3)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y), convert_ushort_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(float3 v) { /* float3 -> ushort3, one component at a time via convert_ushort_sat_rtp */
+  return (ushort3)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y), convert_ushort_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(float3 v) { /* float3 -> ushort3, one component at a time via convert_ushort_sat_rtn */
+  return (ushort3)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y), convert_ushort_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(float3 v) { /* float3 -> char3, one component at a time via convert_char_sat_rte */
+  return (char3)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y), convert_char_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(float3 v) { /* float3 -> char3, one component at a time via convert_char_sat_rtz */
+  return (char3)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y), convert_char_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(float3 v) { /* float3 -> char3, one component at a time via convert_char_sat_rtp */
+  return (char3)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y), convert_char_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(float3 v) { /* float3 -> char3, one component at a time via convert_char_sat_rtn */
+  return (char3)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y), convert_char_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(float3 v) { /* float3 -> uchar3, one component at a time via convert_uchar_sat_rte */
+  return (uchar3)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y), convert_uchar_sat_rte(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(float3 v) { /* float3 -> uchar3, one component at a time via convert_uchar_sat_rtz */
+  return (uchar3)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y), convert_uchar_sat_rtz(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(float3 v) { /* float3 -> uchar3, one component at a time via convert_uchar_sat_rtp */
+  return (uchar3)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y), convert_uchar_sat_rtp(v.z));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(float3 v) { /* float3 -> uchar3, one component at a time via convert_uchar_sat_rtn */
+  return (uchar3)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y), convert_uchar_sat_rtn(v.z));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(long4 v) { /* long4 -> long4, one component at a time via convert_long_sat_rte */
+  return (long4)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y), convert_long_sat_rte(v.z), convert_long_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(long4 v) { /* long4 -> long4, one component at a time via convert_long_sat_rtz */
+  return (long4)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y), convert_long_sat_rtz(v.z), convert_long_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(long4 v) { /* long4 -> long4, one component at a time via convert_long_sat_rtp */
+  return (long4)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y), convert_long_sat_rtp(v.z), convert_long_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(long4 v) { /* long4 -> long4, one component at a time via convert_long_sat_rtn */
+  return (long4)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y), convert_long_sat_rtn(v.z), convert_long_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(long4 v) { /* long4 -> ulong4, one component at a time via convert_ulong_sat_rte */
+  return (ulong4)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y), convert_ulong_sat_rte(v.z), convert_ulong_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(long4 v) { /* long4 -> ulong4, one component at a time via convert_ulong_sat_rtz */
+  return (ulong4)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y), convert_ulong_sat_rtz(v.z), convert_ulong_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(long4 v) { /* long4 -> ulong4, one component at a time via convert_ulong_sat_rtp */
+  return (ulong4)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y), convert_ulong_sat_rtp(v.z), convert_ulong_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(long4 v) { /* long4 -> ulong4, one component at a time via convert_ulong_sat_rtn */
+  return (ulong4)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y), convert_ulong_sat_rtn(v.z), convert_ulong_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(long4 v) { /* long4 -> int4, one component at a time via convert_int_sat_rte */
+  return (int4)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y), convert_int_sat_rte(v.z), convert_int_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(long4 v) { /* long4 -> int4, one component at a time via convert_int_sat_rtz */
+  return (int4)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y), convert_int_sat_rtz(v.z), convert_int_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(long4 v) { /* long4 -> int4, one component at a time via convert_int_sat_rtp */
+  return (int4)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y), convert_int_sat_rtp(v.z), convert_int_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(long4 v) { /* long4 -> int4, one component at a time via convert_int_sat_rtn */
+  return (int4)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y), convert_int_sat_rtn(v.z), convert_int_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(long4 v) { /* long4 -> uint4, one component at a time via convert_uint_sat_rte */
+  return (uint4)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y), convert_uint_sat_rte(v.z), convert_uint_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(long4 v) { /* long4 -> uint4, one component at a time via convert_uint_sat_rtz */
+  return (uint4)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y), convert_uint_sat_rtz(v.z), convert_uint_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(long4 v) { /* long4 -> uint4, one component at a time via convert_uint_sat_rtp */
+  return (uint4)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y), convert_uint_sat_rtp(v.z), convert_uint_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(long4 v) { /* long4 -> uint4, one component at a time via convert_uint_sat_rtn */
+  return (uint4)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y), convert_uint_sat_rtn(v.z), convert_uint_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(long4 v) { /* long4 -> short4, one component at a time via convert_short_sat_rte */
+  return (short4)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y), convert_short_sat_rte(v.z), convert_short_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(long4 v) { /* long4 -> short4, one component at a time via convert_short_sat_rtz */
+  return (short4)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y), convert_short_sat_rtz(v.z), convert_short_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(long4 v) { /* long4 -> short4, one component at a time via convert_short_sat_rtp */
+  return (short4)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y), convert_short_sat_rtp(v.z), convert_short_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(long4 v) { /* long4 -> short4, one component at a time via convert_short_sat_rtn */
+  return (short4)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y), convert_short_sat_rtn(v.z), convert_short_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(long4 v) { /* long4 -> ushort4, one component at a time via convert_ushort_sat_rte */
+  return (ushort4)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y), convert_ushort_sat_rte(v.z), convert_ushort_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(long4 v) { /* long4 -> ushort4, one component at a time via convert_ushort_sat_rtz */
+  return (ushort4)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y), convert_ushort_sat_rtz(v.z), convert_ushort_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(long4 v) { /* long4 -> ushort4, one component at a time via convert_ushort_sat_rtp */
+  return (ushort4)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y), convert_ushort_sat_rtp(v.z), convert_ushort_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(long4 v) { /* long4 -> ushort4, one component at a time via convert_ushort_sat_rtn */
+  return (ushort4)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y), convert_ushort_sat_rtn(v.z), convert_ushort_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(long4 v) { /* long4 -> char4, one component at a time via convert_char_sat_rte */
+  return (char4)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y), convert_char_sat_rte(v.z), convert_char_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(long4 v) { /* long4 -> char4, one component at a time via convert_char_sat_rtz */
+  return (char4)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y), convert_char_sat_rtz(v.z), convert_char_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(long4 v) { /* long4 -> char4, one component at a time via convert_char_sat_rtp */
+  return (char4)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y), convert_char_sat_rtp(v.z), convert_char_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(long4 v) { /* applies scalar convert_char_sat_rtn to each of v's four components */
+  return (char4)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y), convert_char_sat_rtn(v.z), convert_char_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(long4 v) { /* applies scalar convert_uchar_sat_rte to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y), convert_uchar_sat_rte(v.z), convert_uchar_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(long4 v) { /* applies scalar convert_uchar_sat_rtz to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y), convert_uchar_sat_rtz(v.z), convert_uchar_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(long4 v) { /* applies scalar convert_uchar_sat_rtp to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y), convert_uchar_sat_rtp(v.z), convert_uchar_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(long4 v) { /* applies scalar convert_uchar_sat_rtn to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y), convert_uchar_sat_rtn(v.z), convert_uchar_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(ulong4 v) { /* applies scalar convert_long_sat_rte to each of v's four components */
+  return (long4)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y), convert_long_sat_rte(v.z), convert_long_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(ulong4 v) { /* applies scalar convert_long_sat_rtz to each of v's four components */
+  return (long4)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y), convert_long_sat_rtz(v.z), convert_long_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(ulong4 v) { /* applies scalar convert_long_sat_rtp to each of v's four components */
+  return (long4)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y), convert_long_sat_rtp(v.z), convert_long_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(ulong4 v) { /* applies scalar convert_long_sat_rtn to each of v's four components */
+  return (long4)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y), convert_long_sat_rtn(v.z), convert_long_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(ulong4 v) { /* applies scalar convert_ulong_sat_rte to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y), convert_ulong_sat_rte(v.z), convert_ulong_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(ulong4 v) { /* applies scalar convert_ulong_sat_rtz to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y), convert_ulong_sat_rtz(v.z), convert_ulong_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(ulong4 v) { /* applies scalar convert_ulong_sat_rtp to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y), convert_ulong_sat_rtp(v.z), convert_ulong_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(ulong4 v) { /* applies scalar convert_ulong_sat_rtn to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y), convert_ulong_sat_rtn(v.z), convert_ulong_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(ulong4 v) { /* applies scalar convert_int_sat_rte to each of v's four components */
+  return (int4)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y), convert_int_sat_rte(v.z), convert_int_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(ulong4 v) { /* applies scalar convert_int_sat_rtz to each of v's four components */
+  return (int4)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y), convert_int_sat_rtz(v.z), convert_int_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(ulong4 v) { /* applies scalar convert_int_sat_rtp to each of v's four components */
+  return (int4)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y), convert_int_sat_rtp(v.z), convert_int_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(ulong4 v) { /* applies scalar convert_int_sat_rtn to each of v's four components */
+  return (int4)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y), convert_int_sat_rtn(v.z), convert_int_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(ulong4 v) { /* applies scalar convert_uint_sat_rte to each of v's four components */
+  return (uint4)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y), convert_uint_sat_rte(v.z), convert_uint_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(ulong4 v) { /* applies scalar convert_uint_sat_rtz to each of v's four components */
+  return (uint4)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y), convert_uint_sat_rtz(v.z), convert_uint_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(ulong4 v) { /* applies scalar convert_uint_sat_rtp to each of v's four components */
+  return (uint4)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y), convert_uint_sat_rtp(v.z), convert_uint_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(ulong4 v) { /* applies scalar convert_uint_sat_rtn to each of v's four components */
+  return (uint4)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y), convert_uint_sat_rtn(v.z), convert_uint_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(ulong4 v) { /* applies scalar convert_short_sat_rte to each of v's four components */
+  return (short4)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y), convert_short_sat_rte(v.z), convert_short_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(ulong4 v) { /* applies scalar convert_short_sat_rtz to each of v's four components */
+  return (short4)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y), convert_short_sat_rtz(v.z), convert_short_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(ulong4 v) { /* applies scalar convert_short_sat_rtp to each of v's four components */
+  return (short4)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y), convert_short_sat_rtp(v.z), convert_short_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(ulong4 v) { /* applies scalar convert_short_sat_rtn to each of v's four components */
+  return (short4)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y), convert_short_sat_rtn(v.z), convert_short_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(ulong4 v) { /* applies scalar convert_ushort_sat_rte to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y), convert_ushort_sat_rte(v.z), convert_ushort_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(ulong4 v) { /* applies scalar convert_ushort_sat_rtz to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y), convert_ushort_sat_rtz(v.z), convert_ushort_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(ulong4 v) { /* applies scalar convert_ushort_sat_rtp to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y), convert_ushort_sat_rtp(v.z), convert_ushort_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(ulong4 v) { /* applies scalar convert_ushort_sat_rtn to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y), convert_ushort_sat_rtn(v.z), convert_ushort_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(ulong4 v) { /* applies scalar convert_char_sat_rte to each of v's four components */
+  return (char4)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y), convert_char_sat_rte(v.z), convert_char_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(ulong4 v) { /* applies scalar convert_char_sat_rtz to each of v's four components */
+  return (char4)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y), convert_char_sat_rtz(v.z), convert_char_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(ulong4 v) { /* applies scalar convert_char_sat_rtp to each of v's four components */
+  return (char4)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y), convert_char_sat_rtp(v.z), convert_char_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(ulong4 v) { /* applies scalar convert_char_sat_rtn to each of v's four components */
+  return (char4)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y), convert_char_sat_rtn(v.z), convert_char_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(ulong4 v) { /* applies scalar convert_uchar_sat_rte to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y), convert_uchar_sat_rte(v.z), convert_uchar_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(ulong4 v) { /* applies scalar convert_uchar_sat_rtz to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y), convert_uchar_sat_rtz(v.z), convert_uchar_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(ulong4 v) { /* applies scalar convert_uchar_sat_rtp to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y), convert_uchar_sat_rtp(v.z), convert_uchar_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(ulong4 v) { /* applies scalar convert_uchar_sat_rtn to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y), convert_uchar_sat_rtn(v.z), convert_uchar_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(int4 v) { /* applies scalar convert_long_sat_rte to each of v's four components */
+  return (long4)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y), convert_long_sat_rte(v.z), convert_long_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(int4 v) { /* applies scalar convert_long_sat_rtz to each of v's four components */
+  return (long4)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y), convert_long_sat_rtz(v.z), convert_long_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(int4 v) { /* applies scalar convert_long_sat_rtp to each of v's four components */
+  return (long4)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y), convert_long_sat_rtp(v.z), convert_long_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(int4 v) { /* applies scalar convert_long_sat_rtn to each of v's four components */
+  return (long4)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y), convert_long_sat_rtn(v.z), convert_long_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(int4 v) { /* applies scalar convert_ulong_sat_rte to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y), convert_ulong_sat_rte(v.z), convert_ulong_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(int4 v) { /* applies scalar convert_ulong_sat_rtz to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y), convert_ulong_sat_rtz(v.z), convert_ulong_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(int4 v) { /* applies scalar convert_ulong_sat_rtp to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y), convert_ulong_sat_rtp(v.z), convert_ulong_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(int4 v) { /* applies scalar convert_ulong_sat_rtn to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y), convert_ulong_sat_rtn(v.z), convert_ulong_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(int4 v) { /* applies scalar convert_int_sat_rte to each of v's four components */
+  return (int4)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y), convert_int_sat_rte(v.z), convert_int_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(int4 v) { /* applies scalar convert_int_sat_rtz to each of v's four components */
+  return (int4)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y), convert_int_sat_rtz(v.z), convert_int_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(int4 v) { /* applies scalar convert_int_sat_rtp to each of v's four components */
+  return (int4)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y), convert_int_sat_rtp(v.z), convert_int_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(int4 v) { /* applies scalar convert_int_sat_rtn to each of v's four components */
+  return (int4)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y), convert_int_sat_rtn(v.z), convert_int_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(int4 v) { /* applies scalar convert_uint_sat_rte to each of v's four components */
+  return (uint4)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y), convert_uint_sat_rte(v.z), convert_uint_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(int4 v) { /* applies scalar convert_uint_sat_rtz to each of v's four components */
+  return (uint4)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y), convert_uint_sat_rtz(v.z), convert_uint_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(int4 v) { /* applies scalar convert_uint_sat_rtp to each of v's four components */
+  return (uint4)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y), convert_uint_sat_rtp(v.z), convert_uint_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(int4 v) { /* applies scalar convert_uint_sat_rtn to each of v's four components */
+  return (uint4)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y), convert_uint_sat_rtn(v.z), convert_uint_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(int4 v) { /* applies scalar convert_short_sat_rte to each of v's four components */
+  return (short4)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y), convert_short_sat_rte(v.z), convert_short_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(int4 v) { /* applies scalar convert_short_sat_rtz to each of v's four components */
+  return (short4)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y), convert_short_sat_rtz(v.z), convert_short_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(int4 v) { /* applies scalar convert_short_sat_rtp to each of v's four components */
+  return (short4)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y), convert_short_sat_rtp(v.z), convert_short_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(int4 v) { /* applies scalar convert_short_sat_rtn to each of v's four components */
+  return (short4)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y), convert_short_sat_rtn(v.z), convert_short_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(int4 v) { /* applies scalar convert_ushort_sat_rte to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y), convert_ushort_sat_rte(v.z), convert_ushort_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(int4 v) { /* applies scalar convert_ushort_sat_rtz to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y), convert_ushort_sat_rtz(v.z), convert_ushort_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(int4 v) { /* applies scalar convert_ushort_sat_rtp to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y), convert_ushort_sat_rtp(v.z), convert_ushort_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(int4 v) { /* applies scalar convert_ushort_sat_rtn to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y), convert_ushort_sat_rtn(v.z), convert_ushort_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(int4 v) { /* applies scalar convert_char_sat_rte to each of v's four components */
+  return (char4)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y), convert_char_sat_rte(v.z), convert_char_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(int4 v) { /* applies scalar convert_char_sat_rtz to each of v's four components */
+  return (char4)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y), convert_char_sat_rtz(v.z), convert_char_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(int4 v) { /* applies scalar convert_char_sat_rtp to each of v's four components */
+  return (char4)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y), convert_char_sat_rtp(v.z), convert_char_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(int4 v) { /* applies scalar convert_char_sat_rtn to each of v's four components */
+  return (char4)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y), convert_char_sat_rtn(v.z), convert_char_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(int4 v) { /* applies scalar convert_uchar_sat_rte to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y), convert_uchar_sat_rte(v.z), convert_uchar_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(int4 v) { /* applies scalar convert_uchar_sat_rtz to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y), convert_uchar_sat_rtz(v.z), convert_uchar_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(int4 v) { /* applies scalar convert_uchar_sat_rtp to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y), convert_uchar_sat_rtp(v.z), convert_uchar_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(int4 v) { /* applies scalar convert_uchar_sat_rtn to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y), convert_uchar_sat_rtn(v.z), convert_uchar_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(uint4 v) { /* applies scalar convert_long_sat_rte to each of v's four components */
+  return (long4)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y), convert_long_sat_rte(v.z), convert_long_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(uint4 v) { /* applies scalar convert_long_sat_rtz to each of v's four components */
+  return (long4)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y), convert_long_sat_rtz(v.z), convert_long_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(uint4 v) { /* applies scalar convert_long_sat_rtp to each of v's four components */
+  return (long4)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y), convert_long_sat_rtp(v.z), convert_long_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(uint4 v) { /* applies scalar convert_long_sat_rtn to each of v's four components */
+  return (long4)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y), convert_long_sat_rtn(v.z), convert_long_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(uint4 v) { /* applies scalar convert_ulong_sat_rte to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y), convert_ulong_sat_rte(v.z), convert_ulong_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(uint4 v) { /* applies scalar convert_ulong_sat_rtz to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y), convert_ulong_sat_rtz(v.z), convert_ulong_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(uint4 v) { /* applies scalar convert_ulong_sat_rtp to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y), convert_ulong_sat_rtp(v.z), convert_ulong_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(uint4 v) { /* applies scalar convert_ulong_sat_rtn to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y), convert_ulong_sat_rtn(v.z), convert_ulong_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(uint4 v) { /* applies scalar convert_int_sat_rte to each of v's four components */
+  return (int4)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y), convert_int_sat_rte(v.z), convert_int_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(uint4 v) { /* applies scalar convert_int_sat_rtz to each of v's four components */
+  return (int4)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y), convert_int_sat_rtz(v.z), convert_int_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(uint4 v) { /* applies scalar convert_int_sat_rtp to each of v's four components */
+  return (int4)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y), convert_int_sat_rtp(v.z), convert_int_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(uint4 v) { /* applies scalar convert_int_sat_rtn to each of v's four components */
+  return (int4)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y), convert_int_sat_rtn(v.z), convert_int_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(uint4 v) { /* applies scalar convert_uint_sat_rte to each of v's four components */
+  return (uint4)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y), convert_uint_sat_rte(v.z), convert_uint_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(uint4 v) { /* applies scalar convert_uint_sat_rtz to each of v's four components */
+  return (uint4)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y), convert_uint_sat_rtz(v.z), convert_uint_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(uint4 v) { /* applies scalar convert_uint_sat_rtp to each of v's four components */
+  return (uint4)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y), convert_uint_sat_rtp(v.z), convert_uint_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(uint4 v) { /* applies scalar convert_uint_sat_rtn to each of v's four components */
+  return (uint4)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y), convert_uint_sat_rtn(v.z), convert_uint_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(uint4 v) { /* applies scalar convert_short_sat_rte to each of v's four components */
+  return (short4)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y), convert_short_sat_rte(v.z), convert_short_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(uint4 v) { /* applies scalar convert_short_sat_rtz to each of v's four components */
+  return (short4)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y), convert_short_sat_rtz(v.z), convert_short_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(uint4 v) { /* applies scalar convert_short_sat_rtp to each of v's four components */
+  return (short4)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y), convert_short_sat_rtp(v.z), convert_short_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(uint4 v) { /* applies scalar convert_short_sat_rtn to each of v's four components */
+  return (short4)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y), convert_short_sat_rtn(v.z), convert_short_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(uint4 v) { /* applies scalar convert_ushort_sat_rte to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y), convert_ushort_sat_rte(v.z), convert_ushort_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(uint4 v) { /* applies scalar convert_ushort_sat_rtz to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y), convert_ushort_sat_rtz(v.z), convert_ushort_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(uint4 v) { /* applies scalar convert_ushort_sat_rtp to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y), convert_ushort_sat_rtp(v.z), convert_ushort_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(uint4 v) { /* applies scalar convert_ushort_sat_rtn to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y), convert_ushort_sat_rtn(v.z), convert_ushort_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(uint4 v) { /* applies scalar convert_char_sat_rte to each of v's four components */
+  return (char4)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y), convert_char_sat_rte(v.z), convert_char_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(uint4 v) { /* applies scalar convert_char_sat_rtz to each of v's four components */
+  return (char4)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y), convert_char_sat_rtz(v.z), convert_char_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(uint4 v) { /* applies scalar convert_char_sat_rtp to each of v's four components */
+  return (char4)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y), convert_char_sat_rtp(v.z), convert_char_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(uint4 v) { /* applies scalar convert_char_sat_rtn to each of v's four components */
+  return (char4)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y), convert_char_sat_rtn(v.z), convert_char_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(uint4 v) { /* applies scalar convert_uchar_sat_rte to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y), convert_uchar_sat_rte(v.z), convert_uchar_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(uint4 v) { /* applies scalar convert_uchar_sat_rtz to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y), convert_uchar_sat_rtz(v.z), convert_uchar_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(uint4 v) { /* applies scalar convert_uchar_sat_rtp to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y), convert_uchar_sat_rtp(v.z), convert_uchar_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(uint4 v) { /* applies scalar convert_uchar_sat_rtn to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y), convert_uchar_sat_rtn(v.z), convert_uchar_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(short4 v) { /* applies scalar convert_long_sat_rte to each of v's four components */
+  return (long4)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y), convert_long_sat_rte(v.z), convert_long_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(short4 v) { /* applies scalar convert_long_sat_rtz to each of v's four components */
+  return (long4)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y), convert_long_sat_rtz(v.z), convert_long_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(short4 v) { /* applies scalar convert_long_sat_rtp to each of v's four components */
+  return (long4)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y), convert_long_sat_rtp(v.z), convert_long_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(short4 v) { /* applies scalar convert_long_sat_rtn to each of v's four components */
+  return (long4)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y), convert_long_sat_rtn(v.z), convert_long_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(short4 v) { /* applies scalar convert_ulong_sat_rte to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y), convert_ulong_sat_rte(v.z), convert_ulong_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(short4 v) { /* applies scalar convert_ulong_sat_rtz to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y), convert_ulong_sat_rtz(v.z), convert_ulong_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(short4 v) { /* applies scalar convert_ulong_sat_rtp to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y), convert_ulong_sat_rtp(v.z), convert_ulong_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(short4 v) { /* applies scalar convert_ulong_sat_rtn to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y), convert_ulong_sat_rtn(v.z), convert_ulong_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(short4 v) { /* applies scalar convert_int_sat_rte to each of v's four components */
+  return (int4)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y), convert_int_sat_rte(v.z), convert_int_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(short4 v) { /* applies scalar convert_int_sat_rtz to each of v's four components */
+  return (int4)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y), convert_int_sat_rtz(v.z), convert_int_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(short4 v) { /* applies scalar convert_int_sat_rtp to each of v's four components */
+  return (int4)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y), convert_int_sat_rtp(v.z), convert_int_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(short4 v) { /* applies scalar convert_int_sat_rtn to each of v's four components */
+  return (int4)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y), convert_int_sat_rtn(v.z), convert_int_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(short4 v) { /* applies scalar convert_uint_sat_rte to each of v's four components */
+  return (uint4)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y), convert_uint_sat_rte(v.z), convert_uint_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(short4 v) { /* applies scalar convert_uint_sat_rtz to each of v's four components */
+  return (uint4)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y), convert_uint_sat_rtz(v.z), convert_uint_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(short4 v) { /* applies scalar convert_uint_sat_rtp to each of v's four components */
+  return (uint4)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y), convert_uint_sat_rtp(v.z), convert_uint_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(short4 v) { /* applies scalar convert_uint_sat_rtn to each of v's four components */
+  return (uint4)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y), convert_uint_sat_rtn(v.z), convert_uint_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(short4 v) { /* applies scalar convert_short_sat_rte to each of v's four components */
+  return (short4)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y), convert_short_sat_rte(v.z), convert_short_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(short4 v) { /* applies scalar convert_short_sat_rtz to each of v's four components */
+  return (short4)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y), convert_short_sat_rtz(v.z), convert_short_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(short4 v) { /* applies scalar convert_short_sat_rtp to each of v's four components */
+  return (short4)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y), convert_short_sat_rtp(v.z), convert_short_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(short4 v) { /* applies scalar convert_short_sat_rtn to each of v's four components */
+  return (short4)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y), convert_short_sat_rtn(v.z), convert_short_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(short4 v) { /* applies scalar convert_ushort_sat_rte to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y), convert_ushort_sat_rte(v.z), convert_ushort_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(short4 v) { /* applies scalar convert_ushort_sat_rtz to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y), convert_ushort_sat_rtz(v.z), convert_ushort_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(short4 v) { /* applies scalar convert_ushort_sat_rtp to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y), convert_ushort_sat_rtp(v.z), convert_ushort_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(short4 v) { /* applies scalar convert_ushort_sat_rtn to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y), convert_ushort_sat_rtn(v.z), convert_ushort_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(short4 v) { /* applies scalar convert_char_sat_rte to each of v's four components */
+  return (char4)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y), convert_char_sat_rte(v.z), convert_char_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(short4 v) { /* applies scalar convert_char_sat_rtz to each of v's four components */
+  return (char4)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y), convert_char_sat_rtz(v.z), convert_char_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(short4 v) { /* applies scalar convert_char_sat_rtp to each of v's four components */
+  return (char4)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y), convert_char_sat_rtp(v.z), convert_char_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(short4 v) { /* applies scalar convert_char_sat_rtn to each of v's four components */
+  return (char4)(convert_char_sat_rtn(v.x), convert_char_sat_rtn(v.y), convert_char_sat_rtn(v.z), convert_char_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(short4 v) { /* applies scalar convert_uchar_sat_rte to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rte(v.x), convert_uchar_sat_rte(v.y), convert_uchar_sat_rte(v.z), convert_uchar_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(short4 v) { /* applies scalar convert_uchar_sat_rtz to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rtz(v.x), convert_uchar_sat_rtz(v.y), convert_uchar_sat_rtz(v.z), convert_uchar_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(short4 v) { /* applies scalar convert_uchar_sat_rtp to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rtp(v.x), convert_uchar_sat_rtp(v.y), convert_uchar_sat_rtp(v.z), convert_uchar_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(short4 v) { /* applies scalar convert_uchar_sat_rtn to each of v's four components */
+  return (uchar4)(convert_uchar_sat_rtn(v.x), convert_uchar_sat_rtn(v.y), convert_uchar_sat_rtn(v.z), convert_uchar_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(ushort4 v) { /* applies scalar convert_long_sat_rte to each of v's four components */
+  return (long4)(convert_long_sat_rte(v.x), convert_long_sat_rte(v.y), convert_long_sat_rte(v.z), convert_long_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(ushort4 v) { /* applies scalar convert_long_sat_rtz to each of v's four components */
+  return (long4)(convert_long_sat_rtz(v.x), convert_long_sat_rtz(v.y), convert_long_sat_rtz(v.z), convert_long_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(ushort4 v) { /* applies scalar convert_long_sat_rtp to each of v's four components */
+  return (long4)(convert_long_sat_rtp(v.x), convert_long_sat_rtp(v.y), convert_long_sat_rtp(v.z), convert_long_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(ushort4 v) { /* applies scalar convert_long_sat_rtn to each of v's four components */
+  return (long4)(convert_long_sat_rtn(v.x), convert_long_sat_rtn(v.y), convert_long_sat_rtn(v.z), convert_long_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(ushort4 v) { /* applies scalar convert_ulong_sat_rte to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rte(v.x), convert_ulong_sat_rte(v.y), convert_ulong_sat_rte(v.z), convert_ulong_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(ushort4 v) { /* applies scalar convert_ulong_sat_rtz to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rtz(v.x), convert_ulong_sat_rtz(v.y), convert_ulong_sat_rtz(v.z), convert_ulong_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(ushort4 v) { /* applies scalar convert_ulong_sat_rtp to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rtp(v.x), convert_ulong_sat_rtp(v.y), convert_ulong_sat_rtp(v.z), convert_ulong_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(ushort4 v) { /* applies scalar convert_ulong_sat_rtn to each of v's four components */
+  return (ulong4)(convert_ulong_sat_rtn(v.x), convert_ulong_sat_rtn(v.y), convert_ulong_sat_rtn(v.z), convert_ulong_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(ushort4 v) { /* applies scalar convert_int_sat_rte to each of v's four components */
+  return (int4)(convert_int_sat_rte(v.x), convert_int_sat_rte(v.y), convert_int_sat_rte(v.z), convert_int_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(ushort4 v) { /* applies scalar convert_int_sat_rtz to each of v's four components */
+  return (int4)(convert_int_sat_rtz(v.x), convert_int_sat_rtz(v.y), convert_int_sat_rtz(v.z), convert_int_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(ushort4 v) { /* applies scalar convert_int_sat_rtp to each of v's four components */
+  return (int4)(convert_int_sat_rtp(v.x), convert_int_sat_rtp(v.y), convert_int_sat_rtp(v.z), convert_int_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(ushort4 v) { /* applies scalar convert_int_sat_rtn to each of v's four components */
+  return (int4)(convert_int_sat_rtn(v.x), convert_int_sat_rtn(v.y), convert_int_sat_rtn(v.z), convert_int_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(ushort4 v) { /* applies scalar convert_uint_sat_rte to each of v's four components */
+  return (uint4)(convert_uint_sat_rte(v.x), convert_uint_sat_rte(v.y), convert_uint_sat_rte(v.z), convert_uint_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(ushort4 v) { /* applies scalar convert_uint_sat_rtz to each of v's four components */
+  return (uint4)(convert_uint_sat_rtz(v.x), convert_uint_sat_rtz(v.y), convert_uint_sat_rtz(v.z), convert_uint_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(ushort4 v) { /* applies scalar convert_uint_sat_rtp to each of v's four components */
+  return (uint4)(convert_uint_sat_rtp(v.x), convert_uint_sat_rtp(v.y), convert_uint_sat_rtp(v.z), convert_uint_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(ushort4 v) { /* applies scalar convert_uint_sat_rtn to each of v's four components */
+  return (uint4)(convert_uint_sat_rtn(v.x), convert_uint_sat_rtn(v.y), convert_uint_sat_rtn(v.z), convert_uint_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(ushort4 v) { /* applies scalar convert_short_sat_rte to each of v's four components */
+  return (short4)(convert_short_sat_rte(v.x), convert_short_sat_rte(v.y), convert_short_sat_rte(v.z), convert_short_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(ushort4 v) { /* applies scalar convert_short_sat_rtz to each of v's four components */
+  return (short4)(convert_short_sat_rtz(v.x), convert_short_sat_rtz(v.y), convert_short_sat_rtz(v.z), convert_short_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(ushort4 v) { /* applies scalar convert_short_sat_rtp to each of v's four components */
+  return (short4)(convert_short_sat_rtp(v.x), convert_short_sat_rtp(v.y), convert_short_sat_rtp(v.z), convert_short_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(ushort4 v) { /* applies scalar convert_short_sat_rtn to each of v's four components */
+  return (short4)(convert_short_sat_rtn(v.x), convert_short_sat_rtn(v.y), convert_short_sat_rtn(v.z), convert_short_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(ushort4 v) { /* applies scalar convert_ushort_sat_rte to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rte(v.x), convert_ushort_sat_rte(v.y), convert_ushort_sat_rte(v.z), convert_ushort_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(ushort4 v) { /* applies scalar convert_ushort_sat_rtz to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rtz(v.x), convert_ushort_sat_rtz(v.y), convert_ushort_sat_rtz(v.z), convert_ushort_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(ushort4 v) { /* applies scalar convert_ushort_sat_rtp to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rtp(v.x), convert_ushort_sat_rtp(v.y), convert_ushort_sat_rtp(v.z), convert_ushort_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(ushort4 v) { /* applies scalar convert_ushort_sat_rtn to each of v's four components */
+  return (ushort4)(convert_ushort_sat_rtn(v.x), convert_ushort_sat_rtn(v.y), convert_ushort_sat_rtn(v.z), convert_ushort_sat_rtn(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(ushort4 v) { /* applies scalar convert_char_sat_rte to each of v's four components */
+  return (char4)(convert_char_sat_rte(v.x), convert_char_sat_rte(v.y), convert_char_sat_rte(v.z), convert_char_sat_rte(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(ushort4 v) { /* applies scalar convert_char_sat_rtz to each of v's four components */
+  return (char4)(convert_char_sat_rtz(v.x), convert_char_sat_rtz(v.y), convert_char_sat_rtz(v.z), convert_char_sat_rtz(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(ushort4 v) { /* applies scalar convert_char_sat_rtp to each of v's four components */
+  return (char4)(convert_char_sat_rtp(v.x), convert_char_sat_rtp(v.y), convert_char_sat_rtp(v.z), convert_char_sat_rtp(v.w));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(ushort4 v) {
+  /* Lane-wise wrapper over the scalar convert_char_sat_rtn overload. */
+  char4 r;
+  r.x = convert_char_sat_rtn(v.x);
+  r.y = convert_char_sat_rtn(v.y);
+  r.z = convert_char_sat_rtn(v.z);
+  r.w = convert_char_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(ushort4 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rte overload. */
+  uchar4 r;
+  r.x = convert_uchar_sat_rte(v.x);
+  r.y = convert_uchar_sat_rte(v.y);
+  r.z = convert_uchar_sat_rte(v.z);
+  r.w = convert_uchar_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(ushort4 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rtz overload. */
+  uchar4 r;
+  r.x = convert_uchar_sat_rtz(v.x);
+  r.y = convert_uchar_sat_rtz(v.y);
+  r.z = convert_uchar_sat_rtz(v.z);
+  r.w = convert_uchar_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(ushort4 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rtp overload. */
+  uchar4 r;
+  r.x = convert_uchar_sat_rtp(v.x);
+  r.y = convert_uchar_sat_rtp(v.y);
+  r.z = convert_uchar_sat_rtp(v.z);
+  r.w = convert_uchar_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(ushort4 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rtn overload. */
+  uchar4 r;
+  r.x = convert_uchar_sat_rtn(v.x);
+  r.y = convert_uchar_sat_rtn(v.y);
+  r.z = convert_uchar_sat_rtn(v.z);
+  r.w = convert_uchar_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rte overload. */
+  long4 r;
+  r.x = convert_long_sat_rte(v.x);
+  r.y = convert_long_sat_rte(v.y);
+  r.z = convert_long_sat_rte(v.z);
+  r.w = convert_long_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rtz overload. */
+  long4 r;
+  r.x = convert_long_sat_rtz(v.x);
+  r.y = convert_long_sat_rtz(v.y);
+  r.z = convert_long_sat_rtz(v.z);
+  r.w = convert_long_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rtp overload. */
+  long4 r;
+  r.x = convert_long_sat_rtp(v.x);
+  r.y = convert_long_sat_rtp(v.y);
+  r.z = convert_long_sat_rtp(v.z);
+  r.w = convert_long_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rtn overload. */
+  long4 r;
+  r.x = convert_long_sat_rtn(v.x);
+  r.y = convert_long_sat_rtn(v.y);
+  r.z = convert_long_sat_rtn(v.z);
+  r.w = convert_long_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_ulong_sat_rte overload. */
+  ulong4 r;
+  r.x = convert_ulong_sat_rte(v.x);
+  r.y = convert_ulong_sat_rte(v.y);
+  r.z = convert_ulong_sat_rte(v.z);
+  r.w = convert_ulong_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_ulong_sat_rtz overload. */
+  ulong4 r;
+  r.x = convert_ulong_sat_rtz(v.x);
+  r.y = convert_ulong_sat_rtz(v.y);
+  r.z = convert_ulong_sat_rtz(v.z);
+  r.w = convert_ulong_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_ulong_sat_rtp overload. */
+  ulong4 r;
+  r.x = convert_ulong_sat_rtp(v.x);
+  r.y = convert_ulong_sat_rtp(v.y);
+  r.z = convert_ulong_sat_rtp(v.z);
+  r.w = convert_ulong_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_ulong_sat_rtn overload. */
+  ulong4 r;
+  r.x = convert_ulong_sat_rtn(v.x);
+  r.y = convert_ulong_sat_rtn(v.y);
+  r.z = convert_ulong_sat_rtn(v.z);
+  r.w = convert_ulong_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_int_sat_rte overload. */
+  int4 r;
+  r.x = convert_int_sat_rte(v.x);
+  r.y = convert_int_sat_rte(v.y);
+  r.z = convert_int_sat_rte(v.z);
+  r.w = convert_int_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_int_sat_rtz overload. */
+  int4 r;
+  r.x = convert_int_sat_rtz(v.x);
+  r.y = convert_int_sat_rtz(v.y);
+  r.z = convert_int_sat_rtz(v.z);
+  r.w = convert_int_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_int_sat_rtp overload. */
+  int4 r;
+  r.x = convert_int_sat_rtp(v.x);
+  r.y = convert_int_sat_rtp(v.y);
+  r.z = convert_int_sat_rtp(v.z);
+  r.w = convert_int_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_int_sat_rtn overload. */
+  int4 r;
+  r.x = convert_int_sat_rtn(v.x);
+  r.y = convert_int_sat_rtn(v.y);
+  r.z = convert_int_sat_rtn(v.z);
+  r.w = convert_int_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_uint_sat_rte overload. */
+  uint4 r;
+  r.x = convert_uint_sat_rte(v.x);
+  r.y = convert_uint_sat_rte(v.y);
+  r.z = convert_uint_sat_rte(v.z);
+  r.w = convert_uint_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_uint_sat_rtz overload. */
+  uint4 r;
+  r.x = convert_uint_sat_rtz(v.x);
+  r.y = convert_uint_sat_rtz(v.y);
+  r.z = convert_uint_sat_rtz(v.z);
+  r.w = convert_uint_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_uint_sat_rtp overload. */
+  uint4 r;
+  r.x = convert_uint_sat_rtp(v.x);
+  r.y = convert_uint_sat_rtp(v.y);
+  r.z = convert_uint_sat_rtp(v.z);
+  r.w = convert_uint_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_uint_sat_rtn overload. */
+  uint4 r;
+  r.x = convert_uint_sat_rtn(v.x);
+  r.y = convert_uint_sat_rtn(v.y);
+  r.z = convert_uint_sat_rtn(v.z);
+  r.w = convert_uint_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_short_sat_rte overload. */
+  short4 r;
+  r.x = convert_short_sat_rte(v.x);
+  r.y = convert_short_sat_rte(v.y);
+  r.z = convert_short_sat_rte(v.z);
+  r.w = convert_short_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_short_sat_rtz overload. */
+  short4 r;
+  r.x = convert_short_sat_rtz(v.x);
+  r.y = convert_short_sat_rtz(v.y);
+  r.z = convert_short_sat_rtz(v.z);
+  r.w = convert_short_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_short_sat_rtp overload. */
+  short4 r;
+  r.x = convert_short_sat_rtp(v.x);
+  r.y = convert_short_sat_rtp(v.y);
+  r.z = convert_short_sat_rtp(v.z);
+  r.w = convert_short_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_short_sat_rtn overload. */
+  short4 r;
+  r.x = convert_short_sat_rtn(v.x);
+  r.y = convert_short_sat_rtn(v.y);
+  r.z = convert_short_sat_rtn(v.z);
+  r.w = convert_short_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_ushort_sat_rte overload. */
+  ushort4 r;
+  r.x = convert_ushort_sat_rte(v.x);
+  r.y = convert_ushort_sat_rte(v.y);
+  r.z = convert_ushort_sat_rte(v.z);
+  r.w = convert_ushort_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_ushort_sat_rtz overload. */
+  ushort4 r;
+  r.x = convert_ushort_sat_rtz(v.x);
+  r.y = convert_ushort_sat_rtz(v.y);
+  r.z = convert_ushort_sat_rtz(v.z);
+  r.w = convert_ushort_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_ushort_sat_rtp overload. */
+  ushort4 r;
+  r.x = convert_ushort_sat_rtp(v.x);
+  r.y = convert_ushort_sat_rtp(v.y);
+  r.z = convert_ushort_sat_rtp(v.z);
+  r.w = convert_ushort_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_ushort_sat_rtn overload. */
+  ushort4 r;
+  r.x = convert_ushort_sat_rtn(v.x);
+  r.y = convert_ushort_sat_rtn(v.y);
+  r.z = convert_ushort_sat_rtn(v.z);
+  r.w = convert_ushort_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_char_sat_rte overload. */
+  char4 r;
+  r.x = convert_char_sat_rte(v.x);
+  r.y = convert_char_sat_rte(v.y);
+  r.z = convert_char_sat_rte(v.z);
+  r.w = convert_char_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_char_sat_rtz overload. */
+  char4 r;
+  r.x = convert_char_sat_rtz(v.x);
+  r.y = convert_char_sat_rtz(v.y);
+  r.z = convert_char_sat_rtz(v.z);
+  r.w = convert_char_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_char_sat_rtp overload. */
+  char4 r;
+  r.x = convert_char_sat_rtp(v.x);
+  r.y = convert_char_sat_rtp(v.y);
+  r.z = convert_char_sat_rtp(v.z);
+  r.w = convert_char_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_char_sat_rtn overload. */
+  char4 r;
+  r.x = convert_char_sat_rtn(v.x);
+  r.y = convert_char_sat_rtn(v.y);
+  r.z = convert_char_sat_rtn(v.z);
+  r.w = convert_char_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rte overload. */
+  uchar4 r;
+  r.x = convert_uchar_sat_rte(v.x);
+  r.y = convert_uchar_sat_rte(v.y);
+  r.z = convert_uchar_sat_rte(v.z);
+  r.w = convert_uchar_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rtz overload. */
+  uchar4 r;
+  r.x = convert_uchar_sat_rtz(v.x);
+  r.y = convert_uchar_sat_rtz(v.y);
+  r.z = convert_uchar_sat_rtz(v.z);
+  r.w = convert_uchar_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rtp overload. */
+  uchar4 r;
+  r.x = convert_uchar_sat_rtp(v.x);
+  r.y = convert_uchar_sat_rtp(v.y);
+  r.z = convert_uchar_sat_rtp(v.z);
+  r.w = convert_uchar_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(char4 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rtn overload. */
+  uchar4 r;
+  r.x = convert_uchar_sat_rtn(v.x);
+  r.y = convert_uchar_sat_rtn(v.y);
+  r.z = convert_uchar_sat_rtn(v.z);
+  r.w = convert_uchar_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rte overload. */
+  long4 r;
+  r.x = convert_long_sat_rte(v.x);
+  r.y = convert_long_sat_rte(v.y);
+  r.z = convert_long_sat_rte(v.z);
+  r.w = convert_long_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rtz overload. */
+  long4 r;
+  r.x = convert_long_sat_rtz(v.x);
+  r.y = convert_long_sat_rtz(v.y);
+  r.z = convert_long_sat_rtz(v.z);
+  r.w = convert_long_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rtp overload. */
+  long4 r;
+  r.x = convert_long_sat_rtp(v.x);
+  r.y = convert_long_sat_rtp(v.y);
+  r.z = convert_long_sat_rtp(v.z);
+  r.w = convert_long_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rtn overload. */
+  long4 r;
+  r.x = convert_long_sat_rtn(v.x);
+  r.y = convert_long_sat_rtn(v.y);
+  r.z = convert_long_sat_rtn(v.z);
+  r.w = convert_long_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_ulong_sat_rte overload. */
+  ulong4 r;
+  r.x = convert_ulong_sat_rte(v.x);
+  r.y = convert_ulong_sat_rte(v.y);
+  r.z = convert_ulong_sat_rte(v.z);
+  r.w = convert_ulong_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_ulong_sat_rtz overload. */
+  ulong4 r;
+  r.x = convert_ulong_sat_rtz(v.x);
+  r.y = convert_ulong_sat_rtz(v.y);
+  r.z = convert_ulong_sat_rtz(v.z);
+  r.w = convert_ulong_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_ulong_sat_rtp overload. */
+  ulong4 r;
+  r.x = convert_ulong_sat_rtp(v.x);
+  r.y = convert_ulong_sat_rtp(v.y);
+  r.z = convert_ulong_sat_rtp(v.z);
+  r.w = convert_ulong_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_ulong_sat_rtn overload. */
+  ulong4 r;
+  r.x = convert_ulong_sat_rtn(v.x);
+  r.y = convert_ulong_sat_rtn(v.y);
+  r.z = convert_ulong_sat_rtn(v.z);
+  r.w = convert_ulong_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_int_sat_rte overload. */
+  int4 r;
+  r.x = convert_int_sat_rte(v.x);
+  r.y = convert_int_sat_rte(v.y);
+  r.z = convert_int_sat_rte(v.z);
+  r.w = convert_int_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_int_sat_rtz overload. */
+  int4 r;
+  r.x = convert_int_sat_rtz(v.x);
+  r.y = convert_int_sat_rtz(v.y);
+  r.z = convert_int_sat_rtz(v.z);
+  r.w = convert_int_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_int_sat_rtp overload. */
+  int4 r;
+  r.x = convert_int_sat_rtp(v.x);
+  r.y = convert_int_sat_rtp(v.y);
+  r.z = convert_int_sat_rtp(v.z);
+  r.w = convert_int_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_int_sat_rtn overload. */
+  int4 r;
+  r.x = convert_int_sat_rtn(v.x);
+  r.y = convert_int_sat_rtn(v.y);
+  r.z = convert_int_sat_rtn(v.z);
+  r.w = convert_int_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_uint_sat_rte overload. */
+  uint4 r;
+  r.x = convert_uint_sat_rte(v.x);
+  r.y = convert_uint_sat_rte(v.y);
+  r.z = convert_uint_sat_rte(v.z);
+  r.w = convert_uint_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_uint_sat_rtz overload. */
+  uint4 r;
+  r.x = convert_uint_sat_rtz(v.x);
+  r.y = convert_uint_sat_rtz(v.y);
+  r.z = convert_uint_sat_rtz(v.z);
+  r.w = convert_uint_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_uint_sat_rtp overload. */
+  uint4 r;
+  r.x = convert_uint_sat_rtp(v.x);
+  r.y = convert_uint_sat_rtp(v.y);
+  r.z = convert_uint_sat_rtp(v.z);
+  r.w = convert_uint_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_uint_sat_rtn overload. */
+  uint4 r;
+  r.x = convert_uint_sat_rtn(v.x);
+  r.y = convert_uint_sat_rtn(v.y);
+  r.z = convert_uint_sat_rtn(v.z);
+  r.w = convert_uint_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_short_sat_rte overload. */
+  short4 r;
+  r.x = convert_short_sat_rte(v.x);
+  r.y = convert_short_sat_rte(v.y);
+  r.z = convert_short_sat_rte(v.z);
+  r.w = convert_short_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_short_sat_rtz overload. */
+  short4 r;
+  r.x = convert_short_sat_rtz(v.x);
+  r.y = convert_short_sat_rtz(v.y);
+  r.z = convert_short_sat_rtz(v.z);
+  r.w = convert_short_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_short_sat_rtp overload. */
+  short4 r;
+  r.x = convert_short_sat_rtp(v.x);
+  r.y = convert_short_sat_rtp(v.y);
+  r.z = convert_short_sat_rtp(v.z);
+  r.w = convert_short_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_short_sat_rtn overload. */
+  short4 r;
+  r.x = convert_short_sat_rtn(v.x);
+  r.y = convert_short_sat_rtn(v.y);
+  r.z = convert_short_sat_rtn(v.z);
+  r.w = convert_short_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_ushort_sat_rte overload. */
+  ushort4 r;
+  r.x = convert_ushort_sat_rte(v.x);
+  r.y = convert_ushort_sat_rte(v.y);
+  r.z = convert_ushort_sat_rte(v.z);
+  r.w = convert_ushort_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_ushort_sat_rtz overload. */
+  ushort4 r;
+  r.x = convert_ushort_sat_rtz(v.x);
+  r.y = convert_ushort_sat_rtz(v.y);
+  r.z = convert_ushort_sat_rtz(v.z);
+  r.w = convert_ushort_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_ushort_sat_rtp overload. */
+  ushort4 r;
+  r.x = convert_ushort_sat_rtp(v.x);
+  r.y = convert_ushort_sat_rtp(v.y);
+  r.z = convert_ushort_sat_rtp(v.z);
+  r.w = convert_ushort_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_ushort_sat_rtn overload. */
+  ushort4 r;
+  r.x = convert_ushort_sat_rtn(v.x);
+  r.y = convert_ushort_sat_rtn(v.y);
+  r.z = convert_ushort_sat_rtn(v.z);
+  r.w = convert_ushort_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_char_sat_rte overload. */
+  char4 r;
+  r.x = convert_char_sat_rte(v.x);
+  r.y = convert_char_sat_rte(v.y);
+  r.z = convert_char_sat_rte(v.z);
+  r.w = convert_char_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_char_sat_rtz overload. */
+  char4 r;
+  r.x = convert_char_sat_rtz(v.x);
+  r.y = convert_char_sat_rtz(v.y);
+  r.z = convert_char_sat_rtz(v.z);
+  r.w = convert_char_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_char_sat_rtp overload. */
+  char4 r;
+  r.x = convert_char_sat_rtp(v.x);
+  r.y = convert_char_sat_rtp(v.y);
+  r.z = convert_char_sat_rtp(v.z);
+  r.w = convert_char_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_char_sat_rtn overload. */
+  char4 r;
+  r.x = convert_char_sat_rtn(v.x);
+  r.y = convert_char_sat_rtn(v.y);
+  r.z = convert_char_sat_rtn(v.z);
+  r.w = convert_char_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rte overload. */
+  uchar4 r;
+  r.x = convert_uchar_sat_rte(v.x);
+  r.y = convert_uchar_sat_rte(v.y);
+  r.z = convert_uchar_sat_rte(v.z);
+  r.w = convert_uchar_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rtz overload. */
+  uchar4 r;
+  r.x = convert_uchar_sat_rtz(v.x);
+  r.y = convert_uchar_sat_rtz(v.y);
+  r.z = convert_uchar_sat_rtz(v.z);
+  r.w = convert_uchar_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rtp overload. */
+  uchar4 r;
+  r.x = convert_uchar_sat_rtp(v.x);
+  r.y = convert_uchar_sat_rtp(v.y);
+  r.z = convert_uchar_sat_rtp(v.z);
+  r.w = convert_uchar_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(uchar4 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rtn overload. */
+  uchar4 r;
+  r.x = convert_uchar_sat_rtn(v.x);
+  r.y = convert_uchar_sat_rtn(v.y);
+  r.z = convert_uchar_sat_rtn(v.z);
+  r.w = convert_uchar_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rte overload. */
+  long4 r;
+  r.x = convert_long_sat_rte(v.x);
+  r.y = convert_long_sat_rte(v.y);
+  r.z = convert_long_sat_rte(v.z);
+  r.w = convert_long_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rtz overload. */
+  long4 r;
+  r.x = convert_long_sat_rtz(v.x);
+  r.y = convert_long_sat_rtz(v.y);
+  r.z = convert_long_sat_rtz(v.z);
+  r.w = convert_long_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rtp overload. */
+  long4 r;
+  r.x = convert_long_sat_rtp(v.x);
+  r.y = convert_long_sat_rtp(v.y);
+  r.z = convert_long_sat_rtp(v.z);
+  r.w = convert_long_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rtn overload. */
+  long4 r;
+  r.x = convert_long_sat_rtn(v.x);
+  r.y = convert_long_sat_rtn(v.y);
+  r.z = convert_long_sat_rtn(v.z);
+  r.w = convert_long_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_ulong_sat_rte overload. */
+  ulong4 r;
+  r.x = convert_ulong_sat_rte(v.x);
+  r.y = convert_ulong_sat_rte(v.y);
+  r.z = convert_ulong_sat_rte(v.z);
+  r.w = convert_ulong_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_ulong_sat_rtz overload. */
+  ulong4 r;
+  r.x = convert_ulong_sat_rtz(v.x);
+  r.y = convert_ulong_sat_rtz(v.y);
+  r.z = convert_ulong_sat_rtz(v.z);
+  r.w = convert_ulong_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_ulong_sat_rtp overload. */
+  ulong4 r;
+  r.x = convert_ulong_sat_rtp(v.x);
+  r.y = convert_ulong_sat_rtp(v.y);
+  r.z = convert_ulong_sat_rtp(v.z);
+  r.w = convert_ulong_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_ulong_sat_rtn overload. */
+  ulong4 r;
+  r.x = convert_ulong_sat_rtn(v.x);
+  r.y = convert_ulong_sat_rtn(v.y);
+  r.z = convert_ulong_sat_rtn(v.z);
+  r.w = convert_ulong_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_int_sat_rte overload. */
+  int4 r;
+  r.x = convert_int_sat_rte(v.x);
+  r.y = convert_int_sat_rte(v.y);
+  r.z = convert_int_sat_rte(v.z);
+  r.w = convert_int_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_int_sat_rtz overload. */
+  int4 r;
+  r.x = convert_int_sat_rtz(v.x);
+  r.y = convert_int_sat_rtz(v.y);
+  r.z = convert_int_sat_rtz(v.z);
+  r.w = convert_int_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_int_sat_rtp overload. */
+  int4 r;
+  r.x = convert_int_sat_rtp(v.x);
+  r.y = convert_int_sat_rtp(v.y);
+  r.z = convert_int_sat_rtp(v.z);
+  r.w = convert_int_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_int_sat_rtn overload. */
+  int4 r;
+  r.x = convert_int_sat_rtn(v.x);
+  r.y = convert_int_sat_rtn(v.y);
+  r.z = convert_int_sat_rtn(v.z);
+  r.w = convert_int_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_uint_sat_rte overload. */
+  uint4 r;
+  r.x = convert_uint_sat_rte(v.x);
+  r.y = convert_uint_sat_rte(v.y);
+  r.z = convert_uint_sat_rte(v.z);
+  r.w = convert_uint_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_uint_sat_rtz overload. */
+  uint4 r;
+  r.x = convert_uint_sat_rtz(v.x);
+  r.y = convert_uint_sat_rtz(v.y);
+  r.z = convert_uint_sat_rtz(v.z);
+  r.w = convert_uint_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_uint_sat_rtp overload. */
+  uint4 r;
+  r.x = convert_uint_sat_rtp(v.x);
+  r.y = convert_uint_sat_rtp(v.y);
+  r.z = convert_uint_sat_rtp(v.z);
+  r.w = convert_uint_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_uint_sat_rtn overload. */
+  uint4 r;
+  r.x = convert_uint_sat_rtn(v.x);
+  r.y = convert_uint_sat_rtn(v.y);
+  r.z = convert_uint_sat_rtn(v.z);
+  r.w = convert_uint_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_short_sat_rte overload. */
+  short4 r;
+  r.x = convert_short_sat_rte(v.x);
+  r.y = convert_short_sat_rte(v.y);
+  r.z = convert_short_sat_rte(v.z);
+  r.w = convert_short_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_short_sat_rtz overload. */
+  short4 r;
+  r.x = convert_short_sat_rtz(v.x);
+  r.y = convert_short_sat_rtz(v.y);
+  r.z = convert_short_sat_rtz(v.z);
+  r.w = convert_short_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_short_sat_rtp overload. */
+  short4 r;
+  r.x = convert_short_sat_rtp(v.x);
+  r.y = convert_short_sat_rtp(v.y);
+  r.z = convert_short_sat_rtp(v.z);
+  r.w = convert_short_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_short_sat_rtn overload. */
+  short4 r;
+  r.x = convert_short_sat_rtn(v.x);
+  r.y = convert_short_sat_rtn(v.y);
+  r.z = convert_short_sat_rtn(v.z);
+  r.w = convert_short_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_ushort_sat_rte overload. */
+  ushort4 r;
+  r.x = convert_ushort_sat_rte(v.x);
+  r.y = convert_ushort_sat_rte(v.y);
+  r.z = convert_ushort_sat_rte(v.z);
+  r.w = convert_ushort_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_ushort_sat_rtz overload. */
+  ushort4 r;
+  r.x = convert_ushort_sat_rtz(v.x);
+  r.y = convert_ushort_sat_rtz(v.y);
+  r.z = convert_ushort_sat_rtz(v.z);
+  r.w = convert_ushort_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_ushort_sat_rtp overload. */
+  ushort4 r;
+  r.x = convert_ushort_sat_rtp(v.x);
+  r.y = convert_ushort_sat_rtp(v.y);
+  r.z = convert_ushort_sat_rtp(v.z);
+  r.w = convert_ushort_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_ushort_sat_rtn overload. */
+  ushort4 r;
+  r.x = convert_ushort_sat_rtn(v.x);
+  r.y = convert_ushort_sat_rtn(v.y);
+  r.z = convert_ushort_sat_rtn(v.z);
+  r.w = convert_ushort_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_char_sat_rte overload. */
+  char4 r;
+  r.x = convert_char_sat_rte(v.x);
+  r.y = convert_char_sat_rte(v.y);
+  r.z = convert_char_sat_rte(v.z);
+  r.w = convert_char_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_char_sat_rtz overload. */
+  char4 r;
+  r.x = convert_char_sat_rtz(v.x);
+  r.y = convert_char_sat_rtz(v.y);
+  r.z = convert_char_sat_rtz(v.z);
+  r.w = convert_char_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_char_sat_rtp overload. */
+  char4 r;
+  r.x = convert_char_sat_rtp(v.x);
+  r.y = convert_char_sat_rtp(v.y);
+  r.z = convert_char_sat_rtp(v.z);
+  r.w = convert_char_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_char_sat_rtn overload. */
+  char4 r;
+  r.x = convert_char_sat_rtn(v.x);
+  r.y = convert_char_sat_rtn(v.y);
+  r.z = convert_char_sat_rtn(v.z);
+  r.w = convert_char_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rte overload. */
+  uchar4 r;
+  r.x = convert_uchar_sat_rte(v.x);
+  r.y = convert_uchar_sat_rte(v.y);
+  r.z = convert_uchar_sat_rte(v.z);
+  r.w = convert_uchar_sat_rte(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rtz overload. */
+  uchar4 r;
+  r.x = convert_uchar_sat_rtz(v.x);
+  r.y = convert_uchar_sat_rtz(v.y);
+  r.z = convert_uchar_sat_rtz(v.z);
+  r.w = convert_uchar_sat_rtz(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rtp overload. */
+  uchar4 r;
+  r.x = convert_uchar_sat_rtp(v.x);
+  r.y = convert_uchar_sat_rtp(v.y);
+  r.z = convert_uchar_sat_rtp(v.z);
+  r.w = convert_uchar_sat_rtp(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(float4 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rtn overload. */
+  uchar4 r;
+  r.x = convert_uchar_sat_rtn(v.x);
+  r.y = convert_uchar_sat_rtn(v.y);
+  r.z = convert_uchar_sat_rtn(v.z);
+  r.w = convert_uchar_sat_rtn(v.w);
+  return r;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rte overload. */
+  long8 r;
+  r.s0 = convert_long_sat_rte(v.s0);
+  r.s1 = convert_long_sat_rte(v.s1);
+  r.s2 = convert_long_sat_rte(v.s2);
+  r.s3 = convert_long_sat_rte(v.s3);
+  r.s4 = convert_long_sat_rte(v.s4);
+  r.s5 = convert_long_sat_rte(v.s5);
+  r.s6 = convert_long_sat_rte(v.s6);
+  r.s7 = convert_long_sat_rte(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rtz overload. */
+  long8 r;
+  r.s0 = convert_long_sat_rtz(v.s0);
+  r.s1 = convert_long_sat_rtz(v.s1);
+  r.s2 = convert_long_sat_rtz(v.s2);
+  r.s3 = convert_long_sat_rtz(v.s3);
+  r.s4 = convert_long_sat_rtz(v.s4);
+  r.s5 = convert_long_sat_rtz(v.s5);
+  r.s6 = convert_long_sat_rtz(v.s6);
+  r.s7 = convert_long_sat_rtz(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rtp overload. */
+  long8 r;
+  r.s0 = convert_long_sat_rtp(v.s0);
+  r.s1 = convert_long_sat_rtp(v.s1);
+  r.s2 = convert_long_sat_rtp(v.s2);
+  r.s3 = convert_long_sat_rtp(v.s3);
+  r.s4 = convert_long_sat_rtp(v.s4);
+  r.s5 = convert_long_sat_rtp(v.s5);
+  r.s6 = convert_long_sat_rtp(v.s6);
+  r.s7 = convert_long_sat_rtp(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rtn overload. */
+  long8 r;
+  r.s0 = convert_long_sat_rtn(v.s0);
+  r.s1 = convert_long_sat_rtn(v.s1);
+  r.s2 = convert_long_sat_rtn(v.s2);
+  r.s3 = convert_long_sat_rtn(v.s3);
+  r.s4 = convert_long_sat_rtn(v.s4);
+  r.s5 = convert_long_sat_rtn(v.s5);
+  r.s6 = convert_long_sat_rtn(v.s6);
+  r.s7 = convert_long_sat_rtn(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_ulong_sat_rte overload. */
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rte(v.s0);
+  r.s1 = convert_ulong_sat_rte(v.s1);
+  r.s2 = convert_ulong_sat_rte(v.s2);
+  r.s3 = convert_ulong_sat_rte(v.s3);
+  r.s4 = convert_ulong_sat_rte(v.s4);
+  r.s5 = convert_ulong_sat_rte(v.s5);
+  r.s6 = convert_ulong_sat_rte(v.s6);
+  r.s7 = convert_ulong_sat_rte(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_ulong_sat_rtz overload. */
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rtz(v.s0);
+  r.s1 = convert_ulong_sat_rtz(v.s1);
+  r.s2 = convert_ulong_sat_rtz(v.s2);
+  r.s3 = convert_ulong_sat_rtz(v.s3);
+  r.s4 = convert_ulong_sat_rtz(v.s4);
+  r.s5 = convert_ulong_sat_rtz(v.s5);
+  r.s6 = convert_ulong_sat_rtz(v.s6);
+  r.s7 = convert_ulong_sat_rtz(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_ulong_sat_rtp overload. */
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rtp(v.s0);
+  r.s1 = convert_ulong_sat_rtp(v.s1);
+  r.s2 = convert_ulong_sat_rtp(v.s2);
+  r.s3 = convert_ulong_sat_rtp(v.s3);
+  r.s4 = convert_ulong_sat_rtp(v.s4);
+  r.s5 = convert_ulong_sat_rtp(v.s5);
+  r.s6 = convert_ulong_sat_rtp(v.s6);
+  r.s7 = convert_ulong_sat_rtp(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_ulong_sat_rtn overload. */
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rtn(v.s0);
+  r.s1 = convert_ulong_sat_rtn(v.s1);
+  r.s2 = convert_ulong_sat_rtn(v.s2);
+  r.s3 = convert_ulong_sat_rtn(v.s3);
+  r.s4 = convert_ulong_sat_rtn(v.s4);
+  r.s5 = convert_ulong_sat_rtn(v.s5);
+  r.s6 = convert_ulong_sat_rtn(v.s6);
+  r.s7 = convert_ulong_sat_rtn(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_int_sat_rte overload. */
+  int8 r;
+  r.s0 = convert_int_sat_rte(v.s0);
+  r.s1 = convert_int_sat_rte(v.s1);
+  r.s2 = convert_int_sat_rte(v.s2);
+  r.s3 = convert_int_sat_rte(v.s3);
+  r.s4 = convert_int_sat_rte(v.s4);
+  r.s5 = convert_int_sat_rte(v.s5);
+  r.s6 = convert_int_sat_rte(v.s6);
+  r.s7 = convert_int_sat_rte(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_int_sat_rtz overload. */
+  int8 r;
+  r.s0 = convert_int_sat_rtz(v.s0);
+  r.s1 = convert_int_sat_rtz(v.s1);
+  r.s2 = convert_int_sat_rtz(v.s2);
+  r.s3 = convert_int_sat_rtz(v.s3);
+  r.s4 = convert_int_sat_rtz(v.s4);
+  r.s5 = convert_int_sat_rtz(v.s5);
+  r.s6 = convert_int_sat_rtz(v.s6);
+  r.s7 = convert_int_sat_rtz(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_int_sat_rtp overload. */
+  int8 r;
+  r.s0 = convert_int_sat_rtp(v.s0);
+  r.s1 = convert_int_sat_rtp(v.s1);
+  r.s2 = convert_int_sat_rtp(v.s2);
+  r.s3 = convert_int_sat_rtp(v.s3);
+  r.s4 = convert_int_sat_rtp(v.s4);
+  r.s5 = convert_int_sat_rtp(v.s5);
+  r.s6 = convert_int_sat_rtp(v.s6);
+  r.s7 = convert_int_sat_rtp(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_int_sat_rtn overload. */
+  int8 r;
+  r.s0 = convert_int_sat_rtn(v.s0);
+  r.s1 = convert_int_sat_rtn(v.s1);
+  r.s2 = convert_int_sat_rtn(v.s2);
+  r.s3 = convert_int_sat_rtn(v.s3);
+  r.s4 = convert_int_sat_rtn(v.s4);
+  r.s5 = convert_int_sat_rtn(v.s5);
+  r.s6 = convert_int_sat_rtn(v.s6);
+  r.s7 = convert_int_sat_rtn(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_uint_sat_rte overload. */
+  uint8 r;
+  r.s0 = convert_uint_sat_rte(v.s0);
+  r.s1 = convert_uint_sat_rte(v.s1);
+  r.s2 = convert_uint_sat_rte(v.s2);
+  r.s3 = convert_uint_sat_rte(v.s3);
+  r.s4 = convert_uint_sat_rte(v.s4);
+  r.s5 = convert_uint_sat_rte(v.s5);
+  r.s6 = convert_uint_sat_rte(v.s6);
+  r.s7 = convert_uint_sat_rte(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_uint_sat_rtz overload. */
+  uint8 r;
+  r.s0 = convert_uint_sat_rtz(v.s0);
+  r.s1 = convert_uint_sat_rtz(v.s1);
+  r.s2 = convert_uint_sat_rtz(v.s2);
+  r.s3 = convert_uint_sat_rtz(v.s3);
+  r.s4 = convert_uint_sat_rtz(v.s4);
+  r.s5 = convert_uint_sat_rtz(v.s5);
+  r.s6 = convert_uint_sat_rtz(v.s6);
+  r.s7 = convert_uint_sat_rtz(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_uint_sat_rtp overload. */
+  uint8 r;
+  r.s0 = convert_uint_sat_rtp(v.s0);
+  r.s1 = convert_uint_sat_rtp(v.s1);
+  r.s2 = convert_uint_sat_rtp(v.s2);
+  r.s3 = convert_uint_sat_rtp(v.s3);
+  r.s4 = convert_uint_sat_rtp(v.s4);
+  r.s5 = convert_uint_sat_rtp(v.s5);
+  r.s6 = convert_uint_sat_rtp(v.s6);
+  r.s7 = convert_uint_sat_rtp(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_uint_sat_rtn overload. */
+  uint8 r;
+  r.s0 = convert_uint_sat_rtn(v.s0);
+  r.s1 = convert_uint_sat_rtn(v.s1);
+  r.s2 = convert_uint_sat_rtn(v.s2);
+  r.s3 = convert_uint_sat_rtn(v.s3);
+  r.s4 = convert_uint_sat_rtn(v.s4);
+  r.s5 = convert_uint_sat_rtn(v.s5);
+  r.s6 = convert_uint_sat_rtn(v.s6);
+  r.s7 = convert_uint_sat_rtn(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_short_sat_rte overload. */
+  short8 r;
+  r.s0 = convert_short_sat_rte(v.s0);
+  r.s1 = convert_short_sat_rte(v.s1);
+  r.s2 = convert_short_sat_rte(v.s2);
+  r.s3 = convert_short_sat_rte(v.s3);
+  r.s4 = convert_short_sat_rte(v.s4);
+  r.s5 = convert_short_sat_rte(v.s5);
+  r.s6 = convert_short_sat_rte(v.s6);
+  r.s7 = convert_short_sat_rte(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_short_sat_rtz overload. */
+  short8 r;
+  r.s0 = convert_short_sat_rtz(v.s0);
+  r.s1 = convert_short_sat_rtz(v.s1);
+  r.s2 = convert_short_sat_rtz(v.s2);
+  r.s3 = convert_short_sat_rtz(v.s3);
+  r.s4 = convert_short_sat_rtz(v.s4);
+  r.s5 = convert_short_sat_rtz(v.s5);
+  r.s6 = convert_short_sat_rtz(v.s6);
+  r.s7 = convert_short_sat_rtz(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_short_sat_rtp overload. */
+  short8 r;
+  r.s0 = convert_short_sat_rtp(v.s0);
+  r.s1 = convert_short_sat_rtp(v.s1);
+  r.s2 = convert_short_sat_rtp(v.s2);
+  r.s3 = convert_short_sat_rtp(v.s3);
+  r.s4 = convert_short_sat_rtp(v.s4);
+  r.s5 = convert_short_sat_rtp(v.s5);
+  r.s6 = convert_short_sat_rtp(v.s6);
+  r.s7 = convert_short_sat_rtp(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_short_sat_rtn overload. */
+  short8 r;
+  r.s0 = convert_short_sat_rtn(v.s0);
+  r.s1 = convert_short_sat_rtn(v.s1);
+  r.s2 = convert_short_sat_rtn(v.s2);
+  r.s3 = convert_short_sat_rtn(v.s3);
+  r.s4 = convert_short_sat_rtn(v.s4);
+  r.s5 = convert_short_sat_rtn(v.s5);
+  r.s6 = convert_short_sat_rtn(v.s6);
+  r.s7 = convert_short_sat_rtn(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_ushort_sat_rte overload. */
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rte(v.s0);
+  r.s1 = convert_ushort_sat_rte(v.s1);
+  r.s2 = convert_ushort_sat_rte(v.s2);
+  r.s3 = convert_ushort_sat_rte(v.s3);
+  r.s4 = convert_ushort_sat_rte(v.s4);
+  r.s5 = convert_ushort_sat_rte(v.s5);
+  r.s6 = convert_ushort_sat_rte(v.s6);
+  r.s7 = convert_ushort_sat_rte(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_ushort_sat_rtz overload. */
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rtz(v.s0);
+  r.s1 = convert_ushort_sat_rtz(v.s1);
+  r.s2 = convert_ushort_sat_rtz(v.s2);
+  r.s3 = convert_ushort_sat_rtz(v.s3);
+  r.s4 = convert_ushort_sat_rtz(v.s4);
+  r.s5 = convert_ushort_sat_rtz(v.s5);
+  r.s6 = convert_ushort_sat_rtz(v.s6);
+  r.s7 = convert_ushort_sat_rtz(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_ushort_sat_rtp overload. */
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rtp(v.s0);
+  r.s1 = convert_ushort_sat_rtp(v.s1);
+  r.s2 = convert_ushort_sat_rtp(v.s2);
+  r.s3 = convert_ushort_sat_rtp(v.s3);
+  r.s4 = convert_ushort_sat_rtp(v.s4);
+  r.s5 = convert_ushort_sat_rtp(v.s5);
+  r.s6 = convert_ushort_sat_rtp(v.s6);
+  r.s7 = convert_ushort_sat_rtp(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_ushort_sat_rtn overload. */
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rtn(v.s0);
+  r.s1 = convert_ushort_sat_rtn(v.s1);
+  r.s2 = convert_ushort_sat_rtn(v.s2);
+  r.s3 = convert_ushort_sat_rtn(v.s3);
+  r.s4 = convert_ushort_sat_rtn(v.s4);
+  r.s5 = convert_ushort_sat_rtn(v.s5);
+  r.s6 = convert_ushort_sat_rtn(v.s6);
+  r.s7 = convert_ushort_sat_rtn(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_char_sat_rte overload. */
+  char8 r;
+  r.s0 = convert_char_sat_rte(v.s0);
+  r.s1 = convert_char_sat_rte(v.s1);
+  r.s2 = convert_char_sat_rte(v.s2);
+  r.s3 = convert_char_sat_rte(v.s3);
+  r.s4 = convert_char_sat_rte(v.s4);
+  r.s5 = convert_char_sat_rte(v.s5);
+  r.s6 = convert_char_sat_rte(v.s6);
+  r.s7 = convert_char_sat_rte(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_char_sat_rtz overload. */
+  char8 r;
+  r.s0 = convert_char_sat_rtz(v.s0);
+  r.s1 = convert_char_sat_rtz(v.s1);
+  r.s2 = convert_char_sat_rtz(v.s2);
+  r.s3 = convert_char_sat_rtz(v.s3);
+  r.s4 = convert_char_sat_rtz(v.s4);
+  r.s5 = convert_char_sat_rtz(v.s5);
+  r.s6 = convert_char_sat_rtz(v.s6);
+  r.s7 = convert_char_sat_rtz(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_char_sat_rtp overload. */
+  char8 r;
+  r.s0 = convert_char_sat_rtp(v.s0);
+  r.s1 = convert_char_sat_rtp(v.s1);
+  r.s2 = convert_char_sat_rtp(v.s2);
+  r.s3 = convert_char_sat_rtp(v.s3);
+  r.s4 = convert_char_sat_rtp(v.s4);
+  r.s5 = convert_char_sat_rtp(v.s5);
+  r.s6 = convert_char_sat_rtp(v.s6);
+  r.s7 = convert_char_sat_rtp(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_char_sat_rtn overload. */
+  char8 r;
+  r.s0 = convert_char_sat_rtn(v.s0);
+  r.s1 = convert_char_sat_rtn(v.s1);
+  r.s2 = convert_char_sat_rtn(v.s2);
+  r.s3 = convert_char_sat_rtn(v.s3);
+  r.s4 = convert_char_sat_rtn(v.s4);
+  r.s5 = convert_char_sat_rtn(v.s5);
+  r.s6 = convert_char_sat_rtn(v.s6);
+  r.s7 = convert_char_sat_rtn(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rte overload. */
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rte(v.s0);
+  r.s1 = convert_uchar_sat_rte(v.s1);
+  r.s2 = convert_uchar_sat_rte(v.s2);
+  r.s3 = convert_uchar_sat_rte(v.s3);
+  r.s4 = convert_uchar_sat_rte(v.s4);
+  r.s5 = convert_uchar_sat_rte(v.s5);
+  r.s6 = convert_uchar_sat_rte(v.s6);
+  r.s7 = convert_uchar_sat_rte(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rtz overload. */
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rtz(v.s0);
+  r.s1 = convert_uchar_sat_rtz(v.s1);
+  r.s2 = convert_uchar_sat_rtz(v.s2);
+  r.s3 = convert_uchar_sat_rtz(v.s3);
+  r.s4 = convert_uchar_sat_rtz(v.s4);
+  r.s5 = convert_uchar_sat_rtz(v.s5);
+  r.s6 = convert_uchar_sat_rtz(v.s6);
+  r.s7 = convert_uchar_sat_rtz(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rtp overload. */
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rtp(v.s0);
+  r.s1 = convert_uchar_sat_rtp(v.s1);
+  r.s2 = convert_uchar_sat_rtp(v.s2);
+  r.s3 = convert_uchar_sat_rtp(v.s3);
+  r.s4 = convert_uchar_sat_rtp(v.s4);
+  r.s5 = convert_uchar_sat_rtp(v.s5);
+  r.s6 = convert_uchar_sat_rtp(v.s6);
+  r.s7 = convert_uchar_sat_rtp(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(long8 v) {
+  /* Lane-wise wrapper over the scalar convert_uchar_sat_rtn overload. */
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rtn(v.s0);
+  r.s1 = convert_uchar_sat_rtn(v.s1);
+  r.s2 = convert_uchar_sat_rtn(v.s2);
+  r.s3 = convert_uchar_sat_rtn(v.s3);
+  r.s4 = convert_uchar_sat_rtn(v.s4);
+  r.s5 = convert_uchar_sat_rtn(v.s5);
+  r.s6 = convert_uchar_sat_rtn(v.s6);
+  r.s7 = convert_uchar_sat_rtn(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(ulong8 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rte overload. */
+  long8 r;
+  r.s0 = convert_long_sat_rte(v.s0);
+  r.s1 = convert_long_sat_rte(v.s1);
+  r.s2 = convert_long_sat_rte(v.s2);
+  r.s3 = convert_long_sat_rte(v.s3);
+  r.s4 = convert_long_sat_rte(v.s4);
+  r.s5 = convert_long_sat_rte(v.s5);
+  r.s6 = convert_long_sat_rte(v.s6);
+  r.s7 = convert_long_sat_rte(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(ulong8 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rtz overload. */
+  long8 r;
+  r.s0 = convert_long_sat_rtz(v.s0);
+  r.s1 = convert_long_sat_rtz(v.s1);
+  r.s2 = convert_long_sat_rtz(v.s2);
+  r.s3 = convert_long_sat_rtz(v.s3);
+  r.s4 = convert_long_sat_rtz(v.s4);
+  r.s5 = convert_long_sat_rtz(v.s5);
+  r.s6 = convert_long_sat_rtz(v.s6);
+  r.s7 = convert_long_sat_rtz(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(ulong8 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rtp overload. */
+  long8 r;
+  r.s0 = convert_long_sat_rtp(v.s0);
+  r.s1 = convert_long_sat_rtp(v.s1);
+  r.s2 = convert_long_sat_rtp(v.s2);
+  r.s3 = convert_long_sat_rtp(v.s3);
+  r.s4 = convert_long_sat_rtp(v.s4);
+  r.s5 = convert_long_sat_rtp(v.s5);
+  r.s6 = convert_long_sat_rtp(v.s6);
+  r.s7 = convert_long_sat_rtp(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(ulong8 v) {
+  /* Lane-wise wrapper over the scalar convert_long_sat_rtn overload. */
+  long8 r;
+  r.s0 = convert_long_sat_rtn(v.s0);
+  r.s1 = convert_long_sat_rtn(v.s1);
+  r.s2 = convert_long_sat_rtn(v.s2);
+  r.s3 = convert_long_sat_rtn(v.s3);
+  r.s4 = convert_long_sat_rtn(v.s4);
+  r.s5 = convert_long_sat_rtn(v.s5);
+  r.s6 = convert_long_sat_rtn(v.s6);
+  r.s7 = convert_long_sat_rtn(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(ulong8 v) {
+  /* Lane-wise wrapper over the scalar convert_ulong_sat_rte overload. */
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rte(v.s0);
+  r.s1 = convert_ulong_sat_rte(v.s1);
+  r.s2 = convert_ulong_sat_rte(v.s2);
+  r.s3 = convert_ulong_sat_rte(v.s3);
+  r.s4 = convert_ulong_sat_rte(v.s4);
+  r.s5 = convert_ulong_sat_rte(v.s5);
+  r.s6 = convert_ulong_sat_rte(v.s6);
+  r.s7 = convert_ulong_sat_rte(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(ulong8 v) {
+  /* Lane-wise wrapper over the scalar convert_ulong_sat_rtz overload. */
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rtz(v.s0);
+  r.s1 = convert_ulong_sat_rtz(v.s1);
+  r.s2 = convert_ulong_sat_rtz(v.s2);
+  r.s3 = convert_ulong_sat_rtz(v.s3);
+  r.s4 = convert_ulong_sat_rtz(v.s4);
+  r.s5 = convert_ulong_sat_rtz(v.s5);
+  r.s6 = convert_ulong_sat_rtz(v.s6);
+  r.s7 = convert_ulong_sat_rtz(v.s7);
+  return r;
+}
+
+/* ulong8 -> ulong8: one scalar convert_ulong_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(ulong8 v) {
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rtp(v.s0);
+  r.s1 = convert_ulong_sat_rtp(v.s1);
+  r.s2 = convert_ulong_sat_rtp(v.s2);
+  r.s3 = convert_ulong_sat_rtp(v.s3);
+  r.s4 = convert_ulong_sat_rtp(v.s4);
+  r.s5 = convert_ulong_sat_rtp(v.s5);
+  r.s6 = convert_ulong_sat_rtp(v.s6);
+  r.s7 = convert_ulong_sat_rtp(v.s7);
+  return r;
+}
+
+/* ulong8 -> ulong8: one scalar convert_ulong_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(ulong8 v) {
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rtn(v.s0);
+  r.s1 = convert_ulong_sat_rtn(v.s1);
+  r.s2 = convert_ulong_sat_rtn(v.s2);
+  r.s3 = convert_ulong_sat_rtn(v.s3);
+  r.s4 = convert_ulong_sat_rtn(v.s4);
+  r.s5 = convert_ulong_sat_rtn(v.s5);
+  r.s6 = convert_ulong_sat_rtn(v.s6);
+  r.s7 = convert_ulong_sat_rtn(v.s7);
+  return r;
+}
+
+/* ulong8 -> int8: one scalar convert_int_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(ulong8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rte(v.s0);
+  r.s1 = convert_int_sat_rte(v.s1);
+  r.s2 = convert_int_sat_rte(v.s2);
+  r.s3 = convert_int_sat_rte(v.s3);
+  r.s4 = convert_int_sat_rte(v.s4);
+  r.s5 = convert_int_sat_rte(v.s5);
+  r.s6 = convert_int_sat_rte(v.s6);
+  r.s7 = convert_int_sat_rte(v.s7);
+  return r;
+}
+
+/* ulong8 -> int8: one scalar convert_int_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(ulong8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rtz(v.s0);
+  r.s1 = convert_int_sat_rtz(v.s1);
+  r.s2 = convert_int_sat_rtz(v.s2);
+  r.s3 = convert_int_sat_rtz(v.s3);
+  r.s4 = convert_int_sat_rtz(v.s4);
+  r.s5 = convert_int_sat_rtz(v.s5);
+  r.s6 = convert_int_sat_rtz(v.s6);
+  r.s7 = convert_int_sat_rtz(v.s7);
+  return r;
+}
+
+/* ulong8 -> int8: one scalar convert_int_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(ulong8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rtp(v.s0);
+  r.s1 = convert_int_sat_rtp(v.s1);
+  r.s2 = convert_int_sat_rtp(v.s2);
+  r.s3 = convert_int_sat_rtp(v.s3);
+  r.s4 = convert_int_sat_rtp(v.s4);
+  r.s5 = convert_int_sat_rtp(v.s5);
+  r.s6 = convert_int_sat_rtp(v.s6);
+  r.s7 = convert_int_sat_rtp(v.s7);
+  return r;
+}
+
+/* ulong8 -> int8: one scalar convert_int_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(ulong8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rtn(v.s0);
+  r.s1 = convert_int_sat_rtn(v.s1);
+  r.s2 = convert_int_sat_rtn(v.s2);
+  r.s3 = convert_int_sat_rtn(v.s3);
+  r.s4 = convert_int_sat_rtn(v.s4);
+  r.s5 = convert_int_sat_rtn(v.s5);
+  r.s6 = convert_int_sat_rtn(v.s6);
+  r.s7 = convert_int_sat_rtn(v.s7);
+  return r;
+}
+
+/* ulong8 -> uint8: one scalar convert_uint_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(ulong8 v) {
+  uint8 r;
+  r.s0 = convert_uint_sat_rte(v.s0);
+  r.s1 = convert_uint_sat_rte(v.s1);
+  r.s2 = convert_uint_sat_rte(v.s2);
+  r.s3 = convert_uint_sat_rte(v.s3);
+  r.s4 = convert_uint_sat_rte(v.s4);
+  r.s5 = convert_uint_sat_rte(v.s5);
+  r.s6 = convert_uint_sat_rte(v.s6);
+  r.s7 = convert_uint_sat_rte(v.s7);
+  return r;
+}
+
+/* ulong8 -> uint8: one scalar convert_uint_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(ulong8 v) {
+  uint8 r;
+  r.s0 = convert_uint_sat_rtz(v.s0);
+  r.s1 = convert_uint_sat_rtz(v.s1);
+  r.s2 = convert_uint_sat_rtz(v.s2);
+  r.s3 = convert_uint_sat_rtz(v.s3);
+  r.s4 = convert_uint_sat_rtz(v.s4);
+  r.s5 = convert_uint_sat_rtz(v.s5);
+  r.s6 = convert_uint_sat_rtz(v.s6);
+  r.s7 = convert_uint_sat_rtz(v.s7);
+  return r;
+}
+
+/* ulong8 -> uint8: one scalar convert_uint_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(ulong8 v) {
+  uint8 r;
+  r.s0 = convert_uint_sat_rtp(v.s0);
+  r.s1 = convert_uint_sat_rtp(v.s1);
+  r.s2 = convert_uint_sat_rtp(v.s2);
+  r.s3 = convert_uint_sat_rtp(v.s3);
+  r.s4 = convert_uint_sat_rtp(v.s4);
+  r.s5 = convert_uint_sat_rtp(v.s5);
+  r.s6 = convert_uint_sat_rtp(v.s6);
+  r.s7 = convert_uint_sat_rtp(v.s7);
+  return r;
+}
+
+/* ulong8 -> uint8: one scalar convert_uint_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(ulong8 v) {
+  uint8 r;
+  r.s0 = convert_uint_sat_rtn(v.s0);
+  r.s1 = convert_uint_sat_rtn(v.s1);
+  r.s2 = convert_uint_sat_rtn(v.s2);
+  r.s3 = convert_uint_sat_rtn(v.s3);
+  r.s4 = convert_uint_sat_rtn(v.s4);
+  r.s5 = convert_uint_sat_rtn(v.s5);
+  r.s6 = convert_uint_sat_rtn(v.s6);
+  r.s7 = convert_uint_sat_rtn(v.s7);
+  return r;
+}
+
+/* ulong8 -> short8: one scalar convert_short_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(ulong8 v) {
+  short8 r;
+  r.s0 = convert_short_sat_rte(v.s0);
+  r.s1 = convert_short_sat_rte(v.s1);
+  r.s2 = convert_short_sat_rte(v.s2);
+  r.s3 = convert_short_sat_rte(v.s3);
+  r.s4 = convert_short_sat_rte(v.s4);
+  r.s5 = convert_short_sat_rte(v.s5);
+  r.s6 = convert_short_sat_rte(v.s6);
+  r.s7 = convert_short_sat_rte(v.s7);
+  return r;
+}
+
+/* ulong8 -> short8: one scalar convert_short_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(ulong8 v) {
+  short8 r;
+  r.s0 = convert_short_sat_rtz(v.s0);
+  r.s1 = convert_short_sat_rtz(v.s1);
+  r.s2 = convert_short_sat_rtz(v.s2);
+  r.s3 = convert_short_sat_rtz(v.s3);
+  r.s4 = convert_short_sat_rtz(v.s4);
+  r.s5 = convert_short_sat_rtz(v.s5);
+  r.s6 = convert_short_sat_rtz(v.s6);
+  r.s7 = convert_short_sat_rtz(v.s7);
+  return r;
+}
+
+/* ulong8 -> short8: one scalar convert_short_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(ulong8 v) {
+  short8 r;
+  r.s0 = convert_short_sat_rtp(v.s0);
+  r.s1 = convert_short_sat_rtp(v.s1);
+  r.s2 = convert_short_sat_rtp(v.s2);
+  r.s3 = convert_short_sat_rtp(v.s3);
+  r.s4 = convert_short_sat_rtp(v.s4);
+  r.s5 = convert_short_sat_rtp(v.s5);
+  r.s6 = convert_short_sat_rtp(v.s6);
+  r.s7 = convert_short_sat_rtp(v.s7);
+  return r;
+}
+
+/* ulong8 -> short8: one scalar convert_short_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(ulong8 v) {
+  short8 r;
+  r.s0 = convert_short_sat_rtn(v.s0);
+  r.s1 = convert_short_sat_rtn(v.s1);
+  r.s2 = convert_short_sat_rtn(v.s2);
+  r.s3 = convert_short_sat_rtn(v.s3);
+  r.s4 = convert_short_sat_rtn(v.s4);
+  r.s5 = convert_short_sat_rtn(v.s5);
+  r.s6 = convert_short_sat_rtn(v.s6);
+  r.s7 = convert_short_sat_rtn(v.s7);
+  return r;
+}
+
+/* ulong8 -> ushort8: one scalar convert_ushort_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(ulong8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rte(v.s0);
+  r.s1 = convert_ushort_sat_rte(v.s1);
+  r.s2 = convert_ushort_sat_rte(v.s2);
+  r.s3 = convert_ushort_sat_rte(v.s3);
+  r.s4 = convert_ushort_sat_rte(v.s4);
+  r.s5 = convert_ushort_sat_rte(v.s5);
+  r.s6 = convert_ushort_sat_rte(v.s6);
+  r.s7 = convert_ushort_sat_rte(v.s7);
+  return r;
+}
+
+/* ulong8 -> ushort8: one scalar convert_ushort_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(ulong8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rtz(v.s0);
+  r.s1 = convert_ushort_sat_rtz(v.s1);
+  r.s2 = convert_ushort_sat_rtz(v.s2);
+  r.s3 = convert_ushort_sat_rtz(v.s3);
+  r.s4 = convert_ushort_sat_rtz(v.s4);
+  r.s5 = convert_ushort_sat_rtz(v.s5);
+  r.s6 = convert_ushort_sat_rtz(v.s6);
+  r.s7 = convert_ushort_sat_rtz(v.s7);
+  return r;
+}
+
+/* ulong8 -> ushort8: one scalar convert_ushort_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(ulong8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rtp(v.s0);
+  r.s1 = convert_ushort_sat_rtp(v.s1);
+  r.s2 = convert_ushort_sat_rtp(v.s2);
+  r.s3 = convert_ushort_sat_rtp(v.s3);
+  r.s4 = convert_ushort_sat_rtp(v.s4);
+  r.s5 = convert_ushort_sat_rtp(v.s5);
+  r.s6 = convert_ushort_sat_rtp(v.s6);
+  r.s7 = convert_ushort_sat_rtp(v.s7);
+  return r;
+}
+
+/* ulong8 -> ushort8: one scalar convert_ushort_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(ulong8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rtn(v.s0);
+  r.s1 = convert_ushort_sat_rtn(v.s1);
+  r.s2 = convert_ushort_sat_rtn(v.s2);
+  r.s3 = convert_ushort_sat_rtn(v.s3);
+  r.s4 = convert_ushort_sat_rtn(v.s4);
+  r.s5 = convert_ushort_sat_rtn(v.s5);
+  r.s6 = convert_ushort_sat_rtn(v.s6);
+  r.s7 = convert_ushort_sat_rtn(v.s7);
+  return r;
+}
+
+/* ulong8 -> char8: one scalar convert_char_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(ulong8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rte(v.s0);
+  r.s1 = convert_char_sat_rte(v.s1);
+  r.s2 = convert_char_sat_rte(v.s2);
+  r.s3 = convert_char_sat_rte(v.s3);
+  r.s4 = convert_char_sat_rte(v.s4);
+  r.s5 = convert_char_sat_rte(v.s5);
+  r.s6 = convert_char_sat_rte(v.s6);
+  r.s7 = convert_char_sat_rte(v.s7);
+  return r;
+}
+
+/* ulong8 -> char8: one scalar convert_char_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(ulong8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rtz(v.s0);
+  r.s1 = convert_char_sat_rtz(v.s1);
+  r.s2 = convert_char_sat_rtz(v.s2);
+  r.s3 = convert_char_sat_rtz(v.s3);
+  r.s4 = convert_char_sat_rtz(v.s4);
+  r.s5 = convert_char_sat_rtz(v.s5);
+  r.s6 = convert_char_sat_rtz(v.s6);
+  r.s7 = convert_char_sat_rtz(v.s7);
+  return r;
+}
+
+/* ulong8 -> char8: one scalar convert_char_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(ulong8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rtp(v.s0);
+  r.s1 = convert_char_sat_rtp(v.s1);
+  r.s2 = convert_char_sat_rtp(v.s2);
+  r.s3 = convert_char_sat_rtp(v.s3);
+  r.s4 = convert_char_sat_rtp(v.s4);
+  r.s5 = convert_char_sat_rtp(v.s5);
+  r.s6 = convert_char_sat_rtp(v.s6);
+  r.s7 = convert_char_sat_rtp(v.s7);
+  return r;
+}
+
+/* ulong8 -> char8: one scalar convert_char_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(ulong8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rtn(v.s0);
+  r.s1 = convert_char_sat_rtn(v.s1);
+  r.s2 = convert_char_sat_rtn(v.s2);
+  r.s3 = convert_char_sat_rtn(v.s3);
+  r.s4 = convert_char_sat_rtn(v.s4);
+  r.s5 = convert_char_sat_rtn(v.s5);
+  r.s6 = convert_char_sat_rtn(v.s6);
+  r.s7 = convert_char_sat_rtn(v.s7);
+  return r;
+}
+
+/* ulong8 -> uchar8: one scalar convert_uchar_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(ulong8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rte(v.s0);
+  r.s1 = convert_uchar_sat_rte(v.s1);
+  r.s2 = convert_uchar_sat_rte(v.s2);
+  r.s3 = convert_uchar_sat_rte(v.s3);
+  r.s4 = convert_uchar_sat_rte(v.s4);
+  r.s5 = convert_uchar_sat_rte(v.s5);
+  r.s6 = convert_uchar_sat_rte(v.s6);
+  r.s7 = convert_uchar_sat_rte(v.s7);
+  return r;
+}
+
+/* ulong8 -> uchar8: one scalar convert_uchar_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(ulong8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rtz(v.s0);
+  r.s1 = convert_uchar_sat_rtz(v.s1);
+  r.s2 = convert_uchar_sat_rtz(v.s2);
+  r.s3 = convert_uchar_sat_rtz(v.s3);
+  r.s4 = convert_uchar_sat_rtz(v.s4);
+  r.s5 = convert_uchar_sat_rtz(v.s5);
+  r.s6 = convert_uchar_sat_rtz(v.s6);
+  r.s7 = convert_uchar_sat_rtz(v.s7);
+  return r;
+}
+
+/* ulong8 -> uchar8: one scalar convert_uchar_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(ulong8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rtp(v.s0);
+  r.s1 = convert_uchar_sat_rtp(v.s1);
+  r.s2 = convert_uchar_sat_rtp(v.s2);
+  r.s3 = convert_uchar_sat_rtp(v.s3);
+  r.s4 = convert_uchar_sat_rtp(v.s4);
+  r.s5 = convert_uchar_sat_rtp(v.s5);
+  r.s6 = convert_uchar_sat_rtp(v.s6);
+  r.s7 = convert_uchar_sat_rtp(v.s7);
+  return r;
+}
+
+/* ulong8 -> uchar8: one scalar convert_uchar_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(ulong8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rtn(v.s0);
+  r.s1 = convert_uchar_sat_rtn(v.s1);
+  r.s2 = convert_uchar_sat_rtn(v.s2);
+  r.s3 = convert_uchar_sat_rtn(v.s3);
+  r.s4 = convert_uchar_sat_rtn(v.s4);
+  r.s5 = convert_uchar_sat_rtn(v.s5);
+  r.s6 = convert_uchar_sat_rtn(v.s6);
+  r.s7 = convert_uchar_sat_rtn(v.s7);
+  return r;
+}
+
+/* int8 -> long8: one scalar convert_long_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(int8 v) {
+  long8 r;
+  r.s0 = convert_long_sat_rte(v.s0);
+  r.s1 = convert_long_sat_rte(v.s1);
+  r.s2 = convert_long_sat_rte(v.s2);
+  r.s3 = convert_long_sat_rte(v.s3);
+  r.s4 = convert_long_sat_rte(v.s4);
+  r.s5 = convert_long_sat_rte(v.s5);
+  r.s6 = convert_long_sat_rte(v.s6);
+  r.s7 = convert_long_sat_rte(v.s7);
+  return r;
+}
+
+/* int8 -> long8: one scalar convert_long_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(int8 v) {
+  long8 r;
+  r.s0 = convert_long_sat_rtz(v.s0);
+  r.s1 = convert_long_sat_rtz(v.s1);
+  r.s2 = convert_long_sat_rtz(v.s2);
+  r.s3 = convert_long_sat_rtz(v.s3);
+  r.s4 = convert_long_sat_rtz(v.s4);
+  r.s5 = convert_long_sat_rtz(v.s5);
+  r.s6 = convert_long_sat_rtz(v.s6);
+  r.s7 = convert_long_sat_rtz(v.s7);
+  return r;
+}
+
+/* int8 -> long8: one scalar convert_long_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(int8 v) {
+  long8 r;
+  r.s0 = convert_long_sat_rtp(v.s0);
+  r.s1 = convert_long_sat_rtp(v.s1);
+  r.s2 = convert_long_sat_rtp(v.s2);
+  r.s3 = convert_long_sat_rtp(v.s3);
+  r.s4 = convert_long_sat_rtp(v.s4);
+  r.s5 = convert_long_sat_rtp(v.s5);
+  r.s6 = convert_long_sat_rtp(v.s6);
+  r.s7 = convert_long_sat_rtp(v.s7);
+  return r;
+}
+
+/* int8 -> long8: one scalar convert_long_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(int8 v) {
+  long8 r;
+  r.s0 = convert_long_sat_rtn(v.s0);
+  r.s1 = convert_long_sat_rtn(v.s1);
+  r.s2 = convert_long_sat_rtn(v.s2);
+  r.s3 = convert_long_sat_rtn(v.s3);
+  r.s4 = convert_long_sat_rtn(v.s4);
+  r.s5 = convert_long_sat_rtn(v.s5);
+  r.s6 = convert_long_sat_rtn(v.s6);
+  r.s7 = convert_long_sat_rtn(v.s7);
+  return r;
+}
+
+/* int8 -> ulong8: one scalar convert_ulong_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(int8 v) {
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rte(v.s0);
+  r.s1 = convert_ulong_sat_rte(v.s1);
+  r.s2 = convert_ulong_sat_rte(v.s2);
+  r.s3 = convert_ulong_sat_rte(v.s3);
+  r.s4 = convert_ulong_sat_rte(v.s4);
+  r.s5 = convert_ulong_sat_rte(v.s5);
+  r.s6 = convert_ulong_sat_rte(v.s6);
+  r.s7 = convert_ulong_sat_rte(v.s7);
+  return r;
+}
+
+/* int8 -> ulong8: one scalar convert_ulong_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(int8 v) {
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rtz(v.s0);
+  r.s1 = convert_ulong_sat_rtz(v.s1);
+  r.s2 = convert_ulong_sat_rtz(v.s2);
+  r.s3 = convert_ulong_sat_rtz(v.s3);
+  r.s4 = convert_ulong_sat_rtz(v.s4);
+  r.s5 = convert_ulong_sat_rtz(v.s5);
+  r.s6 = convert_ulong_sat_rtz(v.s6);
+  r.s7 = convert_ulong_sat_rtz(v.s7);
+  return r;
+}
+
+/* int8 -> ulong8: one scalar convert_ulong_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(int8 v) {
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rtp(v.s0);
+  r.s1 = convert_ulong_sat_rtp(v.s1);
+  r.s2 = convert_ulong_sat_rtp(v.s2);
+  r.s3 = convert_ulong_sat_rtp(v.s3);
+  r.s4 = convert_ulong_sat_rtp(v.s4);
+  r.s5 = convert_ulong_sat_rtp(v.s5);
+  r.s6 = convert_ulong_sat_rtp(v.s6);
+  r.s7 = convert_ulong_sat_rtp(v.s7);
+  return r;
+}
+
+/* int8 -> ulong8: one scalar convert_ulong_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(int8 v) {
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rtn(v.s0);
+  r.s1 = convert_ulong_sat_rtn(v.s1);
+  r.s2 = convert_ulong_sat_rtn(v.s2);
+  r.s3 = convert_ulong_sat_rtn(v.s3);
+  r.s4 = convert_ulong_sat_rtn(v.s4);
+  r.s5 = convert_ulong_sat_rtn(v.s5);
+  r.s6 = convert_ulong_sat_rtn(v.s6);
+  r.s7 = convert_ulong_sat_rtn(v.s7);
+  return r;
+}
+
+/* int8 -> int8: one scalar convert_int_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(int8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rte(v.s0);
+  r.s1 = convert_int_sat_rte(v.s1);
+  r.s2 = convert_int_sat_rte(v.s2);
+  r.s3 = convert_int_sat_rte(v.s3);
+  r.s4 = convert_int_sat_rte(v.s4);
+  r.s5 = convert_int_sat_rte(v.s5);
+  r.s6 = convert_int_sat_rte(v.s6);
+  r.s7 = convert_int_sat_rte(v.s7);
+  return r;
+}
+
+/* int8 -> int8: one scalar convert_int_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(int8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rtz(v.s0);
+  r.s1 = convert_int_sat_rtz(v.s1);
+  r.s2 = convert_int_sat_rtz(v.s2);
+  r.s3 = convert_int_sat_rtz(v.s3);
+  r.s4 = convert_int_sat_rtz(v.s4);
+  r.s5 = convert_int_sat_rtz(v.s5);
+  r.s6 = convert_int_sat_rtz(v.s6);
+  r.s7 = convert_int_sat_rtz(v.s7);
+  return r;
+}
+
+/* int8 -> int8: one scalar convert_int_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(int8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rtp(v.s0);
+  r.s1 = convert_int_sat_rtp(v.s1);
+  r.s2 = convert_int_sat_rtp(v.s2);
+  r.s3 = convert_int_sat_rtp(v.s3);
+  r.s4 = convert_int_sat_rtp(v.s4);
+  r.s5 = convert_int_sat_rtp(v.s5);
+  r.s6 = convert_int_sat_rtp(v.s6);
+  r.s7 = convert_int_sat_rtp(v.s7);
+  return r;
+}
+
+/* int8 -> int8: one scalar convert_int_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(int8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rtn(v.s0);
+  r.s1 = convert_int_sat_rtn(v.s1);
+  r.s2 = convert_int_sat_rtn(v.s2);
+  r.s3 = convert_int_sat_rtn(v.s3);
+  r.s4 = convert_int_sat_rtn(v.s4);
+  r.s5 = convert_int_sat_rtn(v.s5);
+  r.s6 = convert_int_sat_rtn(v.s6);
+  r.s7 = convert_int_sat_rtn(v.s7);
+  return r;
+}
+
+/* int8 -> uint8: one scalar convert_uint_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(int8 v) {
+  uint8 r;
+  r.s0 = convert_uint_sat_rte(v.s0);
+  r.s1 = convert_uint_sat_rte(v.s1);
+  r.s2 = convert_uint_sat_rte(v.s2);
+  r.s3 = convert_uint_sat_rte(v.s3);
+  r.s4 = convert_uint_sat_rte(v.s4);
+  r.s5 = convert_uint_sat_rte(v.s5);
+  r.s6 = convert_uint_sat_rte(v.s6);
+  r.s7 = convert_uint_sat_rte(v.s7);
+  return r;
+}
+
+/* int8 -> uint8: one scalar convert_uint_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(int8 v) {
+  uint8 r;
+  r.s0 = convert_uint_sat_rtz(v.s0);
+  r.s1 = convert_uint_sat_rtz(v.s1);
+  r.s2 = convert_uint_sat_rtz(v.s2);
+  r.s3 = convert_uint_sat_rtz(v.s3);
+  r.s4 = convert_uint_sat_rtz(v.s4);
+  r.s5 = convert_uint_sat_rtz(v.s5);
+  r.s6 = convert_uint_sat_rtz(v.s6);
+  r.s7 = convert_uint_sat_rtz(v.s7);
+  return r;
+}
+
+/* int8 -> uint8: one scalar convert_uint_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(int8 v) {
+  uint8 r;
+  r.s0 = convert_uint_sat_rtp(v.s0);
+  r.s1 = convert_uint_sat_rtp(v.s1);
+  r.s2 = convert_uint_sat_rtp(v.s2);
+  r.s3 = convert_uint_sat_rtp(v.s3);
+  r.s4 = convert_uint_sat_rtp(v.s4);
+  r.s5 = convert_uint_sat_rtp(v.s5);
+  r.s6 = convert_uint_sat_rtp(v.s6);
+  r.s7 = convert_uint_sat_rtp(v.s7);
+  return r;
+}
+
+/* int8 -> uint8: one scalar convert_uint_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(int8 v) {
+  uint8 r;
+  r.s0 = convert_uint_sat_rtn(v.s0);
+  r.s1 = convert_uint_sat_rtn(v.s1);
+  r.s2 = convert_uint_sat_rtn(v.s2);
+  r.s3 = convert_uint_sat_rtn(v.s3);
+  r.s4 = convert_uint_sat_rtn(v.s4);
+  r.s5 = convert_uint_sat_rtn(v.s5);
+  r.s6 = convert_uint_sat_rtn(v.s6);
+  r.s7 = convert_uint_sat_rtn(v.s7);
+  return r;
+}
+
+/* int8 -> short8: one scalar convert_short_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(int8 v) {
+  short8 r;
+  r.s0 = convert_short_sat_rte(v.s0);
+  r.s1 = convert_short_sat_rte(v.s1);
+  r.s2 = convert_short_sat_rte(v.s2);
+  r.s3 = convert_short_sat_rte(v.s3);
+  r.s4 = convert_short_sat_rte(v.s4);
+  r.s5 = convert_short_sat_rte(v.s5);
+  r.s6 = convert_short_sat_rte(v.s6);
+  r.s7 = convert_short_sat_rte(v.s7);
+  return r;
+}
+
+/* int8 -> short8: one scalar convert_short_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(int8 v) {
+  short8 r;
+  r.s0 = convert_short_sat_rtz(v.s0);
+  r.s1 = convert_short_sat_rtz(v.s1);
+  r.s2 = convert_short_sat_rtz(v.s2);
+  r.s3 = convert_short_sat_rtz(v.s3);
+  r.s4 = convert_short_sat_rtz(v.s4);
+  r.s5 = convert_short_sat_rtz(v.s5);
+  r.s6 = convert_short_sat_rtz(v.s6);
+  r.s7 = convert_short_sat_rtz(v.s7);
+  return r;
+}
+
+/* int8 -> short8: one scalar convert_short_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(int8 v) {
+  short8 r;
+  r.s0 = convert_short_sat_rtp(v.s0);
+  r.s1 = convert_short_sat_rtp(v.s1);
+  r.s2 = convert_short_sat_rtp(v.s2);
+  r.s3 = convert_short_sat_rtp(v.s3);
+  r.s4 = convert_short_sat_rtp(v.s4);
+  r.s5 = convert_short_sat_rtp(v.s5);
+  r.s6 = convert_short_sat_rtp(v.s6);
+  r.s7 = convert_short_sat_rtp(v.s7);
+  return r;
+}
+
+/* int8 -> short8: one scalar convert_short_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(int8 v) {
+  short8 r;
+  r.s0 = convert_short_sat_rtn(v.s0);
+  r.s1 = convert_short_sat_rtn(v.s1);
+  r.s2 = convert_short_sat_rtn(v.s2);
+  r.s3 = convert_short_sat_rtn(v.s3);
+  r.s4 = convert_short_sat_rtn(v.s4);
+  r.s5 = convert_short_sat_rtn(v.s5);
+  r.s6 = convert_short_sat_rtn(v.s6);
+  r.s7 = convert_short_sat_rtn(v.s7);
+  return r;
+}
+
+/* int8 -> ushort8: one scalar convert_ushort_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(int8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rte(v.s0);
+  r.s1 = convert_ushort_sat_rte(v.s1);
+  r.s2 = convert_ushort_sat_rte(v.s2);
+  r.s3 = convert_ushort_sat_rte(v.s3);
+  r.s4 = convert_ushort_sat_rte(v.s4);
+  r.s5 = convert_ushort_sat_rte(v.s5);
+  r.s6 = convert_ushort_sat_rte(v.s6);
+  r.s7 = convert_ushort_sat_rte(v.s7);
+  return r;
+}
+
+/* int8 -> ushort8: one scalar convert_ushort_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(int8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rtz(v.s0);
+  r.s1 = convert_ushort_sat_rtz(v.s1);
+  r.s2 = convert_ushort_sat_rtz(v.s2);
+  r.s3 = convert_ushort_sat_rtz(v.s3);
+  r.s4 = convert_ushort_sat_rtz(v.s4);
+  r.s5 = convert_ushort_sat_rtz(v.s5);
+  r.s6 = convert_ushort_sat_rtz(v.s6);
+  r.s7 = convert_ushort_sat_rtz(v.s7);
+  return r;
+}
+
+/* int8 -> ushort8: one scalar convert_ushort_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(int8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rtp(v.s0);
+  r.s1 = convert_ushort_sat_rtp(v.s1);
+  r.s2 = convert_ushort_sat_rtp(v.s2);
+  r.s3 = convert_ushort_sat_rtp(v.s3);
+  r.s4 = convert_ushort_sat_rtp(v.s4);
+  r.s5 = convert_ushort_sat_rtp(v.s5);
+  r.s6 = convert_ushort_sat_rtp(v.s6);
+  r.s7 = convert_ushort_sat_rtp(v.s7);
+  return r;
+}
+
+/* int8 -> ushort8: one scalar convert_ushort_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(int8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rtn(v.s0);
+  r.s1 = convert_ushort_sat_rtn(v.s1);
+  r.s2 = convert_ushort_sat_rtn(v.s2);
+  r.s3 = convert_ushort_sat_rtn(v.s3);
+  r.s4 = convert_ushort_sat_rtn(v.s4);
+  r.s5 = convert_ushort_sat_rtn(v.s5);
+  r.s6 = convert_ushort_sat_rtn(v.s6);
+  r.s7 = convert_ushort_sat_rtn(v.s7);
+  return r;
+}
+
+/* int8 -> char8: one scalar convert_char_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(int8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rte(v.s0);
+  r.s1 = convert_char_sat_rte(v.s1);
+  r.s2 = convert_char_sat_rte(v.s2);
+  r.s3 = convert_char_sat_rte(v.s3);
+  r.s4 = convert_char_sat_rte(v.s4);
+  r.s5 = convert_char_sat_rte(v.s5);
+  r.s6 = convert_char_sat_rte(v.s6);
+  r.s7 = convert_char_sat_rte(v.s7);
+  return r;
+}
+
+/* int8 -> char8: one scalar convert_char_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(int8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rtz(v.s0);
+  r.s1 = convert_char_sat_rtz(v.s1);
+  r.s2 = convert_char_sat_rtz(v.s2);
+  r.s3 = convert_char_sat_rtz(v.s3);
+  r.s4 = convert_char_sat_rtz(v.s4);
+  r.s5 = convert_char_sat_rtz(v.s5);
+  r.s6 = convert_char_sat_rtz(v.s6);
+  r.s7 = convert_char_sat_rtz(v.s7);
+  return r;
+}
+
+/* int8 -> char8: one scalar convert_char_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(int8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rtp(v.s0);
+  r.s1 = convert_char_sat_rtp(v.s1);
+  r.s2 = convert_char_sat_rtp(v.s2);
+  r.s3 = convert_char_sat_rtp(v.s3);
+  r.s4 = convert_char_sat_rtp(v.s4);
+  r.s5 = convert_char_sat_rtp(v.s5);
+  r.s6 = convert_char_sat_rtp(v.s6);
+  r.s7 = convert_char_sat_rtp(v.s7);
+  return r;
+}
+
+/* int8 -> char8: one scalar convert_char_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(int8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rtn(v.s0);
+  r.s1 = convert_char_sat_rtn(v.s1);
+  r.s2 = convert_char_sat_rtn(v.s2);
+  r.s3 = convert_char_sat_rtn(v.s3);
+  r.s4 = convert_char_sat_rtn(v.s4);
+  r.s5 = convert_char_sat_rtn(v.s5);
+  r.s6 = convert_char_sat_rtn(v.s6);
+  r.s7 = convert_char_sat_rtn(v.s7);
+  return r;
+}
+
+/* int8 -> uchar8: one scalar convert_uchar_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(int8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rte(v.s0);
+  r.s1 = convert_uchar_sat_rte(v.s1);
+  r.s2 = convert_uchar_sat_rte(v.s2);
+  r.s3 = convert_uchar_sat_rte(v.s3);
+  r.s4 = convert_uchar_sat_rte(v.s4);
+  r.s5 = convert_uchar_sat_rte(v.s5);
+  r.s6 = convert_uchar_sat_rte(v.s6);
+  r.s7 = convert_uchar_sat_rte(v.s7);
+  return r;
+}
+
+/* int8 -> uchar8: one scalar convert_uchar_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(int8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rtz(v.s0);
+  r.s1 = convert_uchar_sat_rtz(v.s1);
+  r.s2 = convert_uchar_sat_rtz(v.s2);
+  r.s3 = convert_uchar_sat_rtz(v.s3);
+  r.s4 = convert_uchar_sat_rtz(v.s4);
+  r.s5 = convert_uchar_sat_rtz(v.s5);
+  r.s6 = convert_uchar_sat_rtz(v.s6);
+  r.s7 = convert_uchar_sat_rtz(v.s7);
+  return r;
+}
+
+/* int8 -> uchar8: one scalar convert_uchar_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(int8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rtp(v.s0);
+  r.s1 = convert_uchar_sat_rtp(v.s1);
+  r.s2 = convert_uchar_sat_rtp(v.s2);
+  r.s3 = convert_uchar_sat_rtp(v.s3);
+  r.s4 = convert_uchar_sat_rtp(v.s4);
+  r.s5 = convert_uchar_sat_rtp(v.s5);
+  r.s6 = convert_uchar_sat_rtp(v.s6);
+  r.s7 = convert_uchar_sat_rtp(v.s7);
+  return r;
+}
+
+/* int8 -> uchar8: one scalar convert_uchar_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(int8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rtn(v.s0);
+  r.s1 = convert_uchar_sat_rtn(v.s1);
+  r.s2 = convert_uchar_sat_rtn(v.s2);
+  r.s3 = convert_uchar_sat_rtn(v.s3);
+  r.s4 = convert_uchar_sat_rtn(v.s4);
+  r.s5 = convert_uchar_sat_rtn(v.s5);
+  r.s6 = convert_uchar_sat_rtn(v.s6);
+  r.s7 = convert_uchar_sat_rtn(v.s7);
+  return r;
+}
+
+/* uint8 -> long8: one scalar convert_long_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(uint8 v) {
+  long8 r;
+  r.s0 = convert_long_sat_rte(v.s0);
+  r.s1 = convert_long_sat_rte(v.s1);
+  r.s2 = convert_long_sat_rte(v.s2);
+  r.s3 = convert_long_sat_rte(v.s3);
+  r.s4 = convert_long_sat_rte(v.s4);
+  r.s5 = convert_long_sat_rte(v.s5);
+  r.s6 = convert_long_sat_rte(v.s6);
+  r.s7 = convert_long_sat_rte(v.s7);
+  return r;
+}
+
+/* uint8 -> long8: one scalar convert_long_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(uint8 v) {
+  long8 r;
+  r.s0 = convert_long_sat_rtz(v.s0);
+  r.s1 = convert_long_sat_rtz(v.s1);
+  r.s2 = convert_long_sat_rtz(v.s2);
+  r.s3 = convert_long_sat_rtz(v.s3);
+  r.s4 = convert_long_sat_rtz(v.s4);
+  r.s5 = convert_long_sat_rtz(v.s5);
+  r.s6 = convert_long_sat_rtz(v.s6);
+  r.s7 = convert_long_sat_rtz(v.s7);
+  return r;
+}
+
+/* uint8 -> long8: one scalar convert_long_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(uint8 v) {
+  long8 r;
+  r.s0 = convert_long_sat_rtp(v.s0);
+  r.s1 = convert_long_sat_rtp(v.s1);
+  r.s2 = convert_long_sat_rtp(v.s2);
+  r.s3 = convert_long_sat_rtp(v.s3);
+  r.s4 = convert_long_sat_rtp(v.s4);
+  r.s5 = convert_long_sat_rtp(v.s5);
+  r.s6 = convert_long_sat_rtp(v.s6);
+  r.s7 = convert_long_sat_rtp(v.s7);
+  return r;
+}
+
+/* uint8 -> long8: one scalar convert_long_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(uint8 v) {
+  long8 r;
+  r.s0 = convert_long_sat_rtn(v.s0);
+  r.s1 = convert_long_sat_rtn(v.s1);
+  r.s2 = convert_long_sat_rtn(v.s2);
+  r.s3 = convert_long_sat_rtn(v.s3);
+  r.s4 = convert_long_sat_rtn(v.s4);
+  r.s5 = convert_long_sat_rtn(v.s5);
+  r.s6 = convert_long_sat_rtn(v.s6);
+  r.s7 = convert_long_sat_rtn(v.s7);
+  return r;
+}
+
+/* uint8 -> ulong8: one scalar convert_ulong_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(uint8 v) {
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rte(v.s0);
+  r.s1 = convert_ulong_sat_rte(v.s1);
+  r.s2 = convert_ulong_sat_rte(v.s2);
+  r.s3 = convert_ulong_sat_rte(v.s3);
+  r.s4 = convert_ulong_sat_rte(v.s4);
+  r.s5 = convert_ulong_sat_rte(v.s5);
+  r.s6 = convert_ulong_sat_rte(v.s6);
+  r.s7 = convert_ulong_sat_rte(v.s7);
+  return r;
+}
+
+/* uint8 -> ulong8: one scalar convert_ulong_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(uint8 v) {
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rtz(v.s0);
+  r.s1 = convert_ulong_sat_rtz(v.s1);
+  r.s2 = convert_ulong_sat_rtz(v.s2);
+  r.s3 = convert_ulong_sat_rtz(v.s3);
+  r.s4 = convert_ulong_sat_rtz(v.s4);
+  r.s5 = convert_ulong_sat_rtz(v.s5);
+  r.s6 = convert_ulong_sat_rtz(v.s6);
+  r.s7 = convert_ulong_sat_rtz(v.s7);
+  return r;
+}
+
+/* uint8 -> ulong8: one scalar convert_ulong_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(uint8 v) {
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rtp(v.s0);
+  r.s1 = convert_ulong_sat_rtp(v.s1);
+  r.s2 = convert_ulong_sat_rtp(v.s2);
+  r.s3 = convert_ulong_sat_rtp(v.s3);
+  r.s4 = convert_ulong_sat_rtp(v.s4);
+  r.s5 = convert_ulong_sat_rtp(v.s5);
+  r.s6 = convert_ulong_sat_rtp(v.s6);
+  r.s7 = convert_ulong_sat_rtp(v.s7);
+  return r;
+}
+
+/* uint8 -> ulong8: one scalar convert_ulong_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(uint8 v) {
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rtn(v.s0);
+  r.s1 = convert_ulong_sat_rtn(v.s1);
+  r.s2 = convert_ulong_sat_rtn(v.s2);
+  r.s3 = convert_ulong_sat_rtn(v.s3);
+  r.s4 = convert_ulong_sat_rtn(v.s4);
+  r.s5 = convert_ulong_sat_rtn(v.s5);
+  r.s6 = convert_ulong_sat_rtn(v.s6);
+  r.s7 = convert_ulong_sat_rtn(v.s7);
+  return r;
+}
+
+/* uint8 -> int8: one scalar convert_int_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(uint8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rte(v.s0);
+  r.s1 = convert_int_sat_rte(v.s1);
+  r.s2 = convert_int_sat_rte(v.s2);
+  r.s3 = convert_int_sat_rte(v.s3);
+  r.s4 = convert_int_sat_rte(v.s4);
+  r.s5 = convert_int_sat_rte(v.s5);
+  r.s6 = convert_int_sat_rte(v.s6);
+  r.s7 = convert_int_sat_rte(v.s7);
+  return r;
+}
+
+/* uint8 -> int8: one scalar convert_int_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(uint8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rtz(v.s0);
+  r.s1 = convert_int_sat_rtz(v.s1);
+  r.s2 = convert_int_sat_rtz(v.s2);
+  r.s3 = convert_int_sat_rtz(v.s3);
+  r.s4 = convert_int_sat_rtz(v.s4);
+  r.s5 = convert_int_sat_rtz(v.s5);
+  r.s6 = convert_int_sat_rtz(v.s6);
+  r.s7 = convert_int_sat_rtz(v.s7);
+  return r;
+}
+
+/* uint8 -> int8: one scalar convert_int_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(uint8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rtp(v.s0);
+  r.s1 = convert_int_sat_rtp(v.s1);
+  r.s2 = convert_int_sat_rtp(v.s2);
+  r.s3 = convert_int_sat_rtp(v.s3);
+  r.s4 = convert_int_sat_rtp(v.s4);
+  r.s5 = convert_int_sat_rtp(v.s5);
+  r.s6 = convert_int_sat_rtp(v.s6);
+  r.s7 = convert_int_sat_rtp(v.s7);
+  return r;
+}
+
+/* uint8 -> int8: one scalar convert_int_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(uint8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rtn(v.s0);
+  r.s1 = convert_int_sat_rtn(v.s1);
+  r.s2 = convert_int_sat_rtn(v.s2);
+  r.s3 = convert_int_sat_rtn(v.s3);
+  r.s4 = convert_int_sat_rtn(v.s4);
+  r.s5 = convert_int_sat_rtn(v.s5);
+  r.s6 = convert_int_sat_rtn(v.s6);
+  r.s7 = convert_int_sat_rtn(v.s7);
+  return r;
+}
+
+/* uint8 -> uint8: one scalar convert_uint_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(uint8 v) {
+  uint8 r;
+  r.s0 = convert_uint_sat_rte(v.s0);
+  r.s1 = convert_uint_sat_rte(v.s1);
+  r.s2 = convert_uint_sat_rte(v.s2);
+  r.s3 = convert_uint_sat_rte(v.s3);
+  r.s4 = convert_uint_sat_rte(v.s4);
+  r.s5 = convert_uint_sat_rte(v.s5);
+  r.s6 = convert_uint_sat_rte(v.s6);
+  r.s7 = convert_uint_sat_rte(v.s7);
+  return r;
+}
+
+/* uint8 -> uint8: one scalar convert_uint_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(uint8 v) {
+  uint8 r;
+  r.s0 = convert_uint_sat_rtz(v.s0);
+  r.s1 = convert_uint_sat_rtz(v.s1);
+  r.s2 = convert_uint_sat_rtz(v.s2);
+  r.s3 = convert_uint_sat_rtz(v.s3);
+  r.s4 = convert_uint_sat_rtz(v.s4);
+  r.s5 = convert_uint_sat_rtz(v.s5);
+  r.s6 = convert_uint_sat_rtz(v.s6);
+  r.s7 = convert_uint_sat_rtz(v.s7);
+  return r;
+}
+
+/* uint8 -> uint8: one scalar convert_uint_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(uint8 v) {
+  uint8 r;
+  r.s0 = convert_uint_sat_rtp(v.s0);
+  r.s1 = convert_uint_sat_rtp(v.s1);
+  r.s2 = convert_uint_sat_rtp(v.s2);
+  r.s3 = convert_uint_sat_rtp(v.s3);
+  r.s4 = convert_uint_sat_rtp(v.s4);
+  r.s5 = convert_uint_sat_rtp(v.s5);
+  r.s6 = convert_uint_sat_rtp(v.s6);
+  r.s7 = convert_uint_sat_rtp(v.s7);
+  return r;
+}
+
+/* uint8 -> uint8: one scalar convert_uint_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(uint8 v) {
+  uint8 r;
+  r.s0 = convert_uint_sat_rtn(v.s0);
+  r.s1 = convert_uint_sat_rtn(v.s1);
+  r.s2 = convert_uint_sat_rtn(v.s2);
+  r.s3 = convert_uint_sat_rtn(v.s3);
+  r.s4 = convert_uint_sat_rtn(v.s4);
+  r.s5 = convert_uint_sat_rtn(v.s5);
+  r.s6 = convert_uint_sat_rtn(v.s6);
+  r.s7 = convert_uint_sat_rtn(v.s7);
+  return r;
+}
+
+/* uint8 -> short8: one scalar convert_short_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(uint8 v) {
+  short8 r;
+  r.s0 = convert_short_sat_rte(v.s0);
+  r.s1 = convert_short_sat_rte(v.s1);
+  r.s2 = convert_short_sat_rte(v.s2);
+  r.s3 = convert_short_sat_rte(v.s3);
+  r.s4 = convert_short_sat_rte(v.s4);
+  r.s5 = convert_short_sat_rte(v.s5);
+  r.s6 = convert_short_sat_rte(v.s6);
+  r.s7 = convert_short_sat_rte(v.s7);
+  return r;
+}
+
+/* uint8 -> short8: one scalar convert_short_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(uint8 v) {
+  short8 r;
+  r.s0 = convert_short_sat_rtz(v.s0);
+  r.s1 = convert_short_sat_rtz(v.s1);
+  r.s2 = convert_short_sat_rtz(v.s2);
+  r.s3 = convert_short_sat_rtz(v.s3);
+  r.s4 = convert_short_sat_rtz(v.s4);
+  r.s5 = convert_short_sat_rtz(v.s5);
+  r.s6 = convert_short_sat_rtz(v.s6);
+  r.s7 = convert_short_sat_rtz(v.s7);
+  return r;
+}
+
+/* uint8 -> short8: one scalar convert_short_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(uint8 v) {
+  short8 r;
+  r.s0 = convert_short_sat_rtp(v.s0);
+  r.s1 = convert_short_sat_rtp(v.s1);
+  r.s2 = convert_short_sat_rtp(v.s2);
+  r.s3 = convert_short_sat_rtp(v.s3);
+  r.s4 = convert_short_sat_rtp(v.s4);
+  r.s5 = convert_short_sat_rtp(v.s5);
+  r.s6 = convert_short_sat_rtp(v.s6);
+  r.s7 = convert_short_sat_rtp(v.s7);
+  return r;
+}
+
+/* uint8 -> short8: one scalar convert_short_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(uint8 v) {
+  short8 r;
+  r.s0 = convert_short_sat_rtn(v.s0);
+  r.s1 = convert_short_sat_rtn(v.s1);
+  r.s2 = convert_short_sat_rtn(v.s2);
+  r.s3 = convert_short_sat_rtn(v.s3);
+  r.s4 = convert_short_sat_rtn(v.s4);
+  r.s5 = convert_short_sat_rtn(v.s5);
+  r.s6 = convert_short_sat_rtn(v.s6);
+  r.s7 = convert_short_sat_rtn(v.s7);
+  return r;
+}
+
+/* uint8 -> ushort8: one scalar convert_ushort_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(uint8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rte(v.s0);
+  r.s1 = convert_ushort_sat_rte(v.s1);
+  r.s2 = convert_ushort_sat_rte(v.s2);
+  r.s3 = convert_ushort_sat_rte(v.s3);
+  r.s4 = convert_ushort_sat_rte(v.s4);
+  r.s5 = convert_ushort_sat_rte(v.s5);
+  r.s6 = convert_ushort_sat_rte(v.s6);
+  r.s7 = convert_ushort_sat_rte(v.s7);
+  return r;
+}
+
+/* uint8 -> ushort8: one scalar convert_ushort_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(uint8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rtz(v.s0);
+  r.s1 = convert_ushort_sat_rtz(v.s1);
+  r.s2 = convert_ushort_sat_rtz(v.s2);
+  r.s3 = convert_ushort_sat_rtz(v.s3);
+  r.s4 = convert_ushort_sat_rtz(v.s4);
+  r.s5 = convert_ushort_sat_rtz(v.s5);
+  r.s6 = convert_ushort_sat_rtz(v.s6);
+  r.s7 = convert_ushort_sat_rtz(v.s7);
+  return r;
+}
+
+/* uint8 -> ushort8: one scalar convert_ushort_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(uint8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rtp(v.s0);
+  r.s1 = convert_ushort_sat_rtp(v.s1);
+  r.s2 = convert_ushort_sat_rtp(v.s2);
+  r.s3 = convert_ushort_sat_rtp(v.s3);
+  r.s4 = convert_ushort_sat_rtp(v.s4);
+  r.s5 = convert_ushort_sat_rtp(v.s5);
+  r.s6 = convert_ushort_sat_rtp(v.s6);
+  r.s7 = convert_ushort_sat_rtp(v.s7);
+  return r;
+}
+
+/* uint8 -> ushort8: one scalar convert_ushort_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(uint8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rtn(v.s0);
+  r.s1 = convert_ushort_sat_rtn(v.s1);
+  r.s2 = convert_ushort_sat_rtn(v.s2);
+  r.s3 = convert_ushort_sat_rtn(v.s3);
+  r.s4 = convert_ushort_sat_rtn(v.s4);
+  r.s5 = convert_ushort_sat_rtn(v.s5);
+  r.s6 = convert_ushort_sat_rtn(v.s6);
+  r.s7 = convert_ushort_sat_rtn(v.s7);
+  return r;
+}
+
+/* uint8 -> char8: one scalar convert_char_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(uint8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rte(v.s0);
+  r.s1 = convert_char_sat_rte(v.s1);
+  r.s2 = convert_char_sat_rte(v.s2);
+  r.s3 = convert_char_sat_rte(v.s3);
+  r.s4 = convert_char_sat_rte(v.s4);
+  r.s5 = convert_char_sat_rte(v.s5);
+  r.s6 = convert_char_sat_rte(v.s6);
+  r.s7 = convert_char_sat_rte(v.s7);
+  return r;
+}
+
+/* uint8 -> char8: one scalar convert_char_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(uint8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rtz(v.s0);
+  r.s1 = convert_char_sat_rtz(v.s1);
+  r.s2 = convert_char_sat_rtz(v.s2);
+  r.s3 = convert_char_sat_rtz(v.s3);
+  r.s4 = convert_char_sat_rtz(v.s4);
+  r.s5 = convert_char_sat_rtz(v.s5);
+  r.s6 = convert_char_sat_rtz(v.s6);
+  r.s7 = convert_char_sat_rtz(v.s7);
+  return r;
+}
+
+/* uint8 -> char8: one scalar convert_char_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(uint8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rtp(v.s0);
+  r.s1 = convert_char_sat_rtp(v.s1);
+  r.s2 = convert_char_sat_rtp(v.s2);
+  r.s3 = convert_char_sat_rtp(v.s3);
+  r.s4 = convert_char_sat_rtp(v.s4);
+  r.s5 = convert_char_sat_rtp(v.s5);
+  r.s6 = convert_char_sat_rtp(v.s6);
+  r.s7 = convert_char_sat_rtp(v.s7);
+  return r;
+}
+
+/* uint8 -> char8: one scalar convert_char_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(uint8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rtn(v.s0);
+  r.s1 = convert_char_sat_rtn(v.s1);
+  r.s2 = convert_char_sat_rtn(v.s2);
+  r.s3 = convert_char_sat_rtn(v.s3);
+  r.s4 = convert_char_sat_rtn(v.s4);
+  r.s5 = convert_char_sat_rtn(v.s5);
+  r.s6 = convert_char_sat_rtn(v.s6);
+  r.s7 = convert_char_sat_rtn(v.s7);
+  return r;
+}
+
+/* uint8 -> uchar8: one scalar convert_uchar_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(uint8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rte(v.s0);
+  r.s1 = convert_uchar_sat_rte(v.s1);
+  r.s2 = convert_uchar_sat_rte(v.s2);
+  r.s3 = convert_uchar_sat_rte(v.s3);
+  r.s4 = convert_uchar_sat_rte(v.s4);
+  r.s5 = convert_uchar_sat_rte(v.s5);
+  r.s6 = convert_uchar_sat_rte(v.s6);
+  r.s7 = convert_uchar_sat_rte(v.s7);
+  return r;
+}
+
+/* uint8 -> uchar8: one scalar convert_uchar_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(uint8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rtz(v.s0);
+  r.s1 = convert_uchar_sat_rtz(v.s1);
+  r.s2 = convert_uchar_sat_rtz(v.s2);
+  r.s3 = convert_uchar_sat_rtz(v.s3);
+  r.s4 = convert_uchar_sat_rtz(v.s4);
+  r.s5 = convert_uchar_sat_rtz(v.s5);
+  r.s6 = convert_uchar_sat_rtz(v.s6);
+  r.s7 = convert_uchar_sat_rtz(v.s7);
+  return r;
+}
+
+/* uint8 -> uchar8: one scalar convert_uchar_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(uint8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rtp(v.s0);
+  r.s1 = convert_uchar_sat_rtp(v.s1);
+  r.s2 = convert_uchar_sat_rtp(v.s2);
+  r.s3 = convert_uchar_sat_rtp(v.s3);
+  r.s4 = convert_uchar_sat_rtp(v.s4);
+  r.s5 = convert_uchar_sat_rtp(v.s5);
+  r.s6 = convert_uchar_sat_rtp(v.s6);
+  r.s7 = convert_uchar_sat_rtp(v.s7);
+  return r;
+}
+
+/* uint8 -> uchar8: one scalar convert_uchar_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(uint8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rtn(v.s0);
+  r.s1 = convert_uchar_sat_rtn(v.s1);
+  r.s2 = convert_uchar_sat_rtn(v.s2);
+  r.s3 = convert_uchar_sat_rtn(v.s3);
+  r.s4 = convert_uchar_sat_rtn(v.s4);
+  r.s5 = convert_uchar_sat_rtn(v.s5);
+  r.s6 = convert_uchar_sat_rtn(v.s6);
+  r.s7 = convert_uchar_sat_rtn(v.s7);
+  return r;
+}
+
+/* short8 -> long8: one scalar convert_long_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(short8 v) {
+  long8 r;
+  r.s0 = convert_long_sat_rte(v.s0);
+  r.s1 = convert_long_sat_rte(v.s1);
+  r.s2 = convert_long_sat_rte(v.s2);
+  r.s3 = convert_long_sat_rte(v.s3);
+  r.s4 = convert_long_sat_rte(v.s4);
+  r.s5 = convert_long_sat_rte(v.s5);
+  r.s6 = convert_long_sat_rte(v.s6);
+  r.s7 = convert_long_sat_rte(v.s7);
+  return r;
+}
+
+/* short8 -> long8: one scalar convert_long_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(short8 v) {
+  long8 r;
+  r.s0 = convert_long_sat_rtz(v.s0);
+  r.s1 = convert_long_sat_rtz(v.s1);
+  r.s2 = convert_long_sat_rtz(v.s2);
+  r.s3 = convert_long_sat_rtz(v.s3);
+  r.s4 = convert_long_sat_rtz(v.s4);
+  r.s5 = convert_long_sat_rtz(v.s5);
+  r.s6 = convert_long_sat_rtz(v.s6);
+  r.s7 = convert_long_sat_rtz(v.s7);
+  return r;
+}
+
+/* short8 -> long8: one scalar convert_long_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(short8 v) {
+  long8 r;
+  r.s0 = convert_long_sat_rtp(v.s0);
+  r.s1 = convert_long_sat_rtp(v.s1);
+  r.s2 = convert_long_sat_rtp(v.s2);
+  r.s3 = convert_long_sat_rtp(v.s3);
+  r.s4 = convert_long_sat_rtp(v.s4);
+  r.s5 = convert_long_sat_rtp(v.s5);
+  r.s6 = convert_long_sat_rtp(v.s6);
+  r.s7 = convert_long_sat_rtp(v.s7);
+  return r;
+}
+
+/* short8 -> long8: one scalar convert_long_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(short8 v) {
+  long8 r;
+  r.s0 = convert_long_sat_rtn(v.s0);
+  r.s1 = convert_long_sat_rtn(v.s1);
+  r.s2 = convert_long_sat_rtn(v.s2);
+  r.s3 = convert_long_sat_rtn(v.s3);
+  r.s4 = convert_long_sat_rtn(v.s4);
+  r.s5 = convert_long_sat_rtn(v.s5);
+  r.s6 = convert_long_sat_rtn(v.s6);
+  r.s7 = convert_long_sat_rtn(v.s7);
+  return r;
+}
+
+/* short8 -> ulong8: one scalar convert_ulong_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(short8 v) {
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rte(v.s0);
+  r.s1 = convert_ulong_sat_rte(v.s1);
+  r.s2 = convert_ulong_sat_rte(v.s2);
+  r.s3 = convert_ulong_sat_rte(v.s3);
+  r.s4 = convert_ulong_sat_rte(v.s4);
+  r.s5 = convert_ulong_sat_rte(v.s5);
+  r.s6 = convert_ulong_sat_rte(v.s6);
+  r.s7 = convert_ulong_sat_rte(v.s7);
+  return r;
+}
+
+/* short8 -> ulong8: one scalar convert_ulong_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(short8 v) {
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rtz(v.s0);
+  r.s1 = convert_ulong_sat_rtz(v.s1);
+  r.s2 = convert_ulong_sat_rtz(v.s2);
+  r.s3 = convert_ulong_sat_rtz(v.s3);
+  r.s4 = convert_ulong_sat_rtz(v.s4);
+  r.s5 = convert_ulong_sat_rtz(v.s5);
+  r.s6 = convert_ulong_sat_rtz(v.s6);
+  r.s7 = convert_ulong_sat_rtz(v.s7);
+  return r;
+}
+
+/* short8 -> ulong8: one scalar convert_ulong_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(short8 v) {
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rtp(v.s0);
+  r.s1 = convert_ulong_sat_rtp(v.s1);
+  r.s2 = convert_ulong_sat_rtp(v.s2);
+  r.s3 = convert_ulong_sat_rtp(v.s3);
+  r.s4 = convert_ulong_sat_rtp(v.s4);
+  r.s5 = convert_ulong_sat_rtp(v.s5);
+  r.s6 = convert_ulong_sat_rtp(v.s6);
+  r.s7 = convert_ulong_sat_rtp(v.s7);
+  return r;
+}
+
+/* short8 -> ulong8: one scalar convert_ulong_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(short8 v) {
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rtn(v.s0);
+  r.s1 = convert_ulong_sat_rtn(v.s1);
+  r.s2 = convert_ulong_sat_rtn(v.s2);
+  r.s3 = convert_ulong_sat_rtn(v.s3);
+  r.s4 = convert_ulong_sat_rtn(v.s4);
+  r.s5 = convert_ulong_sat_rtn(v.s5);
+  r.s6 = convert_ulong_sat_rtn(v.s6);
+  r.s7 = convert_ulong_sat_rtn(v.s7);
+  return r;
+}
+
+/* short8 -> int8: one scalar convert_int_sat_rte call per lane (s0..s7). */
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(short8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rte(v.s0);
+  r.s1 = convert_int_sat_rte(v.s1);
+  r.s2 = convert_int_sat_rte(v.s2);
+  r.s3 = convert_int_sat_rte(v.s3);
+  r.s4 = convert_int_sat_rte(v.s4);
+  r.s5 = convert_int_sat_rte(v.s5);
+  r.s6 = convert_int_sat_rte(v.s6);
+  r.s7 = convert_int_sat_rte(v.s7);
+  return r;
+}
+
+/* short8 -> int8: one scalar convert_int_sat_rtz call per lane (s0..s7). */
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(short8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rtz(v.s0);
+  r.s1 = convert_int_sat_rtz(v.s1);
+  r.s2 = convert_int_sat_rtz(v.s2);
+  r.s3 = convert_int_sat_rtz(v.s3);
+  r.s4 = convert_int_sat_rtz(v.s4);
+  r.s5 = convert_int_sat_rtz(v.s5);
+  r.s6 = convert_int_sat_rtz(v.s6);
+  r.s7 = convert_int_sat_rtz(v.s7);
+  return r;
+}
+
+/* short8 -> int8: one scalar convert_int_sat_rtp call per lane (s0..s7). */
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(short8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rtp(v.s0);
+  r.s1 = convert_int_sat_rtp(v.s1);
+  r.s2 = convert_int_sat_rtp(v.s2);
+  r.s3 = convert_int_sat_rtp(v.s3);
+  r.s4 = convert_int_sat_rtp(v.s4);
+  r.s5 = convert_int_sat_rtp(v.s5);
+  r.s6 = convert_int_sat_rtp(v.s6);
+  r.s7 = convert_int_sat_rtp(v.s7);
+  return r;
+}
+
+/* short8 -> int8: one scalar convert_int_sat_rtn call per lane (s0..s7). */
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(short8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rtn(v.s0);
+  r.s1 = convert_int_sat_rtn(v.s1);
+  r.s2 = convert_int_sat_rtn(v.s2);
+  r.s3 = convert_int_sat_rtn(v.s3);
+  r.s4 = convert_int_sat_rtn(v.s4);
+  r.s5 = convert_int_sat_rtn(v.s5);
+  r.s6 = convert_int_sat_rtn(v.s6);
+  r.s7 = convert_int_sat_rtn(v.s7);
+  return r;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(short8 v) { /* short8 -> uint8: feeds each of v's eight components through convert_uint_sat_rte */
+  uint8 out; out.s0 = convert_uint_sat_rte(v.s0); out.s1 = convert_uint_sat_rte(v.s1); out.s2 = convert_uint_sat_rte(v.s2); out.s3 = convert_uint_sat_rte(v.s3); out.s4 = convert_uint_sat_rte(v.s4); out.s5 = convert_uint_sat_rte(v.s5); out.s6 = convert_uint_sat_rte(v.s6); out.s7 = convert_uint_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(short8 v) { /* short8 -> uint8: feeds each of v's eight components through convert_uint_sat_rtz */
+  uint8 out; out.s0 = convert_uint_sat_rtz(v.s0); out.s1 = convert_uint_sat_rtz(v.s1); out.s2 = convert_uint_sat_rtz(v.s2); out.s3 = convert_uint_sat_rtz(v.s3); out.s4 = convert_uint_sat_rtz(v.s4); out.s5 = convert_uint_sat_rtz(v.s5); out.s6 = convert_uint_sat_rtz(v.s6); out.s7 = convert_uint_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(short8 v) { /* short8 -> uint8: feeds each of v's eight components through convert_uint_sat_rtp */
+  uint8 out; out.s0 = convert_uint_sat_rtp(v.s0); out.s1 = convert_uint_sat_rtp(v.s1); out.s2 = convert_uint_sat_rtp(v.s2); out.s3 = convert_uint_sat_rtp(v.s3); out.s4 = convert_uint_sat_rtp(v.s4); out.s5 = convert_uint_sat_rtp(v.s5); out.s6 = convert_uint_sat_rtp(v.s6); out.s7 = convert_uint_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(short8 v) { /* short8 -> uint8: feeds each of v's eight components through convert_uint_sat_rtn */
+  uint8 out; out.s0 = convert_uint_sat_rtn(v.s0); out.s1 = convert_uint_sat_rtn(v.s1); out.s2 = convert_uint_sat_rtn(v.s2); out.s3 = convert_uint_sat_rtn(v.s3); out.s4 = convert_uint_sat_rtn(v.s4); out.s5 = convert_uint_sat_rtn(v.s5); out.s6 = convert_uint_sat_rtn(v.s6); out.s7 = convert_uint_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(short8 v) { /* short8 -> short8: feeds each of v's eight components through convert_short_sat_rte */
+  short8 out; out.s0 = convert_short_sat_rte(v.s0); out.s1 = convert_short_sat_rte(v.s1); out.s2 = convert_short_sat_rte(v.s2); out.s3 = convert_short_sat_rte(v.s3); out.s4 = convert_short_sat_rte(v.s4); out.s5 = convert_short_sat_rte(v.s5); out.s6 = convert_short_sat_rte(v.s6); out.s7 = convert_short_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(short8 v) { /* short8 -> short8: feeds each of v's eight components through convert_short_sat_rtz */
+  short8 out; out.s0 = convert_short_sat_rtz(v.s0); out.s1 = convert_short_sat_rtz(v.s1); out.s2 = convert_short_sat_rtz(v.s2); out.s3 = convert_short_sat_rtz(v.s3); out.s4 = convert_short_sat_rtz(v.s4); out.s5 = convert_short_sat_rtz(v.s5); out.s6 = convert_short_sat_rtz(v.s6); out.s7 = convert_short_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(short8 v) { /* short8 -> short8: feeds each of v's eight components through convert_short_sat_rtp */
+  short8 out; out.s0 = convert_short_sat_rtp(v.s0); out.s1 = convert_short_sat_rtp(v.s1); out.s2 = convert_short_sat_rtp(v.s2); out.s3 = convert_short_sat_rtp(v.s3); out.s4 = convert_short_sat_rtp(v.s4); out.s5 = convert_short_sat_rtp(v.s5); out.s6 = convert_short_sat_rtp(v.s6); out.s7 = convert_short_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(short8 v) { /* short8 -> short8: feeds each of v's eight components through convert_short_sat_rtn */
+  short8 out; out.s0 = convert_short_sat_rtn(v.s0); out.s1 = convert_short_sat_rtn(v.s1); out.s2 = convert_short_sat_rtn(v.s2); out.s3 = convert_short_sat_rtn(v.s3); out.s4 = convert_short_sat_rtn(v.s4); out.s5 = convert_short_sat_rtn(v.s5); out.s6 = convert_short_sat_rtn(v.s6); out.s7 = convert_short_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(short8 v) { /* short8 -> ushort8: feeds each of v's eight components through convert_ushort_sat_rte */
+  ushort8 out; out.s0 = convert_ushort_sat_rte(v.s0); out.s1 = convert_ushort_sat_rte(v.s1); out.s2 = convert_ushort_sat_rte(v.s2); out.s3 = convert_ushort_sat_rte(v.s3); out.s4 = convert_ushort_sat_rte(v.s4); out.s5 = convert_ushort_sat_rte(v.s5); out.s6 = convert_ushort_sat_rte(v.s6); out.s7 = convert_ushort_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(short8 v) { /* short8 -> ushort8: feeds each of v's eight components through convert_ushort_sat_rtz */
+  ushort8 out; out.s0 = convert_ushort_sat_rtz(v.s0); out.s1 = convert_ushort_sat_rtz(v.s1); out.s2 = convert_ushort_sat_rtz(v.s2); out.s3 = convert_ushort_sat_rtz(v.s3); out.s4 = convert_ushort_sat_rtz(v.s4); out.s5 = convert_ushort_sat_rtz(v.s5); out.s6 = convert_ushort_sat_rtz(v.s6); out.s7 = convert_ushort_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(short8 v) { /* short8 -> ushort8: feeds each of v's eight components through convert_ushort_sat_rtp */
+  ushort8 out; out.s0 = convert_ushort_sat_rtp(v.s0); out.s1 = convert_ushort_sat_rtp(v.s1); out.s2 = convert_ushort_sat_rtp(v.s2); out.s3 = convert_ushort_sat_rtp(v.s3); out.s4 = convert_ushort_sat_rtp(v.s4); out.s5 = convert_ushort_sat_rtp(v.s5); out.s6 = convert_ushort_sat_rtp(v.s6); out.s7 = convert_ushort_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(short8 v) { /* short8 -> ushort8: feeds each of v's eight components through convert_ushort_sat_rtn */
+  ushort8 out; out.s0 = convert_ushort_sat_rtn(v.s0); out.s1 = convert_ushort_sat_rtn(v.s1); out.s2 = convert_ushort_sat_rtn(v.s2); out.s3 = convert_ushort_sat_rtn(v.s3); out.s4 = convert_ushort_sat_rtn(v.s4); out.s5 = convert_ushort_sat_rtn(v.s5); out.s6 = convert_ushort_sat_rtn(v.s6); out.s7 = convert_ushort_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(short8 v) { /* short8 -> char8: feeds each of v's eight components through convert_char_sat_rte */
+  char8 out; out.s0 = convert_char_sat_rte(v.s0); out.s1 = convert_char_sat_rte(v.s1); out.s2 = convert_char_sat_rte(v.s2); out.s3 = convert_char_sat_rte(v.s3); out.s4 = convert_char_sat_rte(v.s4); out.s5 = convert_char_sat_rte(v.s5); out.s6 = convert_char_sat_rte(v.s6); out.s7 = convert_char_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(short8 v) { /* short8 -> char8: feeds each of v's eight components through convert_char_sat_rtz */
+  char8 out; out.s0 = convert_char_sat_rtz(v.s0); out.s1 = convert_char_sat_rtz(v.s1); out.s2 = convert_char_sat_rtz(v.s2); out.s3 = convert_char_sat_rtz(v.s3); out.s4 = convert_char_sat_rtz(v.s4); out.s5 = convert_char_sat_rtz(v.s5); out.s6 = convert_char_sat_rtz(v.s6); out.s7 = convert_char_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(short8 v) { /* short8 -> char8: feeds each of v's eight components through convert_char_sat_rtp */
+  char8 out; out.s0 = convert_char_sat_rtp(v.s0); out.s1 = convert_char_sat_rtp(v.s1); out.s2 = convert_char_sat_rtp(v.s2); out.s3 = convert_char_sat_rtp(v.s3); out.s4 = convert_char_sat_rtp(v.s4); out.s5 = convert_char_sat_rtp(v.s5); out.s6 = convert_char_sat_rtp(v.s6); out.s7 = convert_char_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(short8 v) { /* short8 -> char8: feeds each of v's eight components through convert_char_sat_rtn */
+  char8 out; out.s0 = convert_char_sat_rtn(v.s0); out.s1 = convert_char_sat_rtn(v.s1); out.s2 = convert_char_sat_rtn(v.s2); out.s3 = convert_char_sat_rtn(v.s3); out.s4 = convert_char_sat_rtn(v.s4); out.s5 = convert_char_sat_rtn(v.s5); out.s6 = convert_char_sat_rtn(v.s6); out.s7 = convert_char_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(short8 v) { /* short8 -> uchar8: feeds each of v's eight components through convert_uchar_sat_rte */
+  uchar8 out; out.s0 = convert_uchar_sat_rte(v.s0); out.s1 = convert_uchar_sat_rte(v.s1); out.s2 = convert_uchar_sat_rte(v.s2); out.s3 = convert_uchar_sat_rte(v.s3); out.s4 = convert_uchar_sat_rte(v.s4); out.s5 = convert_uchar_sat_rte(v.s5); out.s6 = convert_uchar_sat_rte(v.s6); out.s7 = convert_uchar_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(short8 v) { /* short8 -> uchar8: feeds each of v's eight components through convert_uchar_sat_rtz */
+  uchar8 out; out.s0 = convert_uchar_sat_rtz(v.s0); out.s1 = convert_uchar_sat_rtz(v.s1); out.s2 = convert_uchar_sat_rtz(v.s2); out.s3 = convert_uchar_sat_rtz(v.s3); out.s4 = convert_uchar_sat_rtz(v.s4); out.s5 = convert_uchar_sat_rtz(v.s5); out.s6 = convert_uchar_sat_rtz(v.s6); out.s7 = convert_uchar_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(short8 v) { /* short8 -> uchar8: feeds each of v's eight components through convert_uchar_sat_rtp */
+  uchar8 out; out.s0 = convert_uchar_sat_rtp(v.s0); out.s1 = convert_uchar_sat_rtp(v.s1); out.s2 = convert_uchar_sat_rtp(v.s2); out.s3 = convert_uchar_sat_rtp(v.s3); out.s4 = convert_uchar_sat_rtp(v.s4); out.s5 = convert_uchar_sat_rtp(v.s5); out.s6 = convert_uchar_sat_rtp(v.s6); out.s7 = convert_uchar_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(short8 v) { /* short8 -> uchar8: feeds each of v's eight components through convert_uchar_sat_rtn */
+  uchar8 out; out.s0 = convert_uchar_sat_rtn(v.s0); out.s1 = convert_uchar_sat_rtn(v.s1); out.s2 = convert_uchar_sat_rtn(v.s2); out.s3 = convert_uchar_sat_rtn(v.s3); out.s4 = convert_uchar_sat_rtn(v.s4); out.s5 = convert_uchar_sat_rtn(v.s5); out.s6 = convert_uchar_sat_rtn(v.s6); out.s7 = convert_uchar_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(ushort8 v) { /* ushort8 -> long8: feeds each of v's eight components through convert_long_sat_rte */
+  long8 out; out.s0 = convert_long_sat_rte(v.s0); out.s1 = convert_long_sat_rte(v.s1); out.s2 = convert_long_sat_rte(v.s2); out.s3 = convert_long_sat_rte(v.s3); out.s4 = convert_long_sat_rte(v.s4); out.s5 = convert_long_sat_rte(v.s5); out.s6 = convert_long_sat_rte(v.s6); out.s7 = convert_long_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(ushort8 v) { /* ushort8 -> long8: feeds each of v's eight components through convert_long_sat_rtz */
+  long8 out; out.s0 = convert_long_sat_rtz(v.s0); out.s1 = convert_long_sat_rtz(v.s1); out.s2 = convert_long_sat_rtz(v.s2); out.s3 = convert_long_sat_rtz(v.s3); out.s4 = convert_long_sat_rtz(v.s4); out.s5 = convert_long_sat_rtz(v.s5); out.s6 = convert_long_sat_rtz(v.s6); out.s7 = convert_long_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(ushort8 v) { /* ushort8 -> long8: feeds each of v's eight components through convert_long_sat_rtp */
+  long8 out; out.s0 = convert_long_sat_rtp(v.s0); out.s1 = convert_long_sat_rtp(v.s1); out.s2 = convert_long_sat_rtp(v.s2); out.s3 = convert_long_sat_rtp(v.s3); out.s4 = convert_long_sat_rtp(v.s4); out.s5 = convert_long_sat_rtp(v.s5); out.s6 = convert_long_sat_rtp(v.s6); out.s7 = convert_long_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(ushort8 v) { /* ushort8 -> long8: feeds each of v's eight components through convert_long_sat_rtn */
+  long8 out; out.s0 = convert_long_sat_rtn(v.s0); out.s1 = convert_long_sat_rtn(v.s1); out.s2 = convert_long_sat_rtn(v.s2); out.s3 = convert_long_sat_rtn(v.s3); out.s4 = convert_long_sat_rtn(v.s4); out.s5 = convert_long_sat_rtn(v.s5); out.s6 = convert_long_sat_rtn(v.s6); out.s7 = convert_long_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(ushort8 v) { /* ushort8 -> ulong8: feeds each of v's eight components through convert_ulong_sat_rte */
+  ulong8 out; out.s0 = convert_ulong_sat_rte(v.s0); out.s1 = convert_ulong_sat_rte(v.s1); out.s2 = convert_ulong_sat_rte(v.s2); out.s3 = convert_ulong_sat_rte(v.s3); out.s4 = convert_ulong_sat_rte(v.s4); out.s5 = convert_ulong_sat_rte(v.s5); out.s6 = convert_ulong_sat_rte(v.s6); out.s7 = convert_ulong_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(ushort8 v) { /* ushort8 -> ulong8: feeds each of v's eight components through convert_ulong_sat_rtz */
+  ulong8 out; out.s0 = convert_ulong_sat_rtz(v.s0); out.s1 = convert_ulong_sat_rtz(v.s1); out.s2 = convert_ulong_sat_rtz(v.s2); out.s3 = convert_ulong_sat_rtz(v.s3); out.s4 = convert_ulong_sat_rtz(v.s4); out.s5 = convert_ulong_sat_rtz(v.s5); out.s6 = convert_ulong_sat_rtz(v.s6); out.s7 = convert_ulong_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(ushort8 v) { /* ushort8 -> ulong8: feeds each of v's eight components through convert_ulong_sat_rtp */
+  ulong8 out; out.s0 = convert_ulong_sat_rtp(v.s0); out.s1 = convert_ulong_sat_rtp(v.s1); out.s2 = convert_ulong_sat_rtp(v.s2); out.s3 = convert_ulong_sat_rtp(v.s3); out.s4 = convert_ulong_sat_rtp(v.s4); out.s5 = convert_ulong_sat_rtp(v.s5); out.s6 = convert_ulong_sat_rtp(v.s6); out.s7 = convert_ulong_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(ushort8 v) { /* ushort8 -> ulong8: feeds each of v's eight components through convert_ulong_sat_rtn */
+  ulong8 out; out.s0 = convert_ulong_sat_rtn(v.s0); out.s1 = convert_ulong_sat_rtn(v.s1); out.s2 = convert_ulong_sat_rtn(v.s2); out.s3 = convert_ulong_sat_rtn(v.s3); out.s4 = convert_ulong_sat_rtn(v.s4); out.s5 = convert_ulong_sat_rtn(v.s5); out.s6 = convert_ulong_sat_rtn(v.s6); out.s7 = convert_ulong_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(ushort8 v) { /* ushort8 -> int8: feeds each of v's eight components through convert_int_sat_rte */
+  int8 out; out.s0 = convert_int_sat_rte(v.s0); out.s1 = convert_int_sat_rte(v.s1); out.s2 = convert_int_sat_rte(v.s2); out.s3 = convert_int_sat_rte(v.s3); out.s4 = convert_int_sat_rte(v.s4); out.s5 = convert_int_sat_rte(v.s5); out.s6 = convert_int_sat_rte(v.s6); out.s7 = convert_int_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(ushort8 v) { /* ushort8 -> int8: feeds each of v's eight components through convert_int_sat_rtz */
+  int8 out; out.s0 = convert_int_sat_rtz(v.s0); out.s1 = convert_int_sat_rtz(v.s1); out.s2 = convert_int_sat_rtz(v.s2); out.s3 = convert_int_sat_rtz(v.s3); out.s4 = convert_int_sat_rtz(v.s4); out.s5 = convert_int_sat_rtz(v.s5); out.s6 = convert_int_sat_rtz(v.s6); out.s7 = convert_int_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(ushort8 v) { /* ushort8 -> int8: feeds each of v's eight components through convert_int_sat_rtp */
+  int8 out; out.s0 = convert_int_sat_rtp(v.s0); out.s1 = convert_int_sat_rtp(v.s1); out.s2 = convert_int_sat_rtp(v.s2); out.s3 = convert_int_sat_rtp(v.s3); out.s4 = convert_int_sat_rtp(v.s4); out.s5 = convert_int_sat_rtp(v.s5); out.s6 = convert_int_sat_rtp(v.s6); out.s7 = convert_int_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(ushort8 v) { /* ushort8 -> int8: feeds each of v's eight components through convert_int_sat_rtn */
+  int8 out; out.s0 = convert_int_sat_rtn(v.s0); out.s1 = convert_int_sat_rtn(v.s1); out.s2 = convert_int_sat_rtn(v.s2); out.s3 = convert_int_sat_rtn(v.s3); out.s4 = convert_int_sat_rtn(v.s4); out.s5 = convert_int_sat_rtn(v.s5); out.s6 = convert_int_sat_rtn(v.s6); out.s7 = convert_int_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(ushort8 v) { /* ushort8 -> uint8: feeds each of v's eight components through convert_uint_sat_rte */
+  uint8 out; out.s0 = convert_uint_sat_rte(v.s0); out.s1 = convert_uint_sat_rte(v.s1); out.s2 = convert_uint_sat_rte(v.s2); out.s3 = convert_uint_sat_rte(v.s3); out.s4 = convert_uint_sat_rte(v.s4); out.s5 = convert_uint_sat_rte(v.s5); out.s6 = convert_uint_sat_rte(v.s6); out.s7 = convert_uint_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(ushort8 v) { /* ushort8 -> uint8: feeds each of v's eight components through convert_uint_sat_rtz */
+  uint8 out; out.s0 = convert_uint_sat_rtz(v.s0); out.s1 = convert_uint_sat_rtz(v.s1); out.s2 = convert_uint_sat_rtz(v.s2); out.s3 = convert_uint_sat_rtz(v.s3); out.s4 = convert_uint_sat_rtz(v.s4); out.s5 = convert_uint_sat_rtz(v.s5); out.s6 = convert_uint_sat_rtz(v.s6); out.s7 = convert_uint_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(ushort8 v) { /* ushort8 -> uint8: feeds each of v's eight components through convert_uint_sat_rtp */
+  uint8 out; out.s0 = convert_uint_sat_rtp(v.s0); out.s1 = convert_uint_sat_rtp(v.s1); out.s2 = convert_uint_sat_rtp(v.s2); out.s3 = convert_uint_sat_rtp(v.s3); out.s4 = convert_uint_sat_rtp(v.s4); out.s5 = convert_uint_sat_rtp(v.s5); out.s6 = convert_uint_sat_rtp(v.s6); out.s7 = convert_uint_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(ushort8 v) { /* ushort8 -> uint8: feeds each of v's eight components through convert_uint_sat_rtn */
+  uint8 out; out.s0 = convert_uint_sat_rtn(v.s0); out.s1 = convert_uint_sat_rtn(v.s1); out.s2 = convert_uint_sat_rtn(v.s2); out.s3 = convert_uint_sat_rtn(v.s3); out.s4 = convert_uint_sat_rtn(v.s4); out.s5 = convert_uint_sat_rtn(v.s5); out.s6 = convert_uint_sat_rtn(v.s6); out.s7 = convert_uint_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(ushort8 v) { /* ushort8 -> short8: feeds each of v's eight components through convert_short_sat_rte */
+  short8 out; out.s0 = convert_short_sat_rte(v.s0); out.s1 = convert_short_sat_rte(v.s1); out.s2 = convert_short_sat_rte(v.s2); out.s3 = convert_short_sat_rte(v.s3); out.s4 = convert_short_sat_rte(v.s4); out.s5 = convert_short_sat_rte(v.s5); out.s6 = convert_short_sat_rte(v.s6); out.s7 = convert_short_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(ushort8 v) { /* ushort8 -> short8: feeds each of v's eight components through convert_short_sat_rtz */
+  short8 out; out.s0 = convert_short_sat_rtz(v.s0); out.s1 = convert_short_sat_rtz(v.s1); out.s2 = convert_short_sat_rtz(v.s2); out.s3 = convert_short_sat_rtz(v.s3); out.s4 = convert_short_sat_rtz(v.s4); out.s5 = convert_short_sat_rtz(v.s5); out.s6 = convert_short_sat_rtz(v.s6); out.s7 = convert_short_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(ushort8 v) { /* ushort8 -> short8: feeds each of v's eight components through convert_short_sat_rtp */
+  short8 out; out.s0 = convert_short_sat_rtp(v.s0); out.s1 = convert_short_sat_rtp(v.s1); out.s2 = convert_short_sat_rtp(v.s2); out.s3 = convert_short_sat_rtp(v.s3); out.s4 = convert_short_sat_rtp(v.s4); out.s5 = convert_short_sat_rtp(v.s5); out.s6 = convert_short_sat_rtp(v.s6); out.s7 = convert_short_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(ushort8 v) { /* ushort8 -> short8: feeds each of v's eight components through convert_short_sat_rtn */
+  short8 out; out.s0 = convert_short_sat_rtn(v.s0); out.s1 = convert_short_sat_rtn(v.s1); out.s2 = convert_short_sat_rtn(v.s2); out.s3 = convert_short_sat_rtn(v.s3); out.s4 = convert_short_sat_rtn(v.s4); out.s5 = convert_short_sat_rtn(v.s5); out.s6 = convert_short_sat_rtn(v.s6); out.s7 = convert_short_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(ushort8 v) { /* ushort8 -> ushort8: feeds each of v's eight components through convert_ushort_sat_rte */
+  ushort8 out; out.s0 = convert_ushort_sat_rte(v.s0); out.s1 = convert_ushort_sat_rte(v.s1); out.s2 = convert_ushort_sat_rte(v.s2); out.s3 = convert_ushort_sat_rte(v.s3); out.s4 = convert_ushort_sat_rte(v.s4); out.s5 = convert_ushort_sat_rte(v.s5); out.s6 = convert_ushort_sat_rte(v.s6); out.s7 = convert_ushort_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(ushort8 v) { /* ushort8 -> ushort8: feeds each of v's eight components through convert_ushort_sat_rtz */
+  ushort8 out; out.s0 = convert_ushort_sat_rtz(v.s0); out.s1 = convert_ushort_sat_rtz(v.s1); out.s2 = convert_ushort_sat_rtz(v.s2); out.s3 = convert_ushort_sat_rtz(v.s3); out.s4 = convert_ushort_sat_rtz(v.s4); out.s5 = convert_ushort_sat_rtz(v.s5); out.s6 = convert_ushort_sat_rtz(v.s6); out.s7 = convert_ushort_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(ushort8 v) { /* ushort8 -> ushort8: feeds each of v's eight components through convert_ushort_sat_rtp */
+  ushort8 out; out.s0 = convert_ushort_sat_rtp(v.s0); out.s1 = convert_ushort_sat_rtp(v.s1); out.s2 = convert_ushort_sat_rtp(v.s2); out.s3 = convert_ushort_sat_rtp(v.s3); out.s4 = convert_ushort_sat_rtp(v.s4); out.s5 = convert_ushort_sat_rtp(v.s5); out.s6 = convert_ushort_sat_rtp(v.s6); out.s7 = convert_ushort_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(ushort8 v) { /* ushort8 -> ushort8: feeds each of v's eight components through convert_ushort_sat_rtn */
+  ushort8 out; out.s0 = convert_ushort_sat_rtn(v.s0); out.s1 = convert_ushort_sat_rtn(v.s1); out.s2 = convert_ushort_sat_rtn(v.s2); out.s3 = convert_ushort_sat_rtn(v.s3); out.s4 = convert_ushort_sat_rtn(v.s4); out.s5 = convert_ushort_sat_rtn(v.s5); out.s6 = convert_ushort_sat_rtn(v.s6); out.s7 = convert_ushort_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(ushort8 v) { /* ushort8 -> char8: feeds each of v's eight components through convert_char_sat_rte */
+  char8 out; out.s0 = convert_char_sat_rte(v.s0); out.s1 = convert_char_sat_rte(v.s1); out.s2 = convert_char_sat_rte(v.s2); out.s3 = convert_char_sat_rte(v.s3); out.s4 = convert_char_sat_rte(v.s4); out.s5 = convert_char_sat_rte(v.s5); out.s6 = convert_char_sat_rte(v.s6); out.s7 = convert_char_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(ushort8 v) { /* ushort8 -> char8: feeds each of v's eight components through convert_char_sat_rtz */
+  char8 out; out.s0 = convert_char_sat_rtz(v.s0); out.s1 = convert_char_sat_rtz(v.s1); out.s2 = convert_char_sat_rtz(v.s2); out.s3 = convert_char_sat_rtz(v.s3); out.s4 = convert_char_sat_rtz(v.s4); out.s5 = convert_char_sat_rtz(v.s5); out.s6 = convert_char_sat_rtz(v.s6); out.s7 = convert_char_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(ushort8 v) { /* ushort8 -> char8: feeds each of v's eight components through convert_char_sat_rtp */
+  char8 out; out.s0 = convert_char_sat_rtp(v.s0); out.s1 = convert_char_sat_rtp(v.s1); out.s2 = convert_char_sat_rtp(v.s2); out.s3 = convert_char_sat_rtp(v.s3); out.s4 = convert_char_sat_rtp(v.s4); out.s5 = convert_char_sat_rtp(v.s5); out.s6 = convert_char_sat_rtp(v.s6); out.s7 = convert_char_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(ushort8 v) { /* ushort8 -> char8: feeds each of v's eight components through convert_char_sat_rtn */
+  char8 out; out.s0 = convert_char_sat_rtn(v.s0); out.s1 = convert_char_sat_rtn(v.s1); out.s2 = convert_char_sat_rtn(v.s2); out.s3 = convert_char_sat_rtn(v.s3); out.s4 = convert_char_sat_rtn(v.s4); out.s5 = convert_char_sat_rtn(v.s5); out.s6 = convert_char_sat_rtn(v.s6); out.s7 = convert_char_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(ushort8 v) { /* ushort8 -> uchar8: feeds each of v's eight components through convert_uchar_sat_rte */
+  uchar8 out; out.s0 = convert_uchar_sat_rte(v.s0); out.s1 = convert_uchar_sat_rte(v.s1); out.s2 = convert_uchar_sat_rte(v.s2); out.s3 = convert_uchar_sat_rte(v.s3); out.s4 = convert_uchar_sat_rte(v.s4); out.s5 = convert_uchar_sat_rte(v.s5); out.s6 = convert_uchar_sat_rte(v.s6); out.s7 = convert_uchar_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(ushort8 v) { /* ushort8 -> uchar8: feeds each of v's eight components through convert_uchar_sat_rtz */
+  uchar8 out; out.s0 = convert_uchar_sat_rtz(v.s0); out.s1 = convert_uchar_sat_rtz(v.s1); out.s2 = convert_uchar_sat_rtz(v.s2); out.s3 = convert_uchar_sat_rtz(v.s3); out.s4 = convert_uchar_sat_rtz(v.s4); out.s5 = convert_uchar_sat_rtz(v.s5); out.s6 = convert_uchar_sat_rtz(v.s6); out.s7 = convert_uchar_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(ushort8 v) { /* ushort8 -> uchar8: feeds each of v's eight components through convert_uchar_sat_rtp */
+  uchar8 out; out.s0 = convert_uchar_sat_rtp(v.s0); out.s1 = convert_uchar_sat_rtp(v.s1); out.s2 = convert_uchar_sat_rtp(v.s2); out.s3 = convert_uchar_sat_rtp(v.s3); out.s4 = convert_uchar_sat_rtp(v.s4); out.s5 = convert_uchar_sat_rtp(v.s5); out.s6 = convert_uchar_sat_rtp(v.s6); out.s7 = convert_uchar_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(ushort8 v) { /* ushort8 -> uchar8: feeds each of v's eight components through convert_uchar_sat_rtn */
+  uchar8 out; out.s0 = convert_uchar_sat_rtn(v.s0); out.s1 = convert_uchar_sat_rtn(v.s1); out.s2 = convert_uchar_sat_rtn(v.s2); out.s3 = convert_uchar_sat_rtn(v.s3); out.s4 = convert_uchar_sat_rtn(v.s4); out.s5 = convert_uchar_sat_rtn(v.s5); out.s6 = convert_uchar_sat_rtn(v.s6); out.s7 = convert_uchar_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(char8 v) { /* char8 -> long8: feeds each of v's eight components through convert_long_sat_rte */
+  long8 out; out.s0 = convert_long_sat_rte(v.s0); out.s1 = convert_long_sat_rte(v.s1); out.s2 = convert_long_sat_rte(v.s2); out.s3 = convert_long_sat_rte(v.s3); out.s4 = convert_long_sat_rte(v.s4); out.s5 = convert_long_sat_rte(v.s5); out.s6 = convert_long_sat_rte(v.s6); out.s7 = convert_long_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(char8 v) { /* char8 -> long8: feeds each of v's eight components through convert_long_sat_rtz */
+  long8 out; out.s0 = convert_long_sat_rtz(v.s0); out.s1 = convert_long_sat_rtz(v.s1); out.s2 = convert_long_sat_rtz(v.s2); out.s3 = convert_long_sat_rtz(v.s3); out.s4 = convert_long_sat_rtz(v.s4); out.s5 = convert_long_sat_rtz(v.s5); out.s6 = convert_long_sat_rtz(v.s6); out.s7 = convert_long_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(char8 v) { /* char8 -> long8: feeds each of v's eight components through convert_long_sat_rtp */
+  long8 out; out.s0 = convert_long_sat_rtp(v.s0); out.s1 = convert_long_sat_rtp(v.s1); out.s2 = convert_long_sat_rtp(v.s2); out.s3 = convert_long_sat_rtp(v.s3); out.s4 = convert_long_sat_rtp(v.s4); out.s5 = convert_long_sat_rtp(v.s5); out.s6 = convert_long_sat_rtp(v.s6); out.s7 = convert_long_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(char8 v) { /* char8 -> long8: feeds each of v's eight components through convert_long_sat_rtn */
+  long8 out; out.s0 = convert_long_sat_rtn(v.s0); out.s1 = convert_long_sat_rtn(v.s1); out.s2 = convert_long_sat_rtn(v.s2); out.s3 = convert_long_sat_rtn(v.s3); out.s4 = convert_long_sat_rtn(v.s4); out.s5 = convert_long_sat_rtn(v.s5); out.s6 = convert_long_sat_rtn(v.s6); out.s7 = convert_long_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(char8 v) { /* char8 -> ulong8: feeds each of v's eight components through convert_ulong_sat_rte */
+  ulong8 out; out.s0 = convert_ulong_sat_rte(v.s0); out.s1 = convert_ulong_sat_rte(v.s1); out.s2 = convert_ulong_sat_rte(v.s2); out.s3 = convert_ulong_sat_rte(v.s3); out.s4 = convert_ulong_sat_rte(v.s4); out.s5 = convert_ulong_sat_rte(v.s5); out.s6 = convert_ulong_sat_rte(v.s6); out.s7 = convert_ulong_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(char8 v) { /* char8 -> ulong8: feeds each of v's eight components through convert_ulong_sat_rtz */
+  ulong8 out; out.s0 = convert_ulong_sat_rtz(v.s0); out.s1 = convert_ulong_sat_rtz(v.s1); out.s2 = convert_ulong_sat_rtz(v.s2); out.s3 = convert_ulong_sat_rtz(v.s3); out.s4 = convert_ulong_sat_rtz(v.s4); out.s5 = convert_ulong_sat_rtz(v.s5); out.s6 = convert_ulong_sat_rtz(v.s6); out.s7 = convert_ulong_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(char8 v) { /* char8 -> ulong8: feeds each of v's eight components through convert_ulong_sat_rtp */
+  ulong8 out; out.s0 = convert_ulong_sat_rtp(v.s0); out.s1 = convert_ulong_sat_rtp(v.s1); out.s2 = convert_ulong_sat_rtp(v.s2); out.s3 = convert_ulong_sat_rtp(v.s3); out.s4 = convert_ulong_sat_rtp(v.s4); out.s5 = convert_ulong_sat_rtp(v.s5); out.s6 = convert_ulong_sat_rtp(v.s6); out.s7 = convert_ulong_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(char8 v) { /* char8 -> ulong8: feeds each of v's eight components through convert_ulong_sat_rtn */
+  ulong8 out; out.s0 = convert_ulong_sat_rtn(v.s0); out.s1 = convert_ulong_sat_rtn(v.s1); out.s2 = convert_ulong_sat_rtn(v.s2); out.s3 = convert_ulong_sat_rtn(v.s3); out.s4 = convert_ulong_sat_rtn(v.s4); out.s5 = convert_ulong_sat_rtn(v.s5); out.s6 = convert_ulong_sat_rtn(v.s6); out.s7 = convert_ulong_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(char8 v) { /* char8 -> int8: feeds each of v's eight components through convert_int_sat_rte */
+  int8 out; out.s0 = convert_int_sat_rte(v.s0); out.s1 = convert_int_sat_rte(v.s1); out.s2 = convert_int_sat_rte(v.s2); out.s3 = convert_int_sat_rte(v.s3); out.s4 = convert_int_sat_rte(v.s4); out.s5 = convert_int_sat_rte(v.s5); out.s6 = convert_int_sat_rte(v.s6); out.s7 = convert_int_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(char8 v) { /* char8 -> int8: feeds each of v's eight components through convert_int_sat_rtz */
+  int8 out; out.s0 = convert_int_sat_rtz(v.s0); out.s1 = convert_int_sat_rtz(v.s1); out.s2 = convert_int_sat_rtz(v.s2); out.s3 = convert_int_sat_rtz(v.s3); out.s4 = convert_int_sat_rtz(v.s4); out.s5 = convert_int_sat_rtz(v.s5); out.s6 = convert_int_sat_rtz(v.s6); out.s7 = convert_int_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(char8 v) { /* char8 -> int8: feeds each of v's eight components through convert_int_sat_rtp */
+  int8 out; out.s0 = convert_int_sat_rtp(v.s0); out.s1 = convert_int_sat_rtp(v.s1); out.s2 = convert_int_sat_rtp(v.s2); out.s3 = convert_int_sat_rtp(v.s3); out.s4 = convert_int_sat_rtp(v.s4); out.s5 = convert_int_sat_rtp(v.s5); out.s6 = convert_int_sat_rtp(v.s6); out.s7 = convert_int_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(char8 v) { /* char8 -> int8: feeds each of v's eight components through convert_int_sat_rtn */
+  int8 out; out.s0 = convert_int_sat_rtn(v.s0); out.s1 = convert_int_sat_rtn(v.s1); out.s2 = convert_int_sat_rtn(v.s2); out.s3 = convert_int_sat_rtn(v.s3); out.s4 = convert_int_sat_rtn(v.s4); out.s5 = convert_int_sat_rtn(v.s5); out.s6 = convert_int_sat_rtn(v.s6); out.s7 = convert_int_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(char8 v) { /* char8 -> uint8: feeds each of v's eight components through convert_uint_sat_rte */
+  uint8 out; out.s0 = convert_uint_sat_rte(v.s0); out.s1 = convert_uint_sat_rte(v.s1); out.s2 = convert_uint_sat_rte(v.s2); out.s3 = convert_uint_sat_rte(v.s3); out.s4 = convert_uint_sat_rte(v.s4); out.s5 = convert_uint_sat_rte(v.s5); out.s6 = convert_uint_sat_rte(v.s6); out.s7 = convert_uint_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(char8 v) { /* char8 -> uint8: feeds each of v's eight components through convert_uint_sat_rtz */
+  uint8 out; out.s0 = convert_uint_sat_rtz(v.s0); out.s1 = convert_uint_sat_rtz(v.s1); out.s2 = convert_uint_sat_rtz(v.s2); out.s3 = convert_uint_sat_rtz(v.s3); out.s4 = convert_uint_sat_rtz(v.s4); out.s5 = convert_uint_sat_rtz(v.s5); out.s6 = convert_uint_sat_rtz(v.s6); out.s7 = convert_uint_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(char8 v) { /* char8 -> uint8: feeds each of v's eight components through convert_uint_sat_rtp */
+  uint8 out; out.s0 = convert_uint_sat_rtp(v.s0); out.s1 = convert_uint_sat_rtp(v.s1); out.s2 = convert_uint_sat_rtp(v.s2); out.s3 = convert_uint_sat_rtp(v.s3); out.s4 = convert_uint_sat_rtp(v.s4); out.s5 = convert_uint_sat_rtp(v.s5); out.s6 = convert_uint_sat_rtp(v.s6); out.s7 = convert_uint_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(char8 v) { /* char8 -> uint8: feeds each of v's eight components through convert_uint_sat_rtn */
+  uint8 out; out.s0 = convert_uint_sat_rtn(v.s0); out.s1 = convert_uint_sat_rtn(v.s1); out.s2 = convert_uint_sat_rtn(v.s2); out.s3 = convert_uint_sat_rtn(v.s3); out.s4 = convert_uint_sat_rtn(v.s4); out.s5 = convert_uint_sat_rtn(v.s5); out.s6 = convert_uint_sat_rtn(v.s6); out.s7 = convert_uint_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(char8 v) { /* char8 -> short8: feeds each of v's eight components through convert_short_sat_rte */
+  short8 out; out.s0 = convert_short_sat_rte(v.s0); out.s1 = convert_short_sat_rte(v.s1); out.s2 = convert_short_sat_rte(v.s2); out.s3 = convert_short_sat_rte(v.s3); out.s4 = convert_short_sat_rte(v.s4); out.s5 = convert_short_sat_rte(v.s5); out.s6 = convert_short_sat_rte(v.s6); out.s7 = convert_short_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(char8 v) { /* char8 -> short8: feeds each of v's eight components through convert_short_sat_rtz */
+  short8 out; out.s0 = convert_short_sat_rtz(v.s0); out.s1 = convert_short_sat_rtz(v.s1); out.s2 = convert_short_sat_rtz(v.s2); out.s3 = convert_short_sat_rtz(v.s3); out.s4 = convert_short_sat_rtz(v.s4); out.s5 = convert_short_sat_rtz(v.s5); out.s6 = convert_short_sat_rtz(v.s6); out.s7 = convert_short_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(char8 v) { /* char8 -> short8: feeds each of v's eight components through convert_short_sat_rtp */
+  short8 out; out.s0 = convert_short_sat_rtp(v.s0); out.s1 = convert_short_sat_rtp(v.s1); out.s2 = convert_short_sat_rtp(v.s2); out.s3 = convert_short_sat_rtp(v.s3); out.s4 = convert_short_sat_rtp(v.s4); out.s5 = convert_short_sat_rtp(v.s5); out.s6 = convert_short_sat_rtp(v.s6); out.s7 = convert_short_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(char8 v) { /* char8 -> short8: feeds each of v's eight components through convert_short_sat_rtn */
+  short8 out; out.s0 = convert_short_sat_rtn(v.s0); out.s1 = convert_short_sat_rtn(v.s1); out.s2 = convert_short_sat_rtn(v.s2); out.s3 = convert_short_sat_rtn(v.s3); out.s4 = convert_short_sat_rtn(v.s4); out.s5 = convert_short_sat_rtn(v.s5); out.s6 = convert_short_sat_rtn(v.s6); out.s7 = convert_short_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(char8 v) { /* char8 -> ushort8: feeds each of v's eight components through convert_ushort_sat_rte */
+  ushort8 out; out.s0 = convert_ushort_sat_rte(v.s0); out.s1 = convert_ushort_sat_rte(v.s1); out.s2 = convert_ushort_sat_rte(v.s2); out.s3 = convert_ushort_sat_rte(v.s3); out.s4 = convert_ushort_sat_rte(v.s4); out.s5 = convert_ushort_sat_rte(v.s5); out.s6 = convert_ushort_sat_rte(v.s6); out.s7 = convert_ushort_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(char8 v) { /* char8 -> ushort8: feeds each of v's eight components through convert_ushort_sat_rtz */
+  ushort8 out; out.s0 = convert_ushort_sat_rtz(v.s0); out.s1 = convert_ushort_sat_rtz(v.s1); out.s2 = convert_ushort_sat_rtz(v.s2); out.s3 = convert_ushort_sat_rtz(v.s3); out.s4 = convert_ushort_sat_rtz(v.s4); out.s5 = convert_ushort_sat_rtz(v.s5); out.s6 = convert_ushort_sat_rtz(v.s6); out.s7 = convert_ushort_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(char8 v) { /* char8 -> ushort8: feeds each of v's eight components through convert_ushort_sat_rtp */
+  ushort8 out; out.s0 = convert_ushort_sat_rtp(v.s0); out.s1 = convert_ushort_sat_rtp(v.s1); out.s2 = convert_ushort_sat_rtp(v.s2); out.s3 = convert_ushort_sat_rtp(v.s3); out.s4 = convert_ushort_sat_rtp(v.s4); out.s5 = convert_ushort_sat_rtp(v.s5); out.s6 = convert_ushort_sat_rtp(v.s6); out.s7 = convert_ushort_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(char8 v) { /* char8 -> ushort8: feeds each of v's eight components through convert_ushort_sat_rtn */
+  ushort8 out; out.s0 = convert_ushort_sat_rtn(v.s0); out.s1 = convert_ushort_sat_rtn(v.s1); out.s2 = convert_ushort_sat_rtn(v.s2); out.s3 = convert_ushort_sat_rtn(v.s3); out.s4 = convert_ushort_sat_rtn(v.s4); out.s5 = convert_ushort_sat_rtn(v.s5); out.s6 = convert_ushort_sat_rtn(v.s6); out.s7 = convert_ushort_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(char8 v) { /* char8 -> char8: feeds each of v's eight components through convert_char_sat_rte */
+  char8 out; out.s0 = convert_char_sat_rte(v.s0); out.s1 = convert_char_sat_rte(v.s1); out.s2 = convert_char_sat_rte(v.s2); out.s3 = convert_char_sat_rte(v.s3); out.s4 = convert_char_sat_rte(v.s4); out.s5 = convert_char_sat_rte(v.s5); out.s6 = convert_char_sat_rte(v.s6); out.s7 = convert_char_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(char8 v) { /* char8 -> char8: feeds each of v's eight components through convert_char_sat_rtz */
+  char8 out; out.s0 = convert_char_sat_rtz(v.s0); out.s1 = convert_char_sat_rtz(v.s1); out.s2 = convert_char_sat_rtz(v.s2); out.s3 = convert_char_sat_rtz(v.s3); out.s4 = convert_char_sat_rtz(v.s4); out.s5 = convert_char_sat_rtz(v.s5); out.s6 = convert_char_sat_rtz(v.s6); out.s7 = convert_char_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(char8 v) { /* char8 -> char8: feeds each of v's eight components through convert_char_sat_rtp */
+  char8 out; out.s0 = convert_char_sat_rtp(v.s0); out.s1 = convert_char_sat_rtp(v.s1); out.s2 = convert_char_sat_rtp(v.s2); out.s3 = convert_char_sat_rtp(v.s3); out.s4 = convert_char_sat_rtp(v.s4); out.s5 = convert_char_sat_rtp(v.s5); out.s6 = convert_char_sat_rtp(v.s6); out.s7 = convert_char_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(char8 v) { /* char8 -> char8: feeds each of v's eight components through convert_char_sat_rtn */
+  char8 out; out.s0 = convert_char_sat_rtn(v.s0); out.s1 = convert_char_sat_rtn(v.s1); out.s2 = convert_char_sat_rtn(v.s2); out.s3 = convert_char_sat_rtn(v.s3); out.s4 = convert_char_sat_rtn(v.s4); out.s5 = convert_char_sat_rtn(v.s5); out.s6 = convert_char_sat_rtn(v.s6); out.s7 = convert_char_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(char8 v) { /* char8 -> uchar8: feeds each of v's eight components through convert_uchar_sat_rte */
+  uchar8 out; out.s0 = convert_uchar_sat_rte(v.s0); out.s1 = convert_uchar_sat_rte(v.s1); out.s2 = convert_uchar_sat_rte(v.s2); out.s3 = convert_uchar_sat_rte(v.s3); out.s4 = convert_uchar_sat_rte(v.s4); out.s5 = convert_uchar_sat_rte(v.s5); out.s6 = convert_uchar_sat_rte(v.s6); out.s7 = convert_uchar_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(char8 v) { /* char8 -> uchar8: feeds each of v's eight components through convert_uchar_sat_rtz */
+  uchar8 out; out.s0 = convert_uchar_sat_rtz(v.s0); out.s1 = convert_uchar_sat_rtz(v.s1); out.s2 = convert_uchar_sat_rtz(v.s2); out.s3 = convert_uchar_sat_rtz(v.s3); out.s4 = convert_uchar_sat_rtz(v.s4); out.s5 = convert_uchar_sat_rtz(v.s5); out.s6 = convert_uchar_sat_rtz(v.s6); out.s7 = convert_uchar_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(char8 v) { /* char8 -> uchar8: feeds each of v's eight components through convert_uchar_sat_rtp */
+  uchar8 out; out.s0 = convert_uchar_sat_rtp(v.s0); out.s1 = convert_uchar_sat_rtp(v.s1); out.s2 = convert_uchar_sat_rtp(v.s2); out.s3 = convert_uchar_sat_rtp(v.s3); out.s4 = convert_uchar_sat_rtp(v.s4); out.s5 = convert_uchar_sat_rtp(v.s5); out.s6 = convert_uchar_sat_rtp(v.s6); out.s7 = convert_uchar_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(char8 v) { /* char8 -> uchar8: feeds each of v's eight components through convert_uchar_sat_rtn */
+  uchar8 out; out.s0 = convert_uchar_sat_rtn(v.s0); out.s1 = convert_uchar_sat_rtn(v.s1); out.s2 = convert_uchar_sat_rtn(v.s2); out.s3 = convert_uchar_sat_rtn(v.s3); out.s4 = convert_uchar_sat_rtn(v.s4); out.s5 = convert_uchar_sat_rtn(v.s5); out.s6 = convert_uchar_sat_rtn(v.s6); out.s7 = convert_uchar_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(uchar8 v) { /* uchar8 -> long8: feeds each of v's eight components through convert_long_sat_rte */
+  long8 out; out.s0 = convert_long_sat_rte(v.s0); out.s1 = convert_long_sat_rte(v.s1); out.s2 = convert_long_sat_rte(v.s2); out.s3 = convert_long_sat_rte(v.s3); out.s4 = convert_long_sat_rte(v.s4); out.s5 = convert_long_sat_rte(v.s5); out.s6 = convert_long_sat_rte(v.s6); out.s7 = convert_long_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(uchar8 v) { /* uchar8 -> long8: feeds each of v's eight components through convert_long_sat_rtz */
+  long8 out; out.s0 = convert_long_sat_rtz(v.s0); out.s1 = convert_long_sat_rtz(v.s1); out.s2 = convert_long_sat_rtz(v.s2); out.s3 = convert_long_sat_rtz(v.s3); out.s4 = convert_long_sat_rtz(v.s4); out.s5 = convert_long_sat_rtz(v.s5); out.s6 = convert_long_sat_rtz(v.s6); out.s7 = convert_long_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(uchar8 v) { /* uchar8 -> long8: feeds each of v's eight components through convert_long_sat_rtp */
+  long8 out; out.s0 = convert_long_sat_rtp(v.s0); out.s1 = convert_long_sat_rtp(v.s1); out.s2 = convert_long_sat_rtp(v.s2); out.s3 = convert_long_sat_rtp(v.s3); out.s4 = convert_long_sat_rtp(v.s4); out.s5 = convert_long_sat_rtp(v.s5); out.s6 = convert_long_sat_rtp(v.s6); out.s7 = convert_long_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(uchar8 v) { /* uchar8 -> long8: feeds each of v's eight components through convert_long_sat_rtn */
+  long8 out; out.s0 = convert_long_sat_rtn(v.s0); out.s1 = convert_long_sat_rtn(v.s1); out.s2 = convert_long_sat_rtn(v.s2); out.s3 = convert_long_sat_rtn(v.s3); out.s4 = convert_long_sat_rtn(v.s4); out.s5 = convert_long_sat_rtn(v.s5); out.s6 = convert_long_sat_rtn(v.s6); out.s7 = convert_long_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(uchar8 v) { /* uchar8 -> ulong8: feeds each of v's eight components through convert_ulong_sat_rte */
+  ulong8 out; out.s0 = convert_ulong_sat_rte(v.s0); out.s1 = convert_ulong_sat_rte(v.s1); out.s2 = convert_ulong_sat_rte(v.s2); out.s3 = convert_ulong_sat_rte(v.s3); out.s4 = convert_ulong_sat_rte(v.s4); out.s5 = convert_ulong_sat_rte(v.s5); out.s6 = convert_ulong_sat_rte(v.s6); out.s7 = convert_ulong_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(uchar8 v) { /* uchar8 -> ulong8: feeds each of v's eight components through convert_ulong_sat_rtz */
+  ulong8 out; out.s0 = convert_ulong_sat_rtz(v.s0); out.s1 = convert_ulong_sat_rtz(v.s1); out.s2 = convert_ulong_sat_rtz(v.s2); out.s3 = convert_ulong_sat_rtz(v.s3); out.s4 = convert_ulong_sat_rtz(v.s4); out.s5 = convert_ulong_sat_rtz(v.s5); out.s6 = convert_ulong_sat_rtz(v.s6); out.s7 = convert_ulong_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(uchar8 v) { /* uchar8 -> ulong8: feeds each of v's eight components through convert_ulong_sat_rtp */
+  ulong8 out; out.s0 = convert_ulong_sat_rtp(v.s0); out.s1 = convert_ulong_sat_rtp(v.s1); out.s2 = convert_ulong_sat_rtp(v.s2); out.s3 = convert_ulong_sat_rtp(v.s3); out.s4 = convert_ulong_sat_rtp(v.s4); out.s5 = convert_ulong_sat_rtp(v.s5); out.s6 = convert_ulong_sat_rtp(v.s6); out.s7 = convert_ulong_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(uchar8 v) { /* uchar8 -> ulong8: feeds each of v's eight components through convert_ulong_sat_rtn */
+  ulong8 out; out.s0 = convert_ulong_sat_rtn(v.s0); out.s1 = convert_ulong_sat_rtn(v.s1); out.s2 = convert_ulong_sat_rtn(v.s2); out.s3 = convert_ulong_sat_rtn(v.s3); out.s4 = convert_ulong_sat_rtn(v.s4); out.s5 = convert_ulong_sat_rtn(v.s5); out.s6 = convert_ulong_sat_rtn(v.s6); out.s7 = convert_ulong_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(uchar8 v) { /* uchar8 -> int8: feeds each of v's eight components through convert_int_sat_rte */
+  int8 out; out.s0 = convert_int_sat_rte(v.s0); out.s1 = convert_int_sat_rte(v.s1); out.s2 = convert_int_sat_rte(v.s2); out.s3 = convert_int_sat_rte(v.s3); out.s4 = convert_int_sat_rte(v.s4); out.s5 = convert_int_sat_rte(v.s5); out.s6 = convert_int_sat_rte(v.s6); out.s7 = convert_int_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(uchar8 v) { /* uchar8 -> int8: feeds each of v's eight components through convert_int_sat_rtz */
+  int8 out; out.s0 = convert_int_sat_rtz(v.s0); out.s1 = convert_int_sat_rtz(v.s1); out.s2 = convert_int_sat_rtz(v.s2); out.s3 = convert_int_sat_rtz(v.s3); out.s4 = convert_int_sat_rtz(v.s4); out.s5 = convert_int_sat_rtz(v.s5); out.s6 = convert_int_sat_rtz(v.s6); out.s7 = convert_int_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(uchar8 v) { /* uchar8 -> int8: feeds each of v's eight components through convert_int_sat_rtp */
+  int8 out; out.s0 = convert_int_sat_rtp(v.s0); out.s1 = convert_int_sat_rtp(v.s1); out.s2 = convert_int_sat_rtp(v.s2); out.s3 = convert_int_sat_rtp(v.s3); out.s4 = convert_int_sat_rtp(v.s4); out.s5 = convert_int_sat_rtp(v.s5); out.s6 = convert_int_sat_rtp(v.s6); out.s7 = convert_int_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(uchar8 v) { /* uchar8 -> int8: feeds each of v's eight components through convert_int_sat_rtn */
+  int8 out; out.s0 = convert_int_sat_rtn(v.s0); out.s1 = convert_int_sat_rtn(v.s1); out.s2 = convert_int_sat_rtn(v.s2); out.s3 = convert_int_sat_rtn(v.s3); out.s4 = convert_int_sat_rtn(v.s4); out.s5 = convert_int_sat_rtn(v.s5); out.s6 = convert_int_sat_rtn(v.s6); out.s7 = convert_int_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(uchar8 v) { /* uchar8 -> uint8: feeds each of v's eight components through convert_uint_sat_rte */
+  uint8 out; out.s0 = convert_uint_sat_rte(v.s0); out.s1 = convert_uint_sat_rte(v.s1); out.s2 = convert_uint_sat_rte(v.s2); out.s3 = convert_uint_sat_rte(v.s3); out.s4 = convert_uint_sat_rte(v.s4); out.s5 = convert_uint_sat_rte(v.s5); out.s6 = convert_uint_sat_rte(v.s6); out.s7 = convert_uint_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(uchar8 v) { /* uchar8 -> uint8: feeds each of v's eight components through convert_uint_sat_rtz */
+  uint8 out; out.s0 = convert_uint_sat_rtz(v.s0); out.s1 = convert_uint_sat_rtz(v.s1); out.s2 = convert_uint_sat_rtz(v.s2); out.s3 = convert_uint_sat_rtz(v.s3); out.s4 = convert_uint_sat_rtz(v.s4); out.s5 = convert_uint_sat_rtz(v.s5); out.s6 = convert_uint_sat_rtz(v.s6); out.s7 = convert_uint_sat_rtz(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(uchar8 v) { /* uchar8 -> uint8: feeds each of v's eight components through convert_uint_sat_rtp */
+  uint8 out; out.s0 = convert_uint_sat_rtp(v.s0); out.s1 = convert_uint_sat_rtp(v.s1); out.s2 = convert_uint_sat_rtp(v.s2); out.s3 = convert_uint_sat_rtp(v.s3); out.s4 = convert_uint_sat_rtp(v.s4); out.s5 = convert_uint_sat_rtp(v.s5); out.s6 = convert_uint_sat_rtp(v.s6); out.s7 = convert_uint_sat_rtp(v.s7); return out;
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(uchar8 v) { /* uchar8 -> uint8: feeds each of v's eight components through convert_uint_sat_rtn */
+  uint8 out; out.s0 = convert_uint_sat_rtn(v.s0); out.s1 = convert_uint_sat_rtn(v.s1); out.s2 = convert_uint_sat_rtn(v.s2); out.s3 = convert_uint_sat_rtn(v.s3); out.s4 = convert_uint_sat_rtn(v.s4); out.s5 = convert_uint_sat_rtn(v.s5); out.s6 = convert_uint_sat_rtn(v.s6); out.s7 = convert_uint_sat_rtn(v.s7); return out;
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(uchar8 v) { /* uchar8 -> short8: feeds each of v's eight components through convert_short_sat_rte */
+  short8 out; out.s0 = convert_short_sat_rte(v.s0); out.s1 = convert_short_sat_rte(v.s1); out.s2 = convert_short_sat_rte(v.s2); out.s3 = convert_short_sat_rte(v.s3); out.s4 = convert_short_sat_rte(v.s4); out.s5 = convert_short_sat_rte(v.s5); out.s6 = convert_short_sat_rte(v.s6); out.s7 = convert_short_sat_rte(v.s7); return out;
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(uchar8 v) { /* uchar8 -> short8: feeds each of v's eight components through convert_short_sat_rtz */
+  short8 out; out.s0 = convert_short_sat_rtz(v.s0); out.s1 = convert_short_sat_rtz(v.s1); out.s2 = convert_short_sat_rtz(v.s2); out.s3 = convert_short_sat_rtz(v.s3); out.s4 = convert_short_sat_rtz(v.s4); out.s5 = convert_short_sat_rtz(v.s5); out.s6 = convert_short_sat_rtz(v.s6); out.s7 = convert_short_sat_rtz(v.s7); return out;
+}
+
+/* uchar8 -> short8: each lane goes through the scalar convert_short_sat_rtp. */
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(uchar8 v) {
+  short8 r;
+  r.s0 = convert_short_sat_rtp(v.s0);
+  r.s1 = convert_short_sat_rtp(v.s1);
+  r.s2 = convert_short_sat_rtp(v.s2);
+  r.s3 = convert_short_sat_rtp(v.s3);
+  r.s4 = convert_short_sat_rtp(v.s4);
+  r.s5 = convert_short_sat_rtp(v.s5);
+  r.s6 = convert_short_sat_rtp(v.s6);
+  r.s7 = convert_short_sat_rtp(v.s7);
+  return r;
+}
+
+/* uchar8 -> short8: each lane goes through the scalar convert_short_sat_rtn. */
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(uchar8 v) {
+  short8 r;
+  r.s0 = convert_short_sat_rtn(v.s0);
+  r.s1 = convert_short_sat_rtn(v.s1);
+  r.s2 = convert_short_sat_rtn(v.s2);
+  r.s3 = convert_short_sat_rtn(v.s3);
+  r.s4 = convert_short_sat_rtn(v.s4);
+  r.s5 = convert_short_sat_rtn(v.s5);
+  r.s6 = convert_short_sat_rtn(v.s6);
+  r.s7 = convert_short_sat_rtn(v.s7);
+  return r;
+}
+
+/* uchar8 -> ushort8: each lane goes through the scalar convert_ushort_sat_rte. */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(uchar8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rte(v.s0);
+  r.s1 = convert_ushort_sat_rte(v.s1);
+  r.s2 = convert_ushort_sat_rte(v.s2);
+  r.s3 = convert_ushort_sat_rte(v.s3);
+  r.s4 = convert_ushort_sat_rte(v.s4);
+  r.s5 = convert_ushort_sat_rte(v.s5);
+  r.s6 = convert_ushort_sat_rte(v.s6);
+  r.s7 = convert_ushort_sat_rte(v.s7);
+  return r;
+}
+
+/* uchar8 -> ushort8: each lane goes through the scalar convert_ushort_sat_rtz. */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(uchar8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rtz(v.s0);
+  r.s1 = convert_ushort_sat_rtz(v.s1);
+  r.s2 = convert_ushort_sat_rtz(v.s2);
+  r.s3 = convert_ushort_sat_rtz(v.s3);
+  r.s4 = convert_ushort_sat_rtz(v.s4);
+  r.s5 = convert_ushort_sat_rtz(v.s5);
+  r.s6 = convert_ushort_sat_rtz(v.s6);
+  r.s7 = convert_ushort_sat_rtz(v.s7);
+  return r;
+}
+
+/* uchar8 -> ushort8: each lane goes through the scalar convert_ushort_sat_rtp. */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(uchar8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rtp(v.s0);
+  r.s1 = convert_ushort_sat_rtp(v.s1);
+  r.s2 = convert_ushort_sat_rtp(v.s2);
+  r.s3 = convert_ushort_sat_rtp(v.s3);
+  r.s4 = convert_ushort_sat_rtp(v.s4);
+  r.s5 = convert_ushort_sat_rtp(v.s5);
+  r.s6 = convert_ushort_sat_rtp(v.s6);
+  r.s7 = convert_ushort_sat_rtp(v.s7);
+  return r;
+}
+
+/* uchar8 -> ushort8: each lane goes through the scalar convert_ushort_sat_rtn. */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(uchar8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rtn(v.s0);
+  r.s1 = convert_ushort_sat_rtn(v.s1);
+  r.s2 = convert_ushort_sat_rtn(v.s2);
+  r.s3 = convert_ushort_sat_rtn(v.s3);
+  r.s4 = convert_ushort_sat_rtn(v.s4);
+  r.s5 = convert_ushort_sat_rtn(v.s5);
+  r.s6 = convert_ushort_sat_rtn(v.s6);
+  r.s7 = convert_ushort_sat_rtn(v.s7);
+  return r;
+}
+
+/* uchar8 -> char8: each lane goes through the scalar convert_char_sat_rte. */
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(uchar8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rte(v.s0);
+  r.s1 = convert_char_sat_rte(v.s1);
+  r.s2 = convert_char_sat_rte(v.s2);
+  r.s3 = convert_char_sat_rte(v.s3);
+  r.s4 = convert_char_sat_rte(v.s4);
+  r.s5 = convert_char_sat_rte(v.s5);
+  r.s6 = convert_char_sat_rte(v.s6);
+  r.s7 = convert_char_sat_rte(v.s7);
+  return r;
+}
+
+/* uchar8 -> char8: each lane goes through the scalar convert_char_sat_rtz. */
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(uchar8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rtz(v.s0);
+  r.s1 = convert_char_sat_rtz(v.s1);
+  r.s2 = convert_char_sat_rtz(v.s2);
+  r.s3 = convert_char_sat_rtz(v.s3);
+  r.s4 = convert_char_sat_rtz(v.s4);
+  r.s5 = convert_char_sat_rtz(v.s5);
+  r.s6 = convert_char_sat_rtz(v.s6);
+  r.s7 = convert_char_sat_rtz(v.s7);
+  return r;
+}
+
+/* uchar8 -> char8: each lane goes through the scalar convert_char_sat_rtp. */
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(uchar8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rtp(v.s0);
+  r.s1 = convert_char_sat_rtp(v.s1);
+  r.s2 = convert_char_sat_rtp(v.s2);
+  r.s3 = convert_char_sat_rtp(v.s3);
+  r.s4 = convert_char_sat_rtp(v.s4);
+  r.s5 = convert_char_sat_rtp(v.s5);
+  r.s6 = convert_char_sat_rtp(v.s6);
+  r.s7 = convert_char_sat_rtp(v.s7);
+  return r;
+}
+
+/* uchar8 -> char8: each lane goes through the scalar convert_char_sat_rtn. */
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(uchar8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rtn(v.s0);
+  r.s1 = convert_char_sat_rtn(v.s1);
+  r.s2 = convert_char_sat_rtn(v.s2);
+  r.s3 = convert_char_sat_rtn(v.s3);
+  r.s4 = convert_char_sat_rtn(v.s4);
+  r.s5 = convert_char_sat_rtn(v.s5);
+  r.s6 = convert_char_sat_rtn(v.s6);
+  r.s7 = convert_char_sat_rtn(v.s7);
+  return r;
+}
+
+/* uchar8 -> uchar8: each lane goes through the scalar convert_uchar_sat_rte. */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(uchar8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rte(v.s0);
+  r.s1 = convert_uchar_sat_rte(v.s1);
+  r.s2 = convert_uchar_sat_rte(v.s2);
+  r.s3 = convert_uchar_sat_rte(v.s3);
+  r.s4 = convert_uchar_sat_rte(v.s4);
+  r.s5 = convert_uchar_sat_rte(v.s5);
+  r.s6 = convert_uchar_sat_rte(v.s6);
+  r.s7 = convert_uchar_sat_rte(v.s7);
+  return r;
+}
+
+/* uchar8 -> uchar8: each lane goes through the scalar convert_uchar_sat_rtz. */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(uchar8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rtz(v.s0);
+  r.s1 = convert_uchar_sat_rtz(v.s1);
+  r.s2 = convert_uchar_sat_rtz(v.s2);
+  r.s3 = convert_uchar_sat_rtz(v.s3);
+  r.s4 = convert_uchar_sat_rtz(v.s4);
+  r.s5 = convert_uchar_sat_rtz(v.s5);
+  r.s6 = convert_uchar_sat_rtz(v.s6);
+  r.s7 = convert_uchar_sat_rtz(v.s7);
+  return r;
+}
+
+/* uchar8 -> uchar8: each lane goes through the scalar convert_uchar_sat_rtp. */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(uchar8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rtp(v.s0);
+  r.s1 = convert_uchar_sat_rtp(v.s1);
+  r.s2 = convert_uchar_sat_rtp(v.s2);
+  r.s3 = convert_uchar_sat_rtp(v.s3);
+  r.s4 = convert_uchar_sat_rtp(v.s4);
+  r.s5 = convert_uchar_sat_rtp(v.s5);
+  r.s6 = convert_uchar_sat_rtp(v.s6);
+  r.s7 = convert_uchar_sat_rtp(v.s7);
+  return r;
+}
+
+/* uchar8 -> uchar8: each lane goes through the scalar convert_uchar_sat_rtn. */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(uchar8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rtn(v.s0);
+  r.s1 = convert_uchar_sat_rtn(v.s1);
+  r.s2 = convert_uchar_sat_rtn(v.s2);
+  r.s3 = convert_uchar_sat_rtn(v.s3);
+  r.s4 = convert_uchar_sat_rtn(v.s4);
+  r.s5 = convert_uchar_sat_rtn(v.s5);
+  r.s6 = convert_uchar_sat_rtn(v.s6);
+  r.s7 = convert_uchar_sat_rtn(v.s7);
+  return r;
+}
+
+/* float8 -> long8: each lane goes through the scalar convert_long_sat_rte. */
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(float8 v) {
+  long8 r;
+  r.s0 = convert_long_sat_rte(v.s0);
+  r.s1 = convert_long_sat_rte(v.s1);
+  r.s2 = convert_long_sat_rte(v.s2);
+  r.s3 = convert_long_sat_rte(v.s3);
+  r.s4 = convert_long_sat_rte(v.s4);
+  r.s5 = convert_long_sat_rte(v.s5);
+  r.s6 = convert_long_sat_rte(v.s6);
+  r.s7 = convert_long_sat_rte(v.s7);
+  return r;
+}
+
+/* float8 -> long8: each lane goes through the scalar convert_long_sat_rtz. */
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(float8 v) {
+  long8 r;
+  r.s0 = convert_long_sat_rtz(v.s0);
+  r.s1 = convert_long_sat_rtz(v.s1);
+  r.s2 = convert_long_sat_rtz(v.s2);
+  r.s3 = convert_long_sat_rtz(v.s3);
+  r.s4 = convert_long_sat_rtz(v.s4);
+  r.s5 = convert_long_sat_rtz(v.s5);
+  r.s6 = convert_long_sat_rtz(v.s6);
+  r.s7 = convert_long_sat_rtz(v.s7);
+  return r;
+}
+
+/* float8 -> long8: each lane goes through the scalar convert_long_sat_rtp. */
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(float8 v) {
+  long8 r;
+  r.s0 = convert_long_sat_rtp(v.s0);
+  r.s1 = convert_long_sat_rtp(v.s1);
+  r.s2 = convert_long_sat_rtp(v.s2);
+  r.s3 = convert_long_sat_rtp(v.s3);
+  r.s4 = convert_long_sat_rtp(v.s4);
+  r.s5 = convert_long_sat_rtp(v.s5);
+  r.s6 = convert_long_sat_rtp(v.s6);
+  r.s7 = convert_long_sat_rtp(v.s7);
+  return r;
+}
+
+/* float8 -> long8: each lane goes through the scalar convert_long_sat_rtn. */
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(float8 v) {
+  long8 r;
+  r.s0 = convert_long_sat_rtn(v.s0);
+  r.s1 = convert_long_sat_rtn(v.s1);
+  r.s2 = convert_long_sat_rtn(v.s2);
+  r.s3 = convert_long_sat_rtn(v.s3);
+  r.s4 = convert_long_sat_rtn(v.s4);
+  r.s5 = convert_long_sat_rtn(v.s5);
+  r.s6 = convert_long_sat_rtn(v.s6);
+  r.s7 = convert_long_sat_rtn(v.s7);
+  return r;
+}
+
+/* float8 -> ulong8: each lane goes through the scalar convert_ulong_sat_rte. */
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(float8 v) {
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rte(v.s0);
+  r.s1 = convert_ulong_sat_rte(v.s1);
+  r.s2 = convert_ulong_sat_rte(v.s2);
+  r.s3 = convert_ulong_sat_rte(v.s3);
+  r.s4 = convert_ulong_sat_rte(v.s4);
+  r.s5 = convert_ulong_sat_rte(v.s5);
+  r.s6 = convert_ulong_sat_rte(v.s6);
+  r.s7 = convert_ulong_sat_rte(v.s7);
+  return r;
+}
+
+/* float8 -> ulong8: each lane goes through the scalar convert_ulong_sat_rtz. */
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(float8 v) {
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rtz(v.s0);
+  r.s1 = convert_ulong_sat_rtz(v.s1);
+  r.s2 = convert_ulong_sat_rtz(v.s2);
+  r.s3 = convert_ulong_sat_rtz(v.s3);
+  r.s4 = convert_ulong_sat_rtz(v.s4);
+  r.s5 = convert_ulong_sat_rtz(v.s5);
+  r.s6 = convert_ulong_sat_rtz(v.s6);
+  r.s7 = convert_ulong_sat_rtz(v.s7);
+  return r;
+}
+
+/* float8 -> ulong8: each lane goes through the scalar convert_ulong_sat_rtp. */
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(float8 v) {
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rtp(v.s0);
+  r.s1 = convert_ulong_sat_rtp(v.s1);
+  r.s2 = convert_ulong_sat_rtp(v.s2);
+  r.s3 = convert_ulong_sat_rtp(v.s3);
+  r.s4 = convert_ulong_sat_rtp(v.s4);
+  r.s5 = convert_ulong_sat_rtp(v.s5);
+  r.s6 = convert_ulong_sat_rtp(v.s6);
+  r.s7 = convert_ulong_sat_rtp(v.s7);
+  return r;
+}
+
+/* float8 -> ulong8: each lane goes through the scalar convert_ulong_sat_rtn. */
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(float8 v) {
+  ulong8 r;
+  r.s0 = convert_ulong_sat_rtn(v.s0);
+  r.s1 = convert_ulong_sat_rtn(v.s1);
+  r.s2 = convert_ulong_sat_rtn(v.s2);
+  r.s3 = convert_ulong_sat_rtn(v.s3);
+  r.s4 = convert_ulong_sat_rtn(v.s4);
+  r.s5 = convert_ulong_sat_rtn(v.s5);
+  r.s6 = convert_ulong_sat_rtn(v.s6);
+  r.s7 = convert_ulong_sat_rtn(v.s7);
+  return r;
+}
+
+/* float8 -> int8: each lane goes through the scalar convert_int_sat_rte. */
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(float8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rte(v.s0);
+  r.s1 = convert_int_sat_rte(v.s1);
+  r.s2 = convert_int_sat_rte(v.s2);
+  r.s3 = convert_int_sat_rte(v.s3);
+  r.s4 = convert_int_sat_rte(v.s4);
+  r.s5 = convert_int_sat_rte(v.s5);
+  r.s6 = convert_int_sat_rte(v.s6);
+  r.s7 = convert_int_sat_rte(v.s7);
+  return r;
+}
+
+/* float8 -> int8: each lane goes through the scalar convert_int_sat_rtz. */
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(float8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rtz(v.s0);
+  r.s1 = convert_int_sat_rtz(v.s1);
+  r.s2 = convert_int_sat_rtz(v.s2);
+  r.s3 = convert_int_sat_rtz(v.s3);
+  r.s4 = convert_int_sat_rtz(v.s4);
+  r.s5 = convert_int_sat_rtz(v.s5);
+  r.s6 = convert_int_sat_rtz(v.s6);
+  r.s7 = convert_int_sat_rtz(v.s7);
+  return r;
+}
+
+/* float8 -> int8: each lane goes through the scalar convert_int_sat_rtp. */
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(float8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rtp(v.s0);
+  r.s1 = convert_int_sat_rtp(v.s1);
+  r.s2 = convert_int_sat_rtp(v.s2);
+  r.s3 = convert_int_sat_rtp(v.s3);
+  r.s4 = convert_int_sat_rtp(v.s4);
+  r.s5 = convert_int_sat_rtp(v.s5);
+  r.s6 = convert_int_sat_rtp(v.s6);
+  r.s7 = convert_int_sat_rtp(v.s7);
+  return r;
+}
+
+/* float8 -> int8: each lane goes through the scalar convert_int_sat_rtn. */
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(float8 v) {
+  int8 r;
+  r.s0 = convert_int_sat_rtn(v.s0);
+  r.s1 = convert_int_sat_rtn(v.s1);
+  r.s2 = convert_int_sat_rtn(v.s2);
+  r.s3 = convert_int_sat_rtn(v.s3);
+  r.s4 = convert_int_sat_rtn(v.s4);
+  r.s5 = convert_int_sat_rtn(v.s5);
+  r.s6 = convert_int_sat_rtn(v.s6);
+  r.s7 = convert_int_sat_rtn(v.s7);
+  return r;
+}
+
+/* float8 -> uint8: each lane goes through the scalar convert_uint_sat_rte. */
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(float8 v) {
+  uint8 r;
+  r.s0 = convert_uint_sat_rte(v.s0);
+  r.s1 = convert_uint_sat_rte(v.s1);
+  r.s2 = convert_uint_sat_rte(v.s2);
+  r.s3 = convert_uint_sat_rte(v.s3);
+  r.s4 = convert_uint_sat_rte(v.s4);
+  r.s5 = convert_uint_sat_rte(v.s5);
+  r.s6 = convert_uint_sat_rte(v.s6);
+  r.s7 = convert_uint_sat_rte(v.s7);
+  return r;
+}
+
+/* float8 -> uint8: each lane goes through the scalar convert_uint_sat_rtz. */
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(float8 v) {
+  uint8 r;
+  r.s0 = convert_uint_sat_rtz(v.s0);
+  r.s1 = convert_uint_sat_rtz(v.s1);
+  r.s2 = convert_uint_sat_rtz(v.s2);
+  r.s3 = convert_uint_sat_rtz(v.s3);
+  r.s4 = convert_uint_sat_rtz(v.s4);
+  r.s5 = convert_uint_sat_rtz(v.s5);
+  r.s6 = convert_uint_sat_rtz(v.s6);
+  r.s7 = convert_uint_sat_rtz(v.s7);
+  return r;
+}
+
+/* float8 -> uint8: each lane goes through the scalar convert_uint_sat_rtp. */
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(float8 v) {
+  uint8 r;
+  r.s0 = convert_uint_sat_rtp(v.s0);
+  r.s1 = convert_uint_sat_rtp(v.s1);
+  r.s2 = convert_uint_sat_rtp(v.s2);
+  r.s3 = convert_uint_sat_rtp(v.s3);
+  r.s4 = convert_uint_sat_rtp(v.s4);
+  r.s5 = convert_uint_sat_rtp(v.s5);
+  r.s6 = convert_uint_sat_rtp(v.s6);
+  r.s7 = convert_uint_sat_rtp(v.s7);
+  return r;
+}
+
+/* float8 -> uint8: each lane goes through the scalar convert_uint_sat_rtn. */
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(float8 v) {
+  uint8 r;
+  r.s0 = convert_uint_sat_rtn(v.s0);
+  r.s1 = convert_uint_sat_rtn(v.s1);
+  r.s2 = convert_uint_sat_rtn(v.s2);
+  r.s3 = convert_uint_sat_rtn(v.s3);
+  r.s4 = convert_uint_sat_rtn(v.s4);
+  r.s5 = convert_uint_sat_rtn(v.s5);
+  r.s6 = convert_uint_sat_rtn(v.s6);
+  r.s7 = convert_uint_sat_rtn(v.s7);
+  return r;
+}
+
+/* float8 -> short8: each lane goes through the scalar convert_short_sat_rte. */
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(float8 v) {
+  short8 r;
+  r.s0 = convert_short_sat_rte(v.s0);
+  r.s1 = convert_short_sat_rte(v.s1);
+  r.s2 = convert_short_sat_rte(v.s2);
+  r.s3 = convert_short_sat_rte(v.s3);
+  r.s4 = convert_short_sat_rte(v.s4);
+  r.s5 = convert_short_sat_rte(v.s5);
+  r.s6 = convert_short_sat_rte(v.s6);
+  r.s7 = convert_short_sat_rte(v.s7);
+  return r;
+}
+
+/* float8 -> short8: each lane goes through the scalar convert_short_sat_rtz. */
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(float8 v) {
+  short8 r;
+  r.s0 = convert_short_sat_rtz(v.s0);
+  r.s1 = convert_short_sat_rtz(v.s1);
+  r.s2 = convert_short_sat_rtz(v.s2);
+  r.s3 = convert_short_sat_rtz(v.s3);
+  r.s4 = convert_short_sat_rtz(v.s4);
+  r.s5 = convert_short_sat_rtz(v.s5);
+  r.s6 = convert_short_sat_rtz(v.s6);
+  r.s7 = convert_short_sat_rtz(v.s7);
+  return r;
+}
+
+/* float8 -> short8: each lane goes through the scalar convert_short_sat_rtp. */
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(float8 v) {
+  short8 r;
+  r.s0 = convert_short_sat_rtp(v.s0);
+  r.s1 = convert_short_sat_rtp(v.s1);
+  r.s2 = convert_short_sat_rtp(v.s2);
+  r.s3 = convert_short_sat_rtp(v.s3);
+  r.s4 = convert_short_sat_rtp(v.s4);
+  r.s5 = convert_short_sat_rtp(v.s5);
+  r.s6 = convert_short_sat_rtp(v.s6);
+  r.s7 = convert_short_sat_rtp(v.s7);
+  return r;
+}
+
+/* float8 -> short8: each lane goes through the scalar convert_short_sat_rtn. */
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(float8 v) {
+  short8 r;
+  r.s0 = convert_short_sat_rtn(v.s0);
+  r.s1 = convert_short_sat_rtn(v.s1);
+  r.s2 = convert_short_sat_rtn(v.s2);
+  r.s3 = convert_short_sat_rtn(v.s3);
+  r.s4 = convert_short_sat_rtn(v.s4);
+  r.s5 = convert_short_sat_rtn(v.s5);
+  r.s6 = convert_short_sat_rtn(v.s6);
+  r.s7 = convert_short_sat_rtn(v.s7);
+  return r;
+}
+
+/* float8 -> ushort8: each lane goes through the scalar convert_ushort_sat_rte. */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(float8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rte(v.s0);
+  r.s1 = convert_ushort_sat_rte(v.s1);
+  r.s2 = convert_ushort_sat_rte(v.s2);
+  r.s3 = convert_ushort_sat_rte(v.s3);
+  r.s4 = convert_ushort_sat_rte(v.s4);
+  r.s5 = convert_ushort_sat_rte(v.s5);
+  r.s6 = convert_ushort_sat_rte(v.s6);
+  r.s7 = convert_ushort_sat_rte(v.s7);
+  return r;
+}
+
+/* float8 -> ushort8: each lane goes through the scalar convert_ushort_sat_rtz. */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(float8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rtz(v.s0);
+  r.s1 = convert_ushort_sat_rtz(v.s1);
+  r.s2 = convert_ushort_sat_rtz(v.s2);
+  r.s3 = convert_ushort_sat_rtz(v.s3);
+  r.s4 = convert_ushort_sat_rtz(v.s4);
+  r.s5 = convert_ushort_sat_rtz(v.s5);
+  r.s6 = convert_ushort_sat_rtz(v.s6);
+  r.s7 = convert_ushort_sat_rtz(v.s7);
+  return r;
+}
+
+/* float8 -> ushort8: each lane goes through the scalar convert_ushort_sat_rtp. */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(float8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rtp(v.s0);
+  r.s1 = convert_ushort_sat_rtp(v.s1);
+  r.s2 = convert_ushort_sat_rtp(v.s2);
+  r.s3 = convert_ushort_sat_rtp(v.s3);
+  r.s4 = convert_ushort_sat_rtp(v.s4);
+  r.s5 = convert_ushort_sat_rtp(v.s5);
+  r.s6 = convert_ushort_sat_rtp(v.s6);
+  r.s7 = convert_ushort_sat_rtp(v.s7);
+  return r;
+}
+
+/* float8 -> ushort8: each lane goes through the scalar convert_ushort_sat_rtn. */
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(float8 v) {
+  ushort8 r;
+  r.s0 = convert_ushort_sat_rtn(v.s0);
+  r.s1 = convert_ushort_sat_rtn(v.s1);
+  r.s2 = convert_ushort_sat_rtn(v.s2);
+  r.s3 = convert_ushort_sat_rtn(v.s3);
+  r.s4 = convert_ushort_sat_rtn(v.s4);
+  r.s5 = convert_ushort_sat_rtn(v.s5);
+  r.s6 = convert_ushort_sat_rtn(v.s6);
+  r.s7 = convert_ushort_sat_rtn(v.s7);
+  return r;
+}
+
+/* float8 -> char8: each lane goes through the scalar convert_char_sat_rte. */
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(float8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rte(v.s0);
+  r.s1 = convert_char_sat_rte(v.s1);
+  r.s2 = convert_char_sat_rte(v.s2);
+  r.s3 = convert_char_sat_rte(v.s3);
+  r.s4 = convert_char_sat_rte(v.s4);
+  r.s5 = convert_char_sat_rte(v.s5);
+  r.s6 = convert_char_sat_rte(v.s6);
+  r.s7 = convert_char_sat_rte(v.s7);
+  return r;
+}
+
+/* float8 -> char8: each lane goes through the scalar convert_char_sat_rtz. */
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(float8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rtz(v.s0);
+  r.s1 = convert_char_sat_rtz(v.s1);
+  r.s2 = convert_char_sat_rtz(v.s2);
+  r.s3 = convert_char_sat_rtz(v.s3);
+  r.s4 = convert_char_sat_rtz(v.s4);
+  r.s5 = convert_char_sat_rtz(v.s5);
+  r.s6 = convert_char_sat_rtz(v.s6);
+  r.s7 = convert_char_sat_rtz(v.s7);
+  return r;
+}
+
+/* float8 -> char8: each lane goes through the scalar convert_char_sat_rtp. */
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(float8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rtp(v.s0);
+  r.s1 = convert_char_sat_rtp(v.s1);
+  r.s2 = convert_char_sat_rtp(v.s2);
+  r.s3 = convert_char_sat_rtp(v.s3);
+  r.s4 = convert_char_sat_rtp(v.s4);
+  r.s5 = convert_char_sat_rtp(v.s5);
+  r.s6 = convert_char_sat_rtp(v.s6);
+  r.s7 = convert_char_sat_rtp(v.s7);
+  return r;
+}
+
+/* float8 -> char8: each lane goes through the scalar convert_char_sat_rtn. */
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(float8 v) {
+  char8 r;
+  r.s0 = convert_char_sat_rtn(v.s0);
+  r.s1 = convert_char_sat_rtn(v.s1);
+  r.s2 = convert_char_sat_rtn(v.s2);
+  r.s3 = convert_char_sat_rtn(v.s3);
+  r.s4 = convert_char_sat_rtn(v.s4);
+  r.s5 = convert_char_sat_rtn(v.s5);
+  r.s6 = convert_char_sat_rtn(v.s6);
+  r.s7 = convert_char_sat_rtn(v.s7);
+  return r;
+}
+
+/* float8 -> uchar8: each lane goes through the scalar convert_uchar_sat_rte. */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(float8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rte(v.s0);
+  r.s1 = convert_uchar_sat_rte(v.s1);
+  r.s2 = convert_uchar_sat_rte(v.s2);
+  r.s3 = convert_uchar_sat_rte(v.s3);
+  r.s4 = convert_uchar_sat_rte(v.s4);
+  r.s5 = convert_uchar_sat_rte(v.s5);
+  r.s6 = convert_uchar_sat_rte(v.s6);
+  r.s7 = convert_uchar_sat_rte(v.s7);
+  return r;
+}
+
+/* float8 -> uchar8: each lane goes through the scalar convert_uchar_sat_rtz. */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(float8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rtz(v.s0);
+  r.s1 = convert_uchar_sat_rtz(v.s1);
+  r.s2 = convert_uchar_sat_rtz(v.s2);
+  r.s3 = convert_uchar_sat_rtz(v.s3);
+  r.s4 = convert_uchar_sat_rtz(v.s4);
+  r.s5 = convert_uchar_sat_rtz(v.s5);
+  r.s6 = convert_uchar_sat_rtz(v.s6);
+  r.s7 = convert_uchar_sat_rtz(v.s7);
+  return r;
+}
+
+/* float8 -> uchar8: each lane goes through the scalar convert_uchar_sat_rtp. */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(float8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rtp(v.s0);
+  r.s1 = convert_uchar_sat_rtp(v.s1);
+  r.s2 = convert_uchar_sat_rtp(v.s2);
+  r.s3 = convert_uchar_sat_rtp(v.s3);
+  r.s4 = convert_uchar_sat_rtp(v.s4);
+  r.s5 = convert_uchar_sat_rtp(v.s5);
+  r.s6 = convert_uchar_sat_rtp(v.s6);
+  r.s7 = convert_uchar_sat_rtp(v.s7);
+  return r;
+}
+
+/* float8 -> uchar8: each lane goes through the scalar convert_uchar_sat_rtn. */
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(float8 v) {
+  uchar8 r;
+  r.s0 = convert_uchar_sat_rtn(v.s0);
+  r.s1 = convert_uchar_sat_rtn(v.s1);
+  r.s2 = convert_uchar_sat_rtn(v.s2);
+  r.s3 = convert_uchar_sat_rtn(v.s3);
+  r.s4 = convert_uchar_sat_rtn(v.s4);
+  r.s5 = convert_uchar_sat_rtn(v.s5);
+  r.s6 = convert_uchar_sat_rtn(v.s6);
+  r.s7 = convert_uchar_sat_rtn(v.s7);
+  return r;
+}
+
+/* long16 -> long16: each lane goes through the scalar convert_long_sat_rte. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(long16 v) {
+  long16 r;
+  r.s0 = convert_long_sat_rte(v.s0);
+  r.s1 = convert_long_sat_rte(v.s1);
+  r.s2 = convert_long_sat_rte(v.s2);
+  r.s3 = convert_long_sat_rte(v.s3);
+  r.s4 = convert_long_sat_rte(v.s4);
+  r.s5 = convert_long_sat_rte(v.s5);
+  r.s6 = convert_long_sat_rte(v.s6);
+  r.s7 = convert_long_sat_rte(v.s7);
+  r.s8 = convert_long_sat_rte(v.s8);
+  r.s9 = convert_long_sat_rte(v.s9);
+  r.sA = convert_long_sat_rte(v.sA);
+  r.sB = convert_long_sat_rte(v.sB);
+  r.sC = convert_long_sat_rte(v.sC);
+  r.sD = convert_long_sat_rte(v.sD);
+  r.sE = convert_long_sat_rte(v.sE);
+  r.sF = convert_long_sat_rte(v.sF);
+  return r;
+}
+
+/* long16 -> long16: each lane goes through the scalar convert_long_sat_rtz. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(long16 v) {
+  long16 r;
+  r.s0 = convert_long_sat_rtz(v.s0);
+  r.s1 = convert_long_sat_rtz(v.s1);
+  r.s2 = convert_long_sat_rtz(v.s2);
+  r.s3 = convert_long_sat_rtz(v.s3);
+  r.s4 = convert_long_sat_rtz(v.s4);
+  r.s5 = convert_long_sat_rtz(v.s5);
+  r.s6 = convert_long_sat_rtz(v.s6);
+  r.s7 = convert_long_sat_rtz(v.s7);
+  r.s8 = convert_long_sat_rtz(v.s8);
+  r.s9 = convert_long_sat_rtz(v.s9);
+  r.sA = convert_long_sat_rtz(v.sA);
+  r.sB = convert_long_sat_rtz(v.sB);
+  r.sC = convert_long_sat_rtz(v.sC);
+  r.sD = convert_long_sat_rtz(v.sD);
+  r.sE = convert_long_sat_rtz(v.sE);
+  r.sF = convert_long_sat_rtz(v.sF);
+  return r;
+}
+
+/* long16 -> long16: each lane goes through the scalar convert_long_sat_rtp. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(long16 v) {
+  long16 r;
+  r.s0 = convert_long_sat_rtp(v.s0);
+  r.s1 = convert_long_sat_rtp(v.s1);
+  r.s2 = convert_long_sat_rtp(v.s2);
+  r.s3 = convert_long_sat_rtp(v.s3);
+  r.s4 = convert_long_sat_rtp(v.s4);
+  r.s5 = convert_long_sat_rtp(v.s5);
+  r.s6 = convert_long_sat_rtp(v.s6);
+  r.s7 = convert_long_sat_rtp(v.s7);
+  r.s8 = convert_long_sat_rtp(v.s8);
+  r.s9 = convert_long_sat_rtp(v.s9);
+  r.sA = convert_long_sat_rtp(v.sA);
+  r.sB = convert_long_sat_rtp(v.sB);
+  r.sC = convert_long_sat_rtp(v.sC);
+  r.sD = convert_long_sat_rtp(v.sD);
+  r.sE = convert_long_sat_rtp(v.sE);
+  r.sF = convert_long_sat_rtp(v.sF);
+  return r;
+}
+
+/* long16 -> long16: each lane goes through the scalar convert_long_sat_rtn. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(long16 v) {
+  long16 r;
+  r.s0 = convert_long_sat_rtn(v.s0);
+  r.s1 = convert_long_sat_rtn(v.s1);
+  r.s2 = convert_long_sat_rtn(v.s2);
+  r.s3 = convert_long_sat_rtn(v.s3);
+  r.s4 = convert_long_sat_rtn(v.s4);
+  r.s5 = convert_long_sat_rtn(v.s5);
+  r.s6 = convert_long_sat_rtn(v.s6);
+  r.s7 = convert_long_sat_rtn(v.s7);
+  r.s8 = convert_long_sat_rtn(v.s8);
+  r.s9 = convert_long_sat_rtn(v.s9);
+  r.sA = convert_long_sat_rtn(v.sA);
+  r.sB = convert_long_sat_rtn(v.sB);
+  r.sC = convert_long_sat_rtn(v.sC);
+  r.sD = convert_long_sat_rtn(v.sD);
+  r.sE = convert_long_sat_rtn(v.sE);
+  r.sF = convert_long_sat_rtn(v.sF);
+  return r;
+}
+
+/* long16 -> ulong16: each lane goes through the scalar convert_ulong_sat_rte. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(long16 v) {
+  ulong16 r;
+  r.s0 = convert_ulong_sat_rte(v.s0);
+  r.s1 = convert_ulong_sat_rte(v.s1);
+  r.s2 = convert_ulong_sat_rte(v.s2);
+  r.s3 = convert_ulong_sat_rte(v.s3);
+  r.s4 = convert_ulong_sat_rte(v.s4);
+  r.s5 = convert_ulong_sat_rte(v.s5);
+  r.s6 = convert_ulong_sat_rte(v.s6);
+  r.s7 = convert_ulong_sat_rte(v.s7);
+  r.s8 = convert_ulong_sat_rte(v.s8);
+  r.s9 = convert_ulong_sat_rte(v.s9);
+  r.sA = convert_ulong_sat_rte(v.sA);
+  r.sB = convert_ulong_sat_rte(v.sB);
+  r.sC = convert_ulong_sat_rte(v.sC);
+  r.sD = convert_ulong_sat_rte(v.sD);
+  r.sE = convert_ulong_sat_rte(v.sE);
+  r.sF = convert_ulong_sat_rte(v.sF);
+  return r;
+}
+
+/* long16 -> ulong16: each lane goes through the scalar convert_ulong_sat_rtz. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(long16 v) {
+  ulong16 r;
+  r.s0 = convert_ulong_sat_rtz(v.s0);
+  r.s1 = convert_ulong_sat_rtz(v.s1);
+  r.s2 = convert_ulong_sat_rtz(v.s2);
+  r.s3 = convert_ulong_sat_rtz(v.s3);
+  r.s4 = convert_ulong_sat_rtz(v.s4);
+  r.s5 = convert_ulong_sat_rtz(v.s5);
+  r.s6 = convert_ulong_sat_rtz(v.s6);
+  r.s7 = convert_ulong_sat_rtz(v.s7);
+  r.s8 = convert_ulong_sat_rtz(v.s8);
+  r.s9 = convert_ulong_sat_rtz(v.s9);
+  r.sA = convert_ulong_sat_rtz(v.sA);
+  r.sB = convert_ulong_sat_rtz(v.sB);
+  r.sC = convert_ulong_sat_rtz(v.sC);
+  r.sD = convert_ulong_sat_rtz(v.sD);
+  r.sE = convert_ulong_sat_rtz(v.sE);
+  r.sF = convert_ulong_sat_rtz(v.sF);
+  return r;
+}
+
+/* long16 -> ulong16: each lane goes through the scalar convert_ulong_sat_rtp. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(long16 v) {
+  ulong16 r;
+  r.s0 = convert_ulong_sat_rtp(v.s0);
+  r.s1 = convert_ulong_sat_rtp(v.s1);
+  r.s2 = convert_ulong_sat_rtp(v.s2);
+  r.s3 = convert_ulong_sat_rtp(v.s3);
+  r.s4 = convert_ulong_sat_rtp(v.s4);
+  r.s5 = convert_ulong_sat_rtp(v.s5);
+  r.s6 = convert_ulong_sat_rtp(v.s6);
+  r.s7 = convert_ulong_sat_rtp(v.s7);
+  r.s8 = convert_ulong_sat_rtp(v.s8);
+  r.s9 = convert_ulong_sat_rtp(v.s9);
+  r.sA = convert_ulong_sat_rtp(v.sA);
+  r.sB = convert_ulong_sat_rtp(v.sB);
+  r.sC = convert_ulong_sat_rtp(v.sC);
+  r.sD = convert_ulong_sat_rtp(v.sD);
+  r.sE = convert_ulong_sat_rtp(v.sE);
+  r.sF = convert_ulong_sat_rtp(v.sF);
+  return r;
+}
+
+/* long16 -> ulong16: each lane goes through the scalar convert_ulong_sat_rtn. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(long16 v) {
+  ulong16 r;
+  r.s0 = convert_ulong_sat_rtn(v.s0);
+  r.s1 = convert_ulong_sat_rtn(v.s1);
+  r.s2 = convert_ulong_sat_rtn(v.s2);
+  r.s3 = convert_ulong_sat_rtn(v.s3);
+  r.s4 = convert_ulong_sat_rtn(v.s4);
+  r.s5 = convert_ulong_sat_rtn(v.s5);
+  r.s6 = convert_ulong_sat_rtn(v.s6);
+  r.s7 = convert_ulong_sat_rtn(v.s7);
+  r.s8 = convert_ulong_sat_rtn(v.s8);
+  r.s9 = convert_ulong_sat_rtn(v.s9);
+  r.sA = convert_ulong_sat_rtn(v.sA);
+  r.sB = convert_ulong_sat_rtn(v.sB);
+  r.sC = convert_ulong_sat_rtn(v.sC);
+  r.sD = convert_ulong_sat_rtn(v.sD);
+  r.sE = convert_ulong_sat_rtn(v.sE);
+  r.sF = convert_ulong_sat_rtn(v.sF);
+  return r;
+}
+
+/* long16 -> int16: each lane goes through the scalar convert_int_sat_rte. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(long16 v) {
+  int16 r;
+  r.s0 = convert_int_sat_rte(v.s0);
+  r.s1 = convert_int_sat_rte(v.s1);
+  r.s2 = convert_int_sat_rte(v.s2);
+  r.s3 = convert_int_sat_rte(v.s3);
+  r.s4 = convert_int_sat_rte(v.s4);
+  r.s5 = convert_int_sat_rte(v.s5);
+  r.s6 = convert_int_sat_rte(v.s6);
+  r.s7 = convert_int_sat_rte(v.s7);
+  r.s8 = convert_int_sat_rte(v.s8);
+  r.s9 = convert_int_sat_rte(v.s9);
+  r.sA = convert_int_sat_rte(v.sA);
+  r.sB = convert_int_sat_rte(v.sB);
+  r.sC = convert_int_sat_rte(v.sC);
+  r.sD = convert_int_sat_rte(v.sD);
+  r.sE = convert_int_sat_rte(v.sE);
+  r.sF = convert_int_sat_rte(v.sF);
+  return r;
+}
+
+/* long16 -> int16: each lane goes through the scalar convert_int_sat_rtz. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(long16 v) {
+  int16 r;
+  r.s0 = convert_int_sat_rtz(v.s0);
+  r.s1 = convert_int_sat_rtz(v.s1);
+  r.s2 = convert_int_sat_rtz(v.s2);
+  r.s3 = convert_int_sat_rtz(v.s3);
+  r.s4 = convert_int_sat_rtz(v.s4);
+  r.s5 = convert_int_sat_rtz(v.s5);
+  r.s6 = convert_int_sat_rtz(v.s6);
+  r.s7 = convert_int_sat_rtz(v.s7);
+  r.s8 = convert_int_sat_rtz(v.s8);
+  r.s9 = convert_int_sat_rtz(v.s9);
+  r.sA = convert_int_sat_rtz(v.sA);
+  r.sB = convert_int_sat_rtz(v.sB);
+  r.sC = convert_int_sat_rtz(v.sC);
+  r.sD = convert_int_sat_rtz(v.sD);
+  r.sE = convert_int_sat_rtz(v.sE);
+  r.sF = convert_int_sat_rtz(v.sF);
+  return r;
+}
+
+/* long16 -> int16: each lane goes through the scalar convert_int_sat_rtp. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(long16 v) {
+  int16 r;
+  r.s0 = convert_int_sat_rtp(v.s0);
+  r.s1 = convert_int_sat_rtp(v.s1);
+  r.s2 = convert_int_sat_rtp(v.s2);
+  r.s3 = convert_int_sat_rtp(v.s3);
+  r.s4 = convert_int_sat_rtp(v.s4);
+  r.s5 = convert_int_sat_rtp(v.s5);
+  r.s6 = convert_int_sat_rtp(v.s6);
+  r.s7 = convert_int_sat_rtp(v.s7);
+  r.s8 = convert_int_sat_rtp(v.s8);
+  r.s9 = convert_int_sat_rtp(v.s9);
+  r.sA = convert_int_sat_rtp(v.sA);
+  r.sB = convert_int_sat_rtp(v.sB);
+  r.sC = convert_int_sat_rtp(v.sC);
+  r.sD = convert_int_sat_rtp(v.sD);
+  r.sE = convert_int_sat_rtp(v.sE);
+  r.sF = convert_int_sat_rtp(v.sF);
+  return r;
+}
+
+/* long16 -> int16: each lane goes through the scalar convert_int_sat_rtn. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(long16 v) {
+  int16 r;
+  r.s0 = convert_int_sat_rtn(v.s0);
+  r.s1 = convert_int_sat_rtn(v.s1);
+  r.s2 = convert_int_sat_rtn(v.s2);
+  r.s3 = convert_int_sat_rtn(v.s3);
+  r.s4 = convert_int_sat_rtn(v.s4);
+  r.s5 = convert_int_sat_rtn(v.s5);
+  r.s6 = convert_int_sat_rtn(v.s6);
+  r.s7 = convert_int_sat_rtn(v.s7);
+  r.s8 = convert_int_sat_rtn(v.s8);
+  r.s9 = convert_int_sat_rtn(v.s9);
+  r.sA = convert_int_sat_rtn(v.sA);
+  r.sB = convert_int_sat_rtn(v.sB);
+  r.sC = convert_int_sat_rtn(v.sC);
+  r.sD = convert_int_sat_rtn(v.sD);
+  r.sE = convert_int_sat_rtn(v.sE);
+  r.sF = convert_int_sat_rtn(v.sF);
+  return r;
+}
+
+/* long16 -> uint16: each lane goes through the scalar convert_uint_sat_rte. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(long16 v) {
+  uint16 r;
+  r.s0 = convert_uint_sat_rte(v.s0);
+  r.s1 = convert_uint_sat_rte(v.s1);
+  r.s2 = convert_uint_sat_rte(v.s2);
+  r.s3 = convert_uint_sat_rte(v.s3);
+  r.s4 = convert_uint_sat_rte(v.s4);
+  r.s5 = convert_uint_sat_rte(v.s5);
+  r.s6 = convert_uint_sat_rte(v.s6);
+  r.s7 = convert_uint_sat_rte(v.s7);
+  r.s8 = convert_uint_sat_rte(v.s8);
+  r.s9 = convert_uint_sat_rte(v.s9);
+  r.sA = convert_uint_sat_rte(v.sA);
+  r.sB = convert_uint_sat_rte(v.sB);
+  r.sC = convert_uint_sat_rte(v.sC);
+  r.sD = convert_uint_sat_rte(v.sD);
+  r.sE = convert_uint_sat_rte(v.sE);
+  r.sF = convert_uint_sat_rte(v.sF);
+  return r;
+}
+
+/* long16 -> uint16: each lane goes through the scalar convert_uint_sat_rtz. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(long16 v) {
+  uint16 r;
+  r.s0 = convert_uint_sat_rtz(v.s0);
+  r.s1 = convert_uint_sat_rtz(v.s1);
+  r.s2 = convert_uint_sat_rtz(v.s2);
+  r.s3 = convert_uint_sat_rtz(v.s3);
+  r.s4 = convert_uint_sat_rtz(v.s4);
+  r.s5 = convert_uint_sat_rtz(v.s5);
+  r.s6 = convert_uint_sat_rtz(v.s6);
+  r.s7 = convert_uint_sat_rtz(v.s7);
+  r.s8 = convert_uint_sat_rtz(v.s8);
+  r.s9 = convert_uint_sat_rtz(v.s9);
+  r.sA = convert_uint_sat_rtz(v.sA);
+  r.sB = convert_uint_sat_rtz(v.sB);
+  r.sC = convert_uint_sat_rtz(v.sC);
+  r.sD = convert_uint_sat_rtz(v.sD);
+  r.sE = convert_uint_sat_rtz(v.sE);
+  r.sF = convert_uint_sat_rtz(v.sF);
+  return r;
+}
+
+/* long16 -> uint16: each lane goes through the scalar convert_uint_sat_rtp. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(long16 v) {
+  uint16 r;
+  r.s0 = convert_uint_sat_rtp(v.s0);
+  r.s1 = convert_uint_sat_rtp(v.s1);
+  r.s2 = convert_uint_sat_rtp(v.s2);
+  r.s3 = convert_uint_sat_rtp(v.s3);
+  r.s4 = convert_uint_sat_rtp(v.s4);
+  r.s5 = convert_uint_sat_rtp(v.s5);
+  r.s6 = convert_uint_sat_rtp(v.s6);
+  r.s7 = convert_uint_sat_rtp(v.s7);
+  r.s8 = convert_uint_sat_rtp(v.s8);
+  r.s9 = convert_uint_sat_rtp(v.s9);
+  r.sA = convert_uint_sat_rtp(v.sA);
+  r.sB = convert_uint_sat_rtp(v.sB);
+  r.sC = convert_uint_sat_rtp(v.sC);
+  r.sD = convert_uint_sat_rtp(v.sD);
+  r.sE = convert_uint_sat_rtp(v.sE);
+  r.sF = convert_uint_sat_rtp(v.sF);
+  return r;
+}
+
+/* long16 -> uint16: each lane goes through the scalar convert_uint_sat_rtn. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(long16 v) {
+  uint16 r;
+  r.s0 = convert_uint_sat_rtn(v.s0);
+  r.s1 = convert_uint_sat_rtn(v.s1);
+  r.s2 = convert_uint_sat_rtn(v.s2);
+  r.s3 = convert_uint_sat_rtn(v.s3);
+  r.s4 = convert_uint_sat_rtn(v.s4);
+  r.s5 = convert_uint_sat_rtn(v.s5);
+  r.s6 = convert_uint_sat_rtn(v.s6);
+  r.s7 = convert_uint_sat_rtn(v.s7);
+  r.s8 = convert_uint_sat_rtn(v.s8);
+  r.s9 = convert_uint_sat_rtn(v.s9);
+  r.sA = convert_uint_sat_rtn(v.sA);
+  r.sB = convert_uint_sat_rtn(v.sB);
+  r.sC = convert_uint_sat_rtn(v.sC);
+  r.sD = convert_uint_sat_rtn(v.sD);
+  r.sE = convert_uint_sat_rtn(v.sE);
+  r.sF = convert_uint_sat_rtn(v.sF);
+  return r;
+}
+
+/* long16 -> short16: each lane goes through the scalar convert_short_sat_rte. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(long16 v) {
+  short16 r;
+  r.s0 = convert_short_sat_rte(v.s0);
+  r.s1 = convert_short_sat_rte(v.s1);
+  r.s2 = convert_short_sat_rte(v.s2);
+  r.s3 = convert_short_sat_rte(v.s3);
+  r.s4 = convert_short_sat_rte(v.s4);
+  r.s5 = convert_short_sat_rte(v.s5);
+  r.s6 = convert_short_sat_rte(v.s6);
+  r.s7 = convert_short_sat_rte(v.s7);
+  r.s8 = convert_short_sat_rte(v.s8);
+  r.s9 = convert_short_sat_rte(v.s9);
+  r.sA = convert_short_sat_rte(v.sA);
+  r.sB = convert_short_sat_rte(v.sB);
+  r.sC = convert_short_sat_rte(v.sC);
+  r.sD = convert_short_sat_rte(v.sD);
+  r.sE = convert_short_sat_rte(v.sE);
+  r.sF = convert_short_sat_rte(v.sF);
+  return r;
+}
+
+/* long16 -> short16: each lane goes through the scalar convert_short_sat_rtz. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(long16 v) {
+  short16 r;
+  r.s0 = convert_short_sat_rtz(v.s0);
+  r.s1 = convert_short_sat_rtz(v.s1);
+  r.s2 = convert_short_sat_rtz(v.s2);
+  r.s3 = convert_short_sat_rtz(v.s3);
+  r.s4 = convert_short_sat_rtz(v.s4);
+  r.s5 = convert_short_sat_rtz(v.s5);
+  r.s6 = convert_short_sat_rtz(v.s6);
+  r.s7 = convert_short_sat_rtz(v.s7);
+  r.s8 = convert_short_sat_rtz(v.s8);
+  r.s9 = convert_short_sat_rtz(v.s9);
+  r.sA = convert_short_sat_rtz(v.sA);
+  r.sB = convert_short_sat_rtz(v.sB);
+  r.sC = convert_short_sat_rtz(v.sC);
+  r.sD = convert_short_sat_rtz(v.sD);
+  r.sE = convert_short_sat_rtz(v.sE);
+  r.sF = convert_short_sat_rtz(v.sF);
+  return r;
+}
+
+/* long16 -> short16: each lane goes through the scalar convert_short_sat_rtp. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(long16 v) {
+  short16 r;
+  r.s0 = convert_short_sat_rtp(v.s0);
+  r.s1 = convert_short_sat_rtp(v.s1);
+  r.s2 = convert_short_sat_rtp(v.s2);
+  r.s3 = convert_short_sat_rtp(v.s3);
+  r.s4 = convert_short_sat_rtp(v.s4);
+  r.s5 = convert_short_sat_rtp(v.s5);
+  r.s6 = convert_short_sat_rtp(v.s6);
+  r.s7 = convert_short_sat_rtp(v.s7);
+  r.s8 = convert_short_sat_rtp(v.s8);
+  r.s9 = convert_short_sat_rtp(v.s9);
+  r.sA = convert_short_sat_rtp(v.sA);
+  r.sB = convert_short_sat_rtp(v.sB);
+  r.sC = convert_short_sat_rtp(v.sC);
+  r.sD = convert_short_sat_rtp(v.sD);
+  r.sE = convert_short_sat_rtp(v.sE);
+  r.sF = convert_short_sat_rtp(v.sF);
+  return r;
+}
+
+/* long16 -> short16: each lane goes through the scalar convert_short_sat_rtn. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(long16 v) {
+  short16 r;
+  r.s0 = convert_short_sat_rtn(v.s0);
+  r.s1 = convert_short_sat_rtn(v.s1);
+  r.s2 = convert_short_sat_rtn(v.s2);
+  r.s3 = convert_short_sat_rtn(v.s3);
+  r.s4 = convert_short_sat_rtn(v.s4);
+  r.s5 = convert_short_sat_rtn(v.s5);
+  r.s6 = convert_short_sat_rtn(v.s6);
+  r.s7 = convert_short_sat_rtn(v.s7);
+  r.s8 = convert_short_sat_rtn(v.s8);
+  r.s9 = convert_short_sat_rtn(v.s9);
+  r.sA = convert_short_sat_rtn(v.sA);
+  r.sB = convert_short_sat_rtn(v.sB);
+  r.sC = convert_short_sat_rtn(v.sC);
+  r.sD = convert_short_sat_rtn(v.sD);
+  r.sE = convert_short_sat_rtn(v.sE);
+  r.sF = convert_short_sat_rtn(v.sF);
+  return r;
+}
+
+/* long16 -> ushort16: each lane goes through the scalar convert_ushort_sat_rte.
+ * Written one lane per line so all sixteen conversions (s0..sF) stay on short lines. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(long16 v) {
+  ushort16 r;
+  r.s0 = convert_ushort_sat_rte(v.s0);
+  r.s1 = convert_ushort_sat_rte(v.s1);
+  r.s2 = convert_ushort_sat_rte(v.s2);
+  r.s3 = convert_ushort_sat_rte(v.s3);
+  r.s4 = convert_ushort_sat_rte(v.s4);
+  r.s5 = convert_ushort_sat_rte(v.s5);
+  r.s6 = convert_ushort_sat_rte(v.s6);
+  r.s7 = convert_ushort_sat_rte(v.s7);
+  r.s8 = convert_ushort_sat_rte(v.s8);
+  r.s9 = convert_ushort_sat_rte(v.s9);
+  r.sA = convert_ushort_sat_rte(v.sA);
+  r.sB = convert_ushort_sat_rte(v.sB);
+  r.sC = convert_ushort_sat_rte(v.sC);
+  r.sD = convert_ushort_sat_rte(v.sD);
+  r.sE = convert_ushort_sat_rte(v.sE);
+  r.sF = convert_ushort_sat_rte(v.sF);
+  return r;
+}
+
+/* long16 -> ushort16: each lane goes through the scalar convert_ushort_sat_rtz.
+ * Written one lane per line so all sixteen conversions (s0..sF) stay on short lines. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(long16 v) {
+  ushort16 r;
+  r.s0 = convert_ushort_sat_rtz(v.s0);
+  r.s1 = convert_ushort_sat_rtz(v.s1);
+  r.s2 = convert_ushort_sat_rtz(v.s2);
+  r.s3 = convert_ushort_sat_rtz(v.s3);
+  r.s4 = convert_ushort_sat_rtz(v.s4);
+  r.s5 = convert_ushort_sat_rtz(v.s5);
+  r.s6 = convert_ushort_sat_rtz(v.s6);
+  r.s7 = convert_ushort_sat_rtz(v.s7);
+  r.s8 = convert_ushort_sat_rtz(v.s8);
+  r.s9 = convert_ushort_sat_rtz(v.s9);
+  r.sA = convert_ushort_sat_rtz(v.sA);
+  r.sB = convert_ushort_sat_rtz(v.sB);
+  r.sC = convert_ushort_sat_rtz(v.sC);
+  r.sD = convert_ushort_sat_rtz(v.sD);
+  r.sE = convert_ushort_sat_rtz(v.sE);
+  r.sF = convert_ushort_sat_rtz(v.sF);
+  return r;
+}
+
+/* long16 -> ushort16: each lane goes through the scalar convert_ushort_sat_rtp.
+ * Written one lane per line so all sixteen conversions (s0..sF) stay on short lines. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(long16 v) {
+  ushort16 r;
+  r.s0 = convert_ushort_sat_rtp(v.s0);
+  r.s1 = convert_ushort_sat_rtp(v.s1);
+  r.s2 = convert_ushort_sat_rtp(v.s2);
+  r.s3 = convert_ushort_sat_rtp(v.s3);
+  r.s4 = convert_ushort_sat_rtp(v.s4);
+  r.s5 = convert_ushort_sat_rtp(v.s5);
+  r.s6 = convert_ushort_sat_rtp(v.s6);
+  r.s7 = convert_ushort_sat_rtp(v.s7);
+  r.s8 = convert_ushort_sat_rtp(v.s8);
+  r.s9 = convert_ushort_sat_rtp(v.s9);
+  r.sA = convert_ushort_sat_rtp(v.sA);
+  r.sB = convert_ushort_sat_rtp(v.sB);
+  r.sC = convert_ushort_sat_rtp(v.sC);
+  r.sD = convert_ushort_sat_rtp(v.sD);
+  r.sE = convert_ushort_sat_rtp(v.sE);
+  r.sF = convert_ushort_sat_rtp(v.sF);
+  return r;
+}
+
+/* long16 -> ushort16: each lane goes through the scalar convert_ushort_sat_rtn.
+ * Written one lane per line so all sixteen conversions (s0..sF) stay on short lines. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(long16 v) {
+  ushort16 r;
+  r.s0 = convert_ushort_sat_rtn(v.s0);
+  r.s1 = convert_ushort_sat_rtn(v.s1);
+  r.s2 = convert_ushort_sat_rtn(v.s2);
+  r.s3 = convert_ushort_sat_rtn(v.s3);
+  r.s4 = convert_ushort_sat_rtn(v.s4);
+  r.s5 = convert_ushort_sat_rtn(v.s5);
+  r.s6 = convert_ushort_sat_rtn(v.s6);
+  r.s7 = convert_ushort_sat_rtn(v.s7);
+  r.s8 = convert_ushort_sat_rtn(v.s8);
+  r.s9 = convert_ushort_sat_rtn(v.s9);
+  r.sA = convert_ushort_sat_rtn(v.sA);
+  r.sB = convert_ushort_sat_rtn(v.sB);
+  r.sC = convert_ushort_sat_rtn(v.sC);
+  r.sD = convert_ushort_sat_rtn(v.sD);
+  r.sE = convert_ushort_sat_rtn(v.sE);
+  r.sF = convert_ushort_sat_rtn(v.sF);
+  return r;
+}
+
+/* long16 -> char16: each lane goes through the scalar convert_char_sat_rte. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(long16 v) {
+  char16 r;
+  r.s0 = convert_char_sat_rte(v.s0);
+  r.s1 = convert_char_sat_rte(v.s1);
+  r.s2 = convert_char_sat_rte(v.s2);
+  r.s3 = convert_char_sat_rte(v.s3);
+  r.s4 = convert_char_sat_rte(v.s4);
+  r.s5 = convert_char_sat_rte(v.s5);
+  r.s6 = convert_char_sat_rte(v.s6);
+  r.s7 = convert_char_sat_rte(v.s7);
+  r.s8 = convert_char_sat_rte(v.s8);
+  r.s9 = convert_char_sat_rte(v.s9);
+  r.sA = convert_char_sat_rte(v.sA);
+  r.sB = convert_char_sat_rte(v.sB);
+  r.sC = convert_char_sat_rte(v.sC);
+  r.sD = convert_char_sat_rte(v.sD);
+  r.sE = convert_char_sat_rte(v.sE);
+  r.sF = convert_char_sat_rte(v.sF);
+  return r;
+}
+
+/* long16 -> char16: each lane goes through the scalar convert_char_sat_rtz. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(long16 v) {
+  char16 r;
+  r.s0 = convert_char_sat_rtz(v.s0);
+  r.s1 = convert_char_sat_rtz(v.s1);
+  r.s2 = convert_char_sat_rtz(v.s2);
+  r.s3 = convert_char_sat_rtz(v.s3);
+  r.s4 = convert_char_sat_rtz(v.s4);
+  r.s5 = convert_char_sat_rtz(v.s5);
+  r.s6 = convert_char_sat_rtz(v.s6);
+  r.s7 = convert_char_sat_rtz(v.s7);
+  r.s8 = convert_char_sat_rtz(v.s8);
+  r.s9 = convert_char_sat_rtz(v.s9);
+  r.sA = convert_char_sat_rtz(v.sA);
+  r.sB = convert_char_sat_rtz(v.sB);
+  r.sC = convert_char_sat_rtz(v.sC);
+  r.sD = convert_char_sat_rtz(v.sD);
+  r.sE = convert_char_sat_rtz(v.sE);
+  r.sF = convert_char_sat_rtz(v.sF);
+  return r;
+}
+
+/* long16 -> char16: each lane goes through the scalar convert_char_sat_rtp. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(long16 v) {
+  char16 r;
+  r.s0 = convert_char_sat_rtp(v.s0);
+  r.s1 = convert_char_sat_rtp(v.s1);
+  r.s2 = convert_char_sat_rtp(v.s2);
+  r.s3 = convert_char_sat_rtp(v.s3);
+  r.s4 = convert_char_sat_rtp(v.s4);
+  r.s5 = convert_char_sat_rtp(v.s5);
+  r.s6 = convert_char_sat_rtp(v.s6);
+  r.s7 = convert_char_sat_rtp(v.s7);
+  r.s8 = convert_char_sat_rtp(v.s8);
+  r.s9 = convert_char_sat_rtp(v.s9);
+  r.sA = convert_char_sat_rtp(v.sA);
+  r.sB = convert_char_sat_rtp(v.sB);
+  r.sC = convert_char_sat_rtp(v.sC);
+  r.sD = convert_char_sat_rtp(v.sD);
+  r.sE = convert_char_sat_rtp(v.sE);
+  r.sF = convert_char_sat_rtp(v.sF);
+  return r;
+}
+
+/* long16 -> char16: each lane goes through the scalar convert_char_sat_rtn. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(long16 v) {
+  char16 r;
+  r.s0 = convert_char_sat_rtn(v.s0);
+  r.s1 = convert_char_sat_rtn(v.s1);
+  r.s2 = convert_char_sat_rtn(v.s2);
+  r.s3 = convert_char_sat_rtn(v.s3);
+  r.s4 = convert_char_sat_rtn(v.s4);
+  r.s5 = convert_char_sat_rtn(v.s5);
+  r.s6 = convert_char_sat_rtn(v.s6);
+  r.s7 = convert_char_sat_rtn(v.s7);
+  r.s8 = convert_char_sat_rtn(v.s8);
+  r.s9 = convert_char_sat_rtn(v.s9);
+  r.sA = convert_char_sat_rtn(v.sA);
+  r.sB = convert_char_sat_rtn(v.sB);
+  r.sC = convert_char_sat_rtn(v.sC);
+  r.sD = convert_char_sat_rtn(v.sD);
+  r.sE = convert_char_sat_rtn(v.sE);
+  r.sF = convert_char_sat_rtn(v.sF);
+  return r;
+}
+
+/* long16 -> uchar16: each lane goes through the scalar convert_uchar_sat_rte. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(long16 v) {
+  uchar16 r;
+  r.s0 = convert_uchar_sat_rte(v.s0);
+  r.s1 = convert_uchar_sat_rte(v.s1);
+  r.s2 = convert_uchar_sat_rte(v.s2);
+  r.s3 = convert_uchar_sat_rte(v.s3);
+  r.s4 = convert_uchar_sat_rte(v.s4);
+  r.s5 = convert_uchar_sat_rte(v.s5);
+  r.s6 = convert_uchar_sat_rte(v.s6);
+  r.s7 = convert_uchar_sat_rte(v.s7);
+  r.s8 = convert_uchar_sat_rte(v.s8);
+  r.s9 = convert_uchar_sat_rte(v.s9);
+  r.sA = convert_uchar_sat_rte(v.sA);
+  r.sB = convert_uchar_sat_rte(v.sB);
+  r.sC = convert_uchar_sat_rte(v.sC);
+  r.sD = convert_uchar_sat_rte(v.sD);
+  r.sE = convert_uchar_sat_rte(v.sE);
+  r.sF = convert_uchar_sat_rte(v.sF);
+  return r;
+}
+
+/* long16 -> uchar16: each lane goes through the scalar convert_uchar_sat_rtz. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(long16 v) {
+  uchar16 r;
+  r.s0 = convert_uchar_sat_rtz(v.s0);
+  r.s1 = convert_uchar_sat_rtz(v.s1);
+  r.s2 = convert_uchar_sat_rtz(v.s2);
+  r.s3 = convert_uchar_sat_rtz(v.s3);
+  r.s4 = convert_uchar_sat_rtz(v.s4);
+  r.s5 = convert_uchar_sat_rtz(v.s5);
+  r.s6 = convert_uchar_sat_rtz(v.s6);
+  r.s7 = convert_uchar_sat_rtz(v.s7);
+  r.s8 = convert_uchar_sat_rtz(v.s8);
+  r.s9 = convert_uchar_sat_rtz(v.s9);
+  r.sA = convert_uchar_sat_rtz(v.sA);
+  r.sB = convert_uchar_sat_rtz(v.sB);
+  r.sC = convert_uchar_sat_rtz(v.sC);
+  r.sD = convert_uchar_sat_rtz(v.sD);
+  r.sE = convert_uchar_sat_rtz(v.sE);
+  r.sF = convert_uchar_sat_rtz(v.sF);
+  return r;
+}
+
+/* long16 -> uchar16: each lane goes through the scalar convert_uchar_sat_rtp. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(long16 v) {
+  uchar16 r;
+  r.s0 = convert_uchar_sat_rtp(v.s0);
+  r.s1 = convert_uchar_sat_rtp(v.s1);
+  r.s2 = convert_uchar_sat_rtp(v.s2);
+  r.s3 = convert_uchar_sat_rtp(v.s3);
+  r.s4 = convert_uchar_sat_rtp(v.s4);
+  r.s5 = convert_uchar_sat_rtp(v.s5);
+  r.s6 = convert_uchar_sat_rtp(v.s6);
+  r.s7 = convert_uchar_sat_rtp(v.s7);
+  r.s8 = convert_uchar_sat_rtp(v.s8);
+  r.s9 = convert_uchar_sat_rtp(v.s9);
+  r.sA = convert_uchar_sat_rtp(v.sA);
+  r.sB = convert_uchar_sat_rtp(v.sB);
+  r.sC = convert_uchar_sat_rtp(v.sC);
+  r.sD = convert_uchar_sat_rtp(v.sD);
+  r.sE = convert_uchar_sat_rtp(v.sE);
+  r.sF = convert_uchar_sat_rtp(v.sF);
+  return r;
+}
+
+/* long16 -> uchar16: each lane goes through the scalar convert_uchar_sat_rtn. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(long16 v) {
+  uchar16 r;
+  r.s0 = convert_uchar_sat_rtn(v.s0);
+  r.s1 = convert_uchar_sat_rtn(v.s1);
+  r.s2 = convert_uchar_sat_rtn(v.s2);
+  r.s3 = convert_uchar_sat_rtn(v.s3);
+  r.s4 = convert_uchar_sat_rtn(v.s4);
+  r.s5 = convert_uchar_sat_rtn(v.s5);
+  r.s6 = convert_uchar_sat_rtn(v.s6);
+  r.s7 = convert_uchar_sat_rtn(v.s7);
+  r.s8 = convert_uchar_sat_rtn(v.s8);
+  r.s9 = convert_uchar_sat_rtn(v.s9);
+  r.sA = convert_uchar_sat_rtn(v.sA);
+  r.sB = convert_uchar_sat_rtn(v.sB);
+  r.sC = convert_uchar_sat_rtn(v.sC);
+  r.sD = convert_uchar_sat_rtn(v.sD);
+  r.sE = convert_uchar_sat_rtn(v.sE);
+  r.sF = convert_uchar_sat_rtn(v.sF);
+  return r;
+}
+
+/* ulong16 -> long16: each lane goes through the scalar convert_long_sat_rte. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(ulong16 v) {
+  long16 r;
+  r.s0 = convert_long_sat_rte(v.s0);
+  r.s1 = convert_long_sat_rte(v.s1);
+  r.s2 = convert_long_sat_rte(v.s2);
+  r.s3 = convert_long_sat_rte(v.s3);
+  r.s4 = convert_long_sat_rte(v.s4);
+  r.s5 = convert_long_sat_rte(v.s5);
+  r.s6 = convert_long_sat_rte(v.s6);
+  r.s7 = convert_long_sat_rte(v.s7);
+  r.s8 = convert_long_sat_rte(v.s8);
+  r.s9 = convert_long_sat_rte(v.s9);
+  r.sA = convert_long_sat_rte(v.sA);
+  r.sB = convert_long_sat_rte(v.sB);
+  r.sC = convert_long_sat_rte(v.sC);
+  r.sD = convert_long_sat_rte(v.sD);
+  r.sE = convert_long_sat_rte(v.sE);
+  r.sF = convert_long_sat_rte(v.sF);
+  return r;
+}
+
+/* ulong16 -> long16: runs convert_long_sat_rtz on every component of v. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(ulong16 v) {
+  long16 r;
+  r.s0 = convert_long_sat_rtz(v.s0);
+  r.s1 = convert_long_sat_rtz(v.s1);
+  r.s2 = convert_long_sat_rtz(v.s2);
+  r.s3 = convert_long_sat_rtz(v.s3);
+  r.s4 = convert_long_sat_rtz(v.s4);
+  r.s5 = convert_long_sat_rtz(v.s5);
+  r.s6 = convert_long_sat_rtz(v.s6);
+  r.s7 = convert_long_sat_rtz(v.s7);
+  r.s8 = convert_long_sat_rtz(v.s8);
+  r.s9 = convert_long_sat_rtz(v.s9);
+  r.sA = convert_long_sat_rtz(v.sA);
+  r.sB = convert_long_sat_rtz(v.sB);
+  r.sC = convert_long_sat_rtz(v.sC);
+  r.sD = convert_long_sat_rtz(v.sD);
+  r.sE = convert_long_sat_rtz(v.sE);
+  r.sF = convert_long_sat_rtz(v.sF);
+  return r;
+}
+
+/* ulong16 -> long16: runs convert_long_sat_rtp on every component of v. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(ulong16 v) {
+  long16 r;
+  r.s0 = convert_long_sat_rtp(v.s0);
+  r.s1 = convert_long_sat_rtp(v.s1);
+  r.s2 = convert_long_sat_rtp(v.s2);
+  r.s3 = convert_long_sat_rtp(v.s3);
+  r.s4 = convert_long_sat_rtp(v.s4);
+  r.s5 = convert_long_sat_rtp(v.s5);
+  r.s6 = convert_long_sat_rtp(v.s6);
+  r.s7 = convert_long_sat_rtp(v.s7);
+  r.s8 = convert_long_sat_rtp(v.s8);
+  r.s9 = convert_long_sat_rtp(v.s9);
+  r.sA = convert_long_sat_rtp(v.sA);
+  r.sB = convert_long_sat_rtp(v.sB);
+  r.sC = convert_long_sat_rtp(v.sC);
+  r.sD = convert_long_sat_rtp(v.sD);
+  r.sE = convert_long_sat_rtp(v.sE);
+  r.sF = convert_long_sat_rtp(v.sF);
+  return r;
+}
+
+/* ulong16 -> long16: runs convert_long_sat_rtn on every component of v. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(ulong16 v) {
+  long16 r;
+  r.s0 = convert_long_sat_rtn(v.s0);
+  r.s1 = convert_long_sat_rtn(v.s1);
+  r.s2 = convert_long_sat_rtn(v.s2);
+  r.s3 = convert_long_sat_rtn(v.s3);
+  r.s4 = convert_long_sat_rtn(v.s4);
+  r.s5 = convert_long_sat_rtn(v.s5);
+  r.s6 = convert_long_sat_rtn(v.s6);
+  r.s7 = convert_long_sat_rtn(v.s7);
+  r.s8 = convert_long_sat_rtn(v.s8);
+  r.s9 = convert_long_sat_rtn(v.s9);
+  r.sA = convert_long_sat_rtn(v.sA);
+  r.sB = convert_long_sat_rtn(v.sB);
+  r.sC = convert_long_sat_rtn(v.sC);
+  r.sD = convert_long_sat_rtn(v.sD);
+  r.sE = convert_long_sat_rtn(v.sE);
+  r.sF = convert_long_sat_rtn(v.sF);
+  return r;
+}
+
+/* ulong16 -> ulong16: runs convert_ulong_sat_rte on every component of v. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(ulong16 v) {
+  ulong16 r;
+  r.s0 = convert_ulong_sat_rte(v.s0);
+  r.s1 = convert_ulong_sat_rte(v.s1);
+  r.s2 = convert_ulong_sat_rte(v.s2);
+  r.s3 = convert_ulong_sat_rte(v.s3);
+  r.s4 = convert_ulong_sat_rte(v.s4);
+  r.s5 = convert_ulong_sat_rte(v.s5);
+  r.s6 = convert_ulong_sat_rte(v.s6);
+  r.s7 = convert_ulong_sat_rte(v.s7);
+  r.s8 = convert_ulong_sat_rte(v.s8);
+  r.s9 = convert_ulong_sat_rte(v.s9);
+  r.sA = convert_ulong_sat_rte(v.sA);
+  r.sB = convert_ulong_sat_rte(v.sB);
+  r.sC = convert_ulong_sat_rte(v.sC);
+  r.sD = convert_ulong_sat_rte(v.sD);
+  r.sE = convert_ulong_sat_rte(v.sE);
+  r.sF = convert_ulong_sat_rte(v.sF);
+  return r;
+}
+
+/* ulong16 -> ulong16: runs convert_ulong_sat_rtz on every component of v. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(ulong16 v) {
+  ulong16 r;
+  r.s0 = convert_ulong_sat_rtz(v.s0);
+  r.s1 = convert_ulong_sat_rtz(v.s1);
+  r.s2 = convert_ulong_sat_rtz(v.s2);
+  r.s3 = convert_ulong_sat_rtz(v.s3);
+  r.s4 = convert_ulong_sat_rtz(v.s4);
+  r.s5 = convert_ulong_sat_rtz(v.s5);
+  r.s6 = convert_ulong_sat_rtz(v.s6);
+  r.s7 = convert_ulong_sat_rtz(v.s7);
+  r.s8 = convert_ulong_sat_rtz(v.s8);
+  r.s9 = convert_ulong_sat_rtz(v.s9);
+  r.sA = convert_ulong_sat_rtz(v.sA);
+  r.sB = convert_ulong_sat_rtz(v.sB);
+  r.sC = convert_ulong_sat_rtz(v.sC);
+  r.sD = convert_ulong_sat_rtz(v.sD);
+  r.sE = convert_ulong_sat_rtz(v.sE);
+  r.sF = convert_ulong_sat_rtz(v.sF);
+  return r;
+}
+
+/* ulong16 -> ulong16: runs convert_ulong_sat_rtp on every component of v. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(ulong16 v) {
+  ulong16 r;
+  r.s0 = convert_ulong_sat_rtp(v.s0);
+  r.s1 = convert_ulong_sat_rtp(v.s1);
+  r.s2 = convert_ulong_sat_rtp(v.s2);
+  r.s3 = convert_ulong_sat_rtp(v.s3);
+  r.s4 = convert_ulong_sat_rtp(v.s4);
+  r.s5 = convert_ulong_sat_rtp(v.s5);
+  r.s6 = convert_ulong_sat_rtp(v.s6);
+  r.s7 = convert_ulong_sat_rtp(v.s7);
+  r.s8 = convert_ulong_sat_rtp(v.s8);
+  r.s9 = convert_ulong_sat_rtp(v.s9);
+  r.sA = convert_ulong_sat_rtp(v.sA);
+  r.sB = convert_ulong_sat_rtp(v.sB);
+  r.sC = convert_ulong_sat_rtp(v.sC);
+  r.sD = convert_ulong_sat_rtp(v.sD);
+  r.sE = convert_ulong_sat_rtp(v.sE);
+  r.sF = convert_ulong_sat_rtp(v.sF);
+  return r;
+}
+
+/* ulong16 -> ulong16: runs convert_ulong_sat_rtn on every component of v. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(ulong16 v) {
+  ulong16 r;
+  r.s0 = convert_ulong_sat_rtn(v.s0);
+  r.s1 = convert_ulong_sat_rtn(v.s1);
+  r.s2 = convert_ulong_sat_rtn(v.s2);
+  r.s3 = convert_ulong_sat_rtn(v.s3);
+  r.s4 = convert_ulong_sat_rtn(v.s4);
+  r.s5 = convert_ulong_sat_rtn(v.s5);
+  r.s6 = convert_ulong_sat_rtn(v.s6);
+  r.s7 = convert_ulong_sat_rtn(v.s7);
+  r.s8 = convert_ulong_sat_rtn(v.s8);
+  r.s9 = convert_ulong_sat_rtn(v.s9);
+  r.sA = convert_ulong_sat_rtn(v.sA);
+  r.sB = convert_ulong_sat_rtn(v.sB);
+  r.sC = convert_ulong_sat_rtn(v.sC);
+  r.sD = convert_ulong_sat_rtn(v.sD);
+  r.sE = convert_ulong_sat_rtn(v.sE);
+  r.sF = convert_ulong_sat_rtn(v.sF);
+  return r;
+}
+
+/* ulong16 -> int16: runs convert_int_sat_rte on every component of v. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(ulong16 v) {
+  int16 r;
+  r.s0 = convert_int_sat_rte(v.s0);
+  r.s1 = convert_int_sat_rte(v.s1);
+  r.s2 = convert_int_sat_rte(v.s2);
+  r.s3 = convert_int_sat_rte(v.s3);
+  r.s4 = convert_int_sat_rte(v.s4);
+  r.s5 = convert_int_sat_rte(v.s5);
+  r.s6 = convert_int_sat_rte(v.s6);
+  r.s7 = convert_int_sat_rte(v.s7);
+  r.s8 = convert_int_sat_rte(v.s8);
+  r.s9 = convert_int_sat_rte(v.s9);
+  r.sA = convert_int_sat_rte(v.sA);
+  r.sB = convert_int_sat_rte(v.sB);
+  r.sC = convert_int_sat_rte(v.sC);
+  r.sD = convert_int_sat_rte(v.sD);
+  r.sE = convert_int_sat_rte(v.sE);
+  r.sF = convert_int_sat_rte(v.sF);
+  return r;
+}
+
+/* ulong16 -> int16: runs convert_int_sat_rtz on every component of v. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(ulong16 v) {
+  int16 r;
+  r.s0 = convert_int_sat_rtz(v.s0);
+  r.s1 = convert_int_sat_rtz(v.s1);
+  r.s2 = convert_int_sat_rtz(v.s2);
+  r.s3 = convert_int_sat_rtz(v.s3);
+  r.s4 = convert_int_sat_rtz(v.s4);
+  r.s5 = convert_int_sat_rtz(v.s5);
+  r.s6 = convert_int_sat_rtz(v.s6);
+  r.s7 = convert_int_sat_rtz(v.s7);
+  r.s8 = convert_int_sat_rtz(v.s8);
+  r.s9 = convert_int_sat_rtz(v.s9);
+  r.sA = convert_int_sat_rtz(v.sA);
+  r.sB = convert_int_sat_rtz(v.sB);
+  r.sC = convert_int_sat_rtz(v.sC);
+  r.sD = convert_int_sat_rtz(v.sD);
+  r.sE = convert_int_sat_rtz(v.sE);
+  r.sF = convert_int_sat_rtz(v.sF);
+  return r;
+}
+
+/* ulong16 -> int16: runs convert_int_sat_rtp on every component of v. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(ulong16 v) {
+  int16 r;
+  r.s0 = convert_int_sat_rtp(v.s0);
+  r.s1 = convert_int_sat_rtp(v.s1);
+  r.s2 = convert_int_sat_rtp(v.s2);
+  r.s3 = convert_int_sat_rtp(v.s3);
+  r.s4 = convert_int_sat_rtp(v.s4);
+  r.s5 = convert_int_sat_rtp(v.s5);
+  r.s6 = convert_int_sat_rtp(v.s6);
+  r.s7 = convert_int_sat_rtp(v.s7);
+  r.s8 = convert_int_sat_rtp(v.s8);
+  r.s9 = convert_int_sat_rtp(v.s9);
+  r.sA = convert_int_sat_rtp(v.sA);
+  r.sB = convert_int_sat_rtp(v.sB);
+  r.sC = convert_int_sat_rtp(v.sC);
+  r.sD = convert_int_sat_rtp(v.sD);
+  r.sE = convert_int_sat_rtp(v.sE);
+  r.sF = convert_int_sat_rtp(v.sF);
+  return r;
+}
+
+/* ulong16 -> int16: runs convert_int_sat_rtn on every component of v. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(ulong16 v) {
+  int16 r;
+  r.s0 = convert_int_sat_rtn(v.s0);
+  r.s1 = convert_int_sat_rtn(v.s1);
+  r.s2 = convert_int_sat_rtn(v.s2);
+  r.s3 = convert_int_sat_rtn(v.s3);
+  r.s4 = convert_int_sat_rtn(v.s4);
+  r.s5 = convert_int_sat_rtn(v.s5);
+  r.s6 = convert_int_sat_rtn(v.s6);
+  r.s7 = convert_int_sat_rtn(v.s7);
+  r.s8 = convert_int_sat_rtn(v.s8);
+  r.s9 = convert_int_sat_rtn(v.s9);
+  r.sA = convert_int_sat_rtn(v.sA);
+  r.sB = convert_int_sat_rtn(v.sB);
+  r.sC = convert_int_sat_rtn(v.sC);
+  r.sD = convert_int_sat_rtn(v.sD);
+  r.sE = convert_int_sat_rtn(v.sE);
+  r.sF = convert_int_sat_rtn(v.sF);
+  return r;
+}
+
+/* ulong16 -> uint16: runs convert_uint_sat_rte on every component of v. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(ulong16 v) {
+  uint16 r;
+  r.s0 = convert_uint_sat_rte(v.s0);
+  r.s1 = convert_uint_sat_rte(v.s1);
+  r.s2 = convert_uint_sat_rte(v.s2);
+  r.s3 = convert_uint_sat_rte(v.s3);
+  r.s4 = convert_uint_sat_rte(v.s4);
+  r.s5 = convert_uint_sat_rte(v.s5);
+  r.s6 = convert_uint_sat_rte(v.s6);
+  r.s7 = convert_uint_sat_rte(v.s7);
+  r.s8 = convert_uint_sat_rte(v.s8);
+  r.s9 = convert_uint_sat_rte(v.s9);
+  r.sA = convert_uint_sat_rte(v.sA);
+  r.sB = convert_uint_sat_rte(v.sB);
+  r.sC = convert_uint_sat_rte(v.sC);
+  r.sD = convert_uint_sat_rte(v.sD);
+  r.sE = convert_uint_sat_rte(v.sE);
+  r.sF = convert_uint_sat_rte(v.sF);
+  return r;
+}
+
+/* ulong16 -> uint16: runs convert_uint_sat_rtz on every component of v. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(ulong16 v) {
+  uint16 r;
+  r.s0 = convert_uint_sat_rtz(v.s0);
+  r.s1 = convert_uint_sat_rtz(v.s1);
+  r.s2 = convert_uint_sat_rtz(v.s2);
+  r.s3 = convert_uint_sat_rtz(v.s3);
+  r.s4 = convert_uint_sat_rtz(v.s4);
+  r.s5 = convert_uint_sat_rtz(v.s5);
+  r.s6 = convert_uint_sat_rtz(v.s6);
+  r.s7 = convert_uint_sat_rtz(v.s7);
+  r.s8 = convert_uint_sat_rtz(v.s8);
+  r.s9 = convert_uint_sat_rtz(v.s9);
+  r.sA = convert_uint_sat_rtz(v.sA);
+  r.sB = convert_uint_sat_rtz(v.sB);
+  r.sC = convert_uint_sat_rtz(v.sC);
+  r.sD = convert_uint_sat_rtz(v.sD);
+  r.sE = convert_uint_sat_rtz(v.sE);
+  r.sF = convert_uint_sat_rtz(v.sF);
+  return r;
+}
+
+/* ulong16 -> uint16: runs convert_uint_sat_rtp on every component of v. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(ulong16 v) {
+  uint16 r;
+  r.s0 = convert_uint_sat_rtp(v.s0);
+  r.s1 = convert_uint_sat_rtp(v.s1);
+  r.s2 = convert_uint_sat_rtp(v.s2);
+  r.s3 = convert_uint_sat_rtp(v.s3);
+  r.s4 = convert_uint_sat_rtp(v.s4);
+  r.s5 = convert_uint_sat_rtp(v.s5);
+  r.s6 = convert_uint_sat_rtp(v.s6);
+  r.s7 = convert_uint_sat_rtp(v.s7);
+  r.s8 = convert_uint_sat_rtp(v.s8);
+  r.s9 = convert_uint_sat_rtp(v.s9);
+  r.sA = convert_uint_sat_rtp(v.sA);
+  r.sB = convert_uint_sat_rtp(v.sB);
+  r.sC = convert_uint_sat_rtp(v.sC);
+  r.sD = convert_uint_sat_rtp(v.sD);
+  r.sE = convert_uint_sat_rtp(v.sE);
+  r.sF = convert_uint_sat_rtp(v.sF);
+  return r;
+}
+
+/* ulong16 -> uint16: runs convert_uint_sat_rtn on every component of v. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(ulong16 v) {
+  uint16 r;
+  r.s0 = convert_uint_sat_rtn(v.s0);
+  r.s1 = convert_uint_sat_rtn(v.s1);
+  r.s2 = convert_uint_sat_rtn(v.s2);
+  r.s3 = convert_uint_sat_rtn(v.s3);
+  r.s4 = convert_uint_sat_rtn(v.s4);
+  r.s5 = convert_uint_sat_rtn(v.s5);
+  r.s6 = convert_uint_sat_rtn(v.s6);
+  r.s7 = convert_uint_sat_rtn(v.s7);
+  r.s8 = convert_uint_sat_rtn(v.s8);
+  r.s9 = convert_uint_sat_rtn(v.s9);
+  r.sA = convert_uint_sat_rtn(v.sA);
+  r.sB = convert_uint_sat_rtn(v.sB);
+  r.sC = convert_uint_sat_rtn(v.sC);
+  r.sD = convert_uint_sat_rtn(v.sD);
+  r.sE = convert_uint_sat_rtn(v.sE);
+  r.sF = convert_uint_sat_rtn(v.sF);
+  return r;
+}
+
+/* ulong16 -> short16: runs convert_short_sat_rte on every component of v. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(ulong16 v) {
+  short16 r;
+  r.s0 = convert_short_sat_rte(v.s0);
+  r.s1 = convert_short_sat_rte(v.s1);
+  r.s2 = convert_short_sat_rte(v.s2);
+  r.s3 = convert_short_sat_rte(v.s3);
+  r.s4 = convert_short_sat_rte(v.s4);
+  r.s5 = convert_short_sat_rte(v.s5);
+  r.s6 = convert_short_sat_rte(v.s6);
+  r.s7 = convert_short_sat_rte(v.s7);
+  r.s8 = convert_short_sat_rte(v.s8);
+  r.s9 = convert_short_sat_rte(v.s9);
+  r.sA = convert_short_sat_rte(v.sA);
+  r.sB = convert_short_sat_rte(v.sB);
+  r.sC = convert_short_sat_rte(v.sC);
+  r.sD = convert_short_sat_rte(v.sD);
+  r.sE = convert_short_sat_rte(v.sE);
+  r.sF = convert_short_sat_rte(v.sF);
+  return r;
+}
+
+/* ulong16 -> short16: runs convert_short_sat_rtz on every component of v. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(ulong16 v) {
+  short16 r;
+  r.s0 = convert_short_sat_rtz(v.s0);
+  r.s1 = convert_short_sat_rtz(v.s1);
+  r.s2 = convert_short_sat_rtz(v.s2);
+  r.s3 = convert_short_sat_rtz(v.s3);
+  r.s4 = convert_short_sat_rtz(v.s4);
+  r.s5 = convert_short_sat_rtz(v.s5);
+  r.s6 = convert_short_sat_rtz(v.s6);
+  r.s7 = convert_short_sat_rtz(v.s7);
+  r.s8 = convert_short_sat_rtz(v.s8);
+  r.s9 = convert_short_sat_rtz(v.s9);
+  r.sA = convert_short_sat_rtz(v.sA);
+  r.sB = convert_short_sat_rtz(v.sB);
+  r.sC = convert_short_sat_rtz(v.sC);
+  r.sD = convert_short_sat_rtz(v.sD);
+  r.sE = convert_short_sat_rtz(v.sE);
+  r.sF = convert_short_sat_rtz(v.sF);
+  return r;
+}
+
+/* ulong16 -> short16: runs convert_short_sat_rtp on every component of v. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(ulong16 v) {
+  short16 r;
+  r.s0 = convert_short_sat_rtp(v.s0);
+  r.s1 = convert_short_sat_rtp(v.s1);
+  r.s2 = convert_short_sat_rtp(v.s2);
+  r.s3 = convert_short_sat_rtp(v.s3);
+  r.s4 = convert_short_sat_rtp(v.s4);
+  r.s5 = convert_short_sat_rtp(v.s5);
+  r.s6 = convert_short_sat_rtp(v.s6);
+  r.s7 = convert_short_sat_rtp(v.s7);
+  r.s8 = convert_short_sat_rtp(v.s8);
+  r.s9 = convert_short_sat_rtp(v.s9);
+  r.sA = convert_short_sat_rtp(v.sA);
+  r.sB = convert_short_sat_rtp(v.sB);
+  r.sC = convert_short_sat_rtp(v.sC);
+  r.sD = convert_short_sat_rtp(v.sD);
+  r.sE = convert_short_sat_rtp(v.sE);
+  r.sF = convert_short_sat_rtp(v.sF);
+  return r;
+}
+
+/* ulong16 -> short16: runs convert_short_sat_rtn on every component of v. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(ulong16 v) {
+  short16 r;
+  r.s0 = convert_short_sat_rtn(v.s0);
+  r.s1 = convert_short_sat_rtn(v.s1);
+  r.s2 = convert_short_sat_rtn(v.s2);
+  r.s3 = convert_short_sat_rtn(v.s3);
+  r.s4 = convert_short_sat_rtn(v.s4);
+  r.s5 = convert_short_sat_rtn(v.s5);
+  r.s6 = convert_short_sat_rtn(v.s6);
+  r.s7 = convert_short_sat_rtn(v.s7);
+  r.s8 = convert_short_sat_rtn(v.s8);
+  r.s9 = convert_short_sat_rtn(v.s9);
+  r.sA = convert_short_sat_rtn(v.sA);
+  r.sB = convert_short_sat_rtn(v.sB);
+  r.sC = convert_short_sat_rtn(v.sC);
+  r.sD = convert_short_sat_rtn(v.sD);
+  r.sE = convert_short_sat_rtn(v.sE);
+  r.sF = convert_short_sat_rtn(v.sF);
+  return r;
+}
+
+/* ulong16 -> ushort16: runs convert_ushort_sat_rte on every component of v. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(ulong16 v) {
+  ushort16 r;
+  r.s0 = convert_ushort_sat_rte(v.s0);
+  r.s1 = convert_ushort_sat_rte(v.s1);
+  r.s2 = convert_ushort_sat_rte(v.s2);
+  r.s3 = convert_ushort_sat_rte(v.s3);
+  r.s4 = convert_ushort_sat_rte(v.s4);
+  r.s5 = convert_ushort_sat_rte(v.s5);
+  r.s6 = convert_ushort_sat_rte(v.s6);
+  r.s7 = convert_ushort_sat_rte(v.s7);
+  r.s8 = convert_ushort_sat_rte(v.s8);
+  r.s9 = convert_ushort_sat_rte(v.s9);
+  r.sA = convert_ushort_sat_rte(v.sA);
+  r.sB = convert_ushort_sat_rte(v.sB);
+  r.sC = convert_ushort_sat_rte(v.sC);
+  r.sD = convert_ushort_sat_rte(v.sD);
+  r.sE = convert_ushort_sat_rte(v.sE);
+  r.sF = convert_ushort_sat_rte(v.sF);
+  return r;
+}
+
+/* ulong16 -> ushort16: runs convert_ushort_sat_rtz on every component of v. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(ulong16 v) {
+  ushort16 r;
+  r.s0 = convert_ushort_sat_rtz(v.s0);
+  r.s1 = convert_ushort_sat_rtz(v.s1);
+  r.s2 = convert_ushort_sat_rtz(v.s2);
+  r.s3 = convert_ushort_sat_rtz(v.s3);
+  r.s4 = convert_ushort_sat_rtz(v.s4);
+  r.s5 = convert_ushort_sat_rtz(v.s5);
+  r.s6 = convert_ushort_sat_rtz(v.s6);
+  r.s7 = convert_ushort_sat_rtz(v.s7);
+  r.s8 = convert_ushort_sat_rtz(v.s8);
+  r.s9 = convert_ushort_sat_rtz(v.s9);
+  r.sA = convert_ushort_sat_rtz(v.sA);
+  r.sB = convert_ushort_sat_rtz(v.sB);
+  r.sC = convert_ushort_sat_rtz(v.sC);
+  r.sD = convert_ushort_sat_rtz(v.sD);
+  r.sE = convert_ushort_sat_rtz(v.sE);
+  r.sF = convert_ushort_sat_rtz(v.sF);
+  return r;
+}
+
+/* ulong16 -> ushort16: runs convert_ushort_sat_rtp on every component of v. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(ulong16 v) {
+  ushort16 r;
+  r.s0 = convert_ushort_sat_rtp(v.s0);
+  r.s1 = convert_ushort_sat_rtp(v.s1);
+  r.s2 = convert_ushort_sat_rtp(v.s2);
+  r.s3 = convert_ushort_sat_rtp(v.s3);
+  r.s4 = convert_ushort_sat_rtp(v.s4);
+  r.s5 = convert_ushort_sat_rtp(v.s5);
+  r.s6 = convert_ushort_sat_rtp(v.s6);
+  r.s7 = convert_ushort_sat_rtp(v.s7);
+  r.s8 = convert_ushort_sat_rtp(v.s8);
+  r.s9 = convert_ushort_sat_rtp(v.s9);
+  r.sA = convert_ushort_sat_rtp(v.sA);
+  r.sB = convert_ushort_sat_rtp(v.sB);
+  r.sC = convert_ushort_sat_rtp(v.sC);
+  r.sD = convert_ushort_sat_rtp(v.sD);
+  r.sE = convert_ushort_sat_rtp(v.sE);
+  r.sF = convert_ushort_sat_rtp(v.sF);
+  return r;
+}
+
+/* ulong16 -> ushort16: runs convert_ushort_sat_rtn on every component of v. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(ulong16 v) {
+  ushort16 r;
+  r.s0 = convert_ushort_sat_rtn(v.s0);
+  r.s1 = convert_ushort_sat_rtn(v.s1);
+  r.s2 = convert_ushort_sat_rtn(v.s2);
+  r.s3 = convert_ushort_sat_rtn(v.s3);
+  r.s4 = convert_ushort_sat_rtn(v.s4);
+  r.s5 = convert_ushort_sat_rtn(v.s5);
+  r.s6 = convert_ushort_sat_rtn(v.s6);
+  r.s7 = convert_ushort_sat_rtn(v.s7);
+  r.s8 = convert_ushort_sat_rtn(v.s8);
+  r.s9 = convert_ushort_sat_rtn(v.s9);
+  r.sA = convert_ushort_sat_rtn(v.sA);
+  r.sB = convert_ushort_sat_rtn(v.sB);
+  r.sC = convert_ushort_sat_rtn(v.sC);
+  r.sD = convert_ushort_sat_rtn(v.sD);
+  r.sE = convert_ushort_sat_rtn(v.sE);
+  r.sF = convert_ushort_sat_rtn(v.sF);
+  return r;
+}
+
+/* ulong16 -> char16: runs convert_char_sat_rte on every component of v. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(ulong16 v) {
+  char16 r;
+  r.s0 = convert_char_sat_rte(v.s0);
+  r.s1 = convert_char_sat_rte(v.s1);
+  r.s2 = convert_char_sat_rte(v.s2);
+  r.s3 = convert_char_sat_rte(v.s3);
+  r.s4 = convert_char_sat_rte(v.s4);
+  r.s5 = convert_char_sat_rte(v.s5);
+  r.s6 = convert_char_sat_rte(v.s6);
+  r.s7 = convert_char_sat_rte(v.s7);
+  r.s8 = convert_char_sat_rte(v.s8);
+  r.s9 = convert_char_sat_rte(v.s9);
+  r.sA = convert_char_sat_rte(v.sA);
+  r.sB = convert_char_sat_rte(v.sB);
+  r.sC = convert_char_sat_rte(v.sC);
+  r.sD = convert_char_sat_rte(v.sD);
+  r.sE = convert_char_sat_rte(v.sE);
+  r.sF = convert_char_sat_rte(v.sF);
+  return r;
+}
+
+/* ulong16 -> char16: runs convert_char_sat_rtz on every component of v. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(ulong16 v) {
+  char16 r;
+  r.s0 = convert_char_sat_rtz(v.s0);
+  r.s1 = convert_char_sat_rtz(v.s1);
+  r.s2 = convert_char_sat_rtz(v.s2);
+  r.s3 = convert_char_sat_rtz(v.s3);
+  r.s4 = convert_char_sat_rtz(v.s4);
+  r.s5 = convert_char_sat_rtz(v.s5);
+  r.s6 = convert_char_sat_rtz(v.s6);
+  r.s7 = convert_char_sat_rtz(v.s7);
+  r.s8 = convert_char_sat_rtz(v.s8);
+  r.s9 = convert_char_sat_rtz(v.s9);
+  r.sA = convert_char_sat_rtz(v.sA);
+  r.sB = convert_char_sat_rtz(v.sB);
+  r.sC = convert_char_sat_rtz(v.sC);
+  r.sD = convert_char_sat_rtz(v.sD);
+  r.sE = convert_char_sat_rtz(v.sE);
+  r.sF = convert_char_sat_rtz(v.sF);
+  return r;
+}
+
+/* ulong16 -> char16: runs convert_char_sat_rtp on every component of v. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(ulong16 v) {
+  char16 r;
+  r.s0 = convert_char_sat_rtp(v.s0);
+  r.s1 = convert_char_sat_rtp(v.s1);
+  r.s2 = convert_char_sat_rtp(v.s2);
+  r.s3 = convert_char_sat_rtp(v.s3);
+  r.s4 = convert_char_sat_rtp(v.s4);
+  r.s5 = convert_char_sat_rtp(v.s5);
+  r.s6 = convert_char_sat_rtp(v.s6);
+  r.s7 = convert_char_sat_rtp(v.s7);
+  r.s8 = convert_char_sat_rtp(v.s8);
+  r.s9 = convert_char_sat_rtp(v.s9);
+  r.sA = convert_char_sat_rtp(v.sA);
+  r.sB = convert_char_sat_rtp(v.sB);
+  r.sC = convert_char_sat_rtp(v.sC);
+  r.sD = convert_char_sat_rtp(v.sD);
+  r.sE = convert_char_sat_rtp(v.sE);
+  r.sF = convert_char_sat_rtp(v.sF);
+  return r;
+}
+
+/* ulong16 -> char16: runs convert_char_sat_rtn on every component of v. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(ulong16 v) {
+  char16 r;
+  r.s0 = convert_char_sat_rtn(v.s0);
+  r.s1 = convert_char_sat_rtn(v.s1);
+  r.s2 = convert_char_sat_rtn(v.s2);
+  r.s3 = convert_char_sat_rtn(v.s3);
+  r.s4 = convert_char_sat_rtn(v.s4);
+  r.s5 = convert_char_sat_rtn(v.s5);
+  r.s6 = convert_char_sat_rtn(v.s6);
+  r.s7 = convert_char_sat_rtn(v.s7);
+  r.s8 = convert_char_sat_rtn(v.s8);
+  r.s9 = convert_char_sat_rtn(v.s9);
+  r.sA = convert_char_sat_rtn(v.sA);
+  r.sB = convert_char_sat_rtn(v.sB);
+  r.sC = convert_char_sat_rtn(v.sC);
+  r.sD = convert_char_sat_rtn(v.sD);
+  r.sE = convert_char_sat_rtn(v.sE);
+  r.sF = convert_char_sat_rtn(v.sF);
+  return r;
+}
+
+/* ulong16 -> uchar16: runs convert_uchar_sat_rte on every component of v. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(ulong16 v) {
+  uchar16 r;
+  r.s0 = convert_uchar_sat_rte(v.s0);
+  r.s1 = convert_uchar_sat_rte(v.s1);
+  r.s2 = convert_uchar_sat_rte(v.s2);
+  r.s3 = convert_uchar_sat_rte(v.s3);
+  r.s4 = convert_uchar_sat_rte(v.s4);
+  r.s5 = convert_uchar_sat_rte(v.s5);
+  r.s6 = convert_uchar_sat_rte(v.s6);
+  r.s7 = convert_uchar_sat_rte(v.s7);
+  r.s8 = convert_uchar_sat_rte(v.s8);
+  r.s9 = convert_uchar_sat_rte(v.s9);
+  r.sA = convert_uchar_sat_rte(v.sA);
+  r.sB = convert_uchar_sat_rte(v.sB);
+  r.sC = convert_uchar_sat_rte(v.sC);
+  r.sD = convert_uchar_sat_rte(v.sD);
+  r.sE = convert_uchar_sat_rte(v.sE);
+  r.sF = convert_uchar_sat_rte(v.sF);
+  return r;
+}
+
+/* ulong16 -> uchar16: runs convert_uchar_sat_rtz on every component of v. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(ulong16 v) {
+  uchar16 r;
+  r.s0 = convert_uchar_sat_rtz(v.s0);
+  r.s1 = convert_uchar_sat_rtz(v.s1);
+  r.s2 = convert_uchar_sat_rtz(v.s2);
+  r.s3 = convert_uchar_sat_rtz(v.s3);
+  r.s4 = convert_uchar_sat_rtz(v.s4);
+  r.s5 = convert_uchar_sat_rtz(v.s5);
+  r.s6 = convert_uchar_sat_rtz(v.s6);
+  r.s7 = convert_uchar_sat_rtz(v.s7);
+  r.s8 = convert_uchar_sat_rtz(v.s8);
+  r.s9 = convert_uchar_sat_rtz(v.s9);
+  r.sA = convert_uchar_sat_rtz(v.sA);
+  r.sB = convert_uchar_sat_rtz(v.sB);
+  r.sC = convert_uchar_sat_rtz(v.sC);
+  r.sD = convert_uchar_sat_rtz(v.sD);
+  r.sE = convert_uchar_sat_rtz(v.sE);
+  r.sF = convert_uchar_sat_rtz(v.sF);
+  return r;
+}
+
+/* ulong16 -> uchar16: runs convert_uchar_sat_rtp on every component of v. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(ulong16 v) {
+  uchar16 r;
+  r.s0 = convert_uchar_sat_rtp(v.s0);
+  r.s1 = convert_uchar_sat_rtp(v.s1);
+  r.s2 = convert_uchar_sat_rtp(v.s2);
+  r.s3 = convert_uchar_sat_rtp(v.s3);
+  r.s4 = convert_uchar_sat_rtp(v.s4);
+  r.s5 = convert_uchar_sat_rtp(v.s5);
+  r.s6 = convert_uchar_sat_rtp(v.s6);
+  r.s7 = convert_uchar_sat_rtp(v.s7);
+  r.s8 = convert_uchar_sat_rtp(v.s8);
+  r.s9 = convert_uchar_sat_rtp(v.s9);
+  r.sA = convert_uchar_sat_rtp(v.sA);
+  r.sB = convert_uchar_sat_rtp(v.sB);
+  r.sC = convert_uchar_sat_rtp(v.sC);
+  r.sD = convert_uchar_sat_rtp(v.sD);
+  r.sE = convert_uchar_sat_rtp(v.sE);
+  r.sF = convert_uchar_sat_rtp(v.sF);
+  return r;
+}
+
+/* ulong16 -> uchar16: runs convert_uchar_sat_rtn on every component of v. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(ulong16 v) {
+  uchar16 r;
+  r.s0 = convert_uchar_sat_rtn(v.s0);
+  r.s1 = convert_uchar_sat_rtn(v.s1);
+  r.s2 = convert_uchar_sat_rtn(v.s2);
+  r.s3 = convert_uchar_sat_rtn(v.s3);
+  r.s4 = convert_uchar_sat_rtn(v.s4);
+  r.s5 = convert_uchar_sat_rtn(v.s5);
+  r.s6 = convert_uchar_sat_rtn(v.s6);
+  r.s7 = convert_uchar_sat_rtn(v.s7);
+  r.s8 = convert_uchar_sat_rtn(v.s8);
+  r.s9 = convert_uchar_sat_rtn(v.s9);
+  r.sA = convert_uchar_sat_rtn(v.sA);
+  r.sB = convert_uchar_sat_rtn(v.sB);
+  r.sC = convert_uchar_sat_rtn(v.sC);
+  r.sD = convert_uchar_sat_rtn(v.sD);
+  r.sE = convert_uchar_sat_rtn(v.sE);
+  r.sF = convert_uchar_sat_rtn(v.sF);
+  return r;
+}
+
+/* int16 -> long16: runs convert_long_sat_rte on every component of v. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(int16 v) {
+  long16 r;
+  r.s0 = convert_long_sat_rte(v.s0);
+  r.s1 = convert_long_sat_rte(v.s1);
+  r.s2 = convert_long_sat_rte(v.s2);
+  r.s3 = convert_long_sat_rte(v.s3);
+  r.s4 = convert_long_sat_rte(v.s4);
+  r.s5 = convert_long_sat_rte(v.s5);
+  r.s6 = convert_long_sat_rte(v.s6);
+  r.s7 = convert_long_sat_rte(v.s7);
+  r.s8 = convert_long_sat_rte(v.s8);
+  r.s9 = convert_long_sat_rte(v.s9);
+  r.sA = convert_long_sat_rte(v.sA);
+  r.sB = convert_long_sat_rte(v.sB);
+  r.sC = convert_long_sat_rte(v.sC);
+  r.sD = convert_long_sat_rte(v.sD);
+  r.sE = convert_long_sat_rte(v.sE);
+  r.sF = convert_long_sat_rte(v.sF);
+  return r;
+}
+
+/* int16 -> long16: runs convert_long_sat_rtz on every component of v. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(int16 v) {
+  long16 r;
+  r.s0 = convert_long_sat_rtz(v.s0);
+  r.s1 = convert_long_sat_rtz(v.s1);
+  r.s2 = convert_long_sat_rtz(v.s2);
+  r.s3 = convert_long_sat_rtz(v.s3);
+  r.s4 = convert_long_sat_rtz(v.s4);
+  r.s5 = convert_long_sat_rtz(v.s5);
+  r.s6 = convert_long_sat_rtz(v.s6);
+  r.s7 = convert_long_sat_rtz(v.s7);
+  r.s8 = convert_long_sat_rtz(v.s8);
+  r.s9 = convert_long_sat_rtz(v.s9);
+  r.sA = convert_long_sat_rtz(v.sA);
+  r.sB = convert_long_sat_rtz(v.sB);
+  r.sC = convert_long_sat_rtz(v.sC);
+  r.sD = convert_long_sat_rtz(v.sD);
+  r.sE = convert_long_sat_rtz(v.sE);
+  r.sF = convert_long_sat_rtz(v.sF);
+  return r;
+}
+
+/* int16 -> long16: runs convert_long_sat_rtp on every component of v. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(int16 v) {
+  long16 r;
+  r.s0 = convert_long_sat_rtp(v.s0);
+  r.s1 = convert_long_sat_rtp(v.s1);
+  r.s2 = convert_long_sat_rtp(v.s2);
+  r.s3 = convert_long_sat_rtp(v.s3);
+  r.s4 = convert_long_sat_rtp(v.s4);
+  r.s5 = convert_long_sat_rtp(v.s5);
+  r.s6 = convert_long_sat_rtp(v.s6);
+  r.s7 = convert_long_sat_rtp(v.s7);
+  r.s8 = convert_long_sat_rtp(v.s8);
+  r.s9 = convert_long_sat_rtp(v.s9);
+  r.sA = convert_long_sat_rtp(v.sA);
+  r.sB = convert_long_sat_rtp(v.sB);
+  r.sC = convert_long_sat_rtp(v.sC);
+  r.sD = convert_long_sat_rtp(v.sD);
+  r.sE = convert_long_sat_rtp(v.sE);
+  r.sF = convert_long_sat_rtp(v.sF);
+  return r;
+}
+
+/* int16 -> long16: runs convert_long_sat_rtn on every component of v. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(int16 v) {
+  long16 r;
+  r.s0 = convert_long_sat_rtn(v.s0);
+  r.s1 = convert_long_sat_rtn(v.s1);
+  r.s2 = convert_long_sat_rtn(v.s2);
+  r.s3 = convert_long_sat_rtn(v.s3);
+  r.s4 = convert_long_sat_rtn(v.s4);
+  r.s5 = convert_long_sat_rtn(v.s5);
+  r.s6 = convert_long_sat_rtn(v.s6);
+  r.s7 = convert_long_sat_rtn(v.s7);
+  r.s8 = convert_long_sat_rtn(v.s8);
+  r.s9 = convert_long_sat_rtn(v.s9);
+  r.sA = convert_long_sat_rtn(v.sA);
+  r.sB = convert_long_sat_rtn(v.sB);
+  r.sC = convert_long_sat_rtn(v.sC);
+  r.sD = convert_long_sat_rtn(v.sD);
+  r.sE = convert_long_sat_rtn(v.sE);
+  r.sF = convert_long_sat_rtn(v.sF);
+  return r;
+}
+
+/* int16 -> ulong16: runs convert_ulong_sat_rte on every component of v. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(int16 v) {
+  ulong16 r;
+  r.s0 = convert_ulong_sat_rte(v.s0);
+  r.s1 = convert_ulong_sat_rte(v.s1);
+  r.s2 = convert_ulong_sat_rte(v.s2);
+  r.s3 = convert_ulong_sat_rte(v.s3);
+  r.s4 = convert_ulong_sat_rte(v.s4);
+  r.s5 = convert_ulong_sat_rte(v.s5);
+  r.s6 = convert_ulong_sat_rte(v.s6);
+  r.s7 = convert_ulong_sat_rte(v.s7);
+  r.s8 = convert_ulong_sat_rte(v.s8);
+  r.s9 = convert_ulong_sat_rte(v.s9);
+  r.sA = convert_ulong_sat_rte(v.sA);
+  r.sB = convert_ulong_sat_rte(v.sB);
+  r.sC = convert_ulong_sat_rte(v.sC);
+  r.sD = convert_ulong_sat_rte(v.sD);
+  r.sE = convert_ulong_sat_rte(v.sE);
+  r.sF = convert_ulong_sat_rte(v.sF);
+  return r;
+}
+
+/* int16 -> ulong16: runs convert_ulong_sat_rtz on every component of v. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(int16 v) {
+  ulong16 r;
+  r.s0 = convert_ulong_sat_rtz(v.s0);
+  r.s1 = convert_ulong_sat_rtz(v.s1);
+  r.s2 = convert_ulong_sat_rtz(v.s2);
+  r.s3 = convert_ulong_sat_rtz(v.s3);
+  r.s4 = convert_ulong_sat_rtz(v.s4);
+  r.s5 = convert_ulong_sat_rtz(v.s5);
+  r.s6 = convert_ulong_sat_rtz(v.s6);
+  r.s7 = convert_ulong_sat_rtz(v.s7);
+  r.s8 = convert_ulong_sat_rtz(v.s8);
+  r.s9 = convert_ulong_sat_rtz(v.s9);
+  r.sA = convert_ulong_sat_rtz(v.sA);
+  r.sB = convert_ulong_sat_rtz(v.sB);
+  r.sC = convert_ulong_sat_rtz(v.sC);
+  r.sD = convert_ulong_sat_rtz(v.sD);
+  r.sE = convert_ulong_sat_rtz(v.sE);
+  r.sF = convert_ulong_sat_rtz(v.sF);
+  return r;
+}
+
+/* int16 -> ulong16: runs convert_ulong_sat_rtp on every component of v. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(int16 v) {
+  ulong16 r;
+  r.s0 = convert_ulong_sat_rtp(v.s0);
+  r.s1 = convert_ulong_sat_rtp(v.s1);
+  r.s2 = convert_ulong_sat_rtp(v.s2);
+  r.s3 = convert_ulong_sat_rtp(v.s3);
+  r.s4 = convert_ulong_sat_rtp(v.s4);
+  r.s5 = convert_ulong_sat_rtp(v.s5);
+  r.s6 = convert_ulong_sat_rtp(v.s6);
+  r.s7 = convert_ulong_sat_rtp(v.s7);
+  r.s8 = convert_ulong_sat_rtp(v.s8);
+  r.s9 = convert_ulong_sat_rtp(v.s9);
+  r.sA = convert_ulong_sat_rtp(v.sA);
+  r.sB = convert_ulong_sat_rtp(v.sB);
+  r.sC = convert_ulong_sat_rtp(v.sC);
+  r.sD = convert_ulong_sat_rtp(v.sD);
+  r.sE = convert_ulong_sat_rtp(v.sE);
+  r.sF = convert_ulong_sat_rtp(v.sF);
+  return r;
+}
+
+/* int16 -> ulong16: runs convert_ulong_sat_rtn on every component of v. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(int16 v) {
+  ulong16 r;
+  r.s0 = convert_ulong_sat_rtn(v.s0);
+  r.s1 = convert_ulong_sat_rtn(v.s1);
+  r.s2 = convert_ulong_sat_rtn(v.s2);
+  r.s3 = convert_ulong_sat_rtn(v.s3);
+  r.s4 = convert_ulong_sat_rtn(v.s4);
+  r.s5 = convert_ulong_sat_rtn(v.s5);
+  r.s6 = convert_ulong_sat_rtn(v.s6);
+  r.s7 = convert_ulong_sat_rtn(v.s7);
+  r.s8 = convert_ulong_sat_rtn(v.s8);
+  r.s9 = convert_ulong_sat_rtn(v.s9);
+  r.sA = convert_ulong_sat_rtn(v.sA);
+  r.sB = convert_ulong_sat_rtn(v.sB);
+  r.sC = convert_ulong_sat_rtn(v.sC);
+  r.sD = convert_ulong_sat_rtn(v.sD);
+  r.sE = convert_ulong_sat_rtn(v.sE);
+  r.sF = convert_ulong_sat_rtn(v.sF);
+  return r;
+}
+
+/* int16 -> int16: runs convert_int_sat_rte on every component of v. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(int16 v) {
+  int16 r;
+  r.s0 = convert_int_sat_rte(v.s0);
+  r.s1 = convert_int_sat_rte(v.s1);
+  r.s2 = convert_int_sat_rte(v.s2);
+  r.s3 = convert_int_sat_rte(v.s3);
+  r.s4 = convert_int_sat_rte(v.s4);
+  r.s5 = convert_int_sat_rte(v.s5);
+  r.s6 = convert_int_sat_rte(v.s6);
+  r.s7 = convert_int_sat_rte(v.s7);
+  r.s8 = convert_int_sat_rte(v.s8);
+  r.s9 = convert_int_sat_rte(v.s9);
+  r.sA = convert_int_sat_rte(v.sA);
+  r.sB = convert_int_sat_rte(v.sB);
+  r.sC = convert_int_sat_rte(v.sC);
+  r.sD = convert_int_sat_rte(v.sD);
+  r.sE = convert_int_sat_rte(v.sE);
+  r.sF = convert_int_sat_rte(v.sF);
+  return r;
+}
+
+/* int16 -> int16: runs convert_int_sat_rtz on every component of v. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(int16 v) {
+  int16 r;
+  r.s0 = convert_int_sat_rtz(v.s0);
+  r.s1 = convert_int_sat_rtz(v.s1);
+  r.s2 = convert_int_sat_rtz(v.s2);
+  r.s3 = convert_int_sat_rtz(v.s3);
+  r.s4 = convert_int_sat_rtz(v.s4);
+  r.s5 = convert_int_sat_rtz(v.s5);
+  r.s6 = convert_int_sat_rtz(v.s6);
+  r.s7 = convert_int_sat_rtz(v.s7);
+  r.s8 = convert_int_sat_rtz(v.s8);
+  r.s9 = convert_int_sat_rtz(v.s9);
+  r.sA = convert_int_sat_rtz(v.sA);
+  r.sB = convert_int_sat_rtz(v.sB);
+  r.sC = convert_int_sat_rtz(v.sC);
+  r.sD = convert_int_sat_rtz(v.sD);
+  r.sE = convert_int_sat_rtz(v.sE);
+  r.sF = convert_int_sat_rtz(v.sF);
+  return r;
+}
+
+/* int16 -> int16: runs convert_int_sat_rtp on every component of v. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(int16 v) {
+  int16 r;
+  r.s0 = convert_int_sat_rtp(v.s0);
+  r.s1 = convert_int_sat_rtp(v.s1);
+  r.s2 = convert_int_sat_rtp(v.s2);
+  r.s3 = convert_int_sat_rtp(v.s3);
+  r.s4 = convert_int_sat_rtp(v.s4);
+  r.s5 = convert_int_sat_rtp(v.s5);
+  r.s6 = convert_int_sat_rtp(v.s6);
+  r.s7 = convert_int_sat_rtp(v.s7);
+  r.s8 = convert_int_sat_rtp(v.s8);
+  r.s9 = convert_int_sat_rtp(v.s9);
+  r.sA = convert_int_sat_rtp(v.sA);
+  r.sB = convert_int_sat_rtp(v.sB);
+  r.sC = convert_int_sat_rtp(v.sC);
+  r.sD = convert_int_sat_rtp(v.sD);
+  r.sE = convert_int_sat_rtp(v.sE);
+  r.sF = convert_int_sat_rtp(v.sF);
+  return r;
+}
+
+/* int16 -> int16: runs convert_int_sat_rtn on every component of v. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(int16 v) {
+  int16 r;
+  r.s0 = convert_int_sat_rtn(v.s0);
+  r.s1 = convert_int_sat_rtn(v.s1);
+  r.s2 = convert_int_sat_rtn(v.s2);
+  r.s3 = convert_int_sat_rtn(v.s3);
+  r.s4 = convert_int_sat_rtn(v.s4);
+  r.s5 = convert_int_sat_rtn(v.s5);
+  r.s6 = convert_int_sat_rtn(v.s6);
+  r.s7 = convert_int_sat_rtn(v.s7);
+  r.s8 = convert_int_sat_rtn(v.s8);
+  r.s9 = convert_int_sat_rtn(v.s9);
+  r.sA = convert_int_sat_rtn(v.sA);
+  r.sB = convert_int_sat_rtn(v.sB);
+  r.sC = convert_int_sat_rtn(v.sC);
+  r.sD = convert_int_sat_rtn(v.sD);
+  r.sE = convert_int_sat_rtn(v.sE);
+  r.sF = convert_int_sat_rtn(v.sF);
+  return r;
+}
+
+/* int16 -> uint16: runs convert_uint_sat_rte on every component of v. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(int16 v) {
+  uint16 r;
+  r.s0 = convert_uint_sat_rte(v.s0);
+  r.s1 = convert_uint_sat_rte(v.s1);
+  r.s2 = convert_uint_sat_rte(v.s2);
+  r.s3 = convert_uint_sat_rte(v.s3);
+  r.s4 = convert_uint_sat_rte(v.s4);
+  r.s5 = convert_uint_sat_rte(v.s5);
+  r.s6 = convert_uint_sat_rte(v.s6);
+  r.s7 = convert_uint_sat_rte(v.s7);
+  r.s8 = convert_uint_sat_rte(v.s8);
+  r.s9 = convert_uint_sat_rte(v.s9);
+  r.sA = convert_uint_sat_rte(v.sA);
+  r.sB = convert_uint_sat_rte(v.sB);
+  r.sC = convert_uint_sat_rte(v.sC);
+  r.sD = convert_uint_sat_rte(v.sD);
+  r.sE = convert_uint_sat_rte(v.sE);
+  r.sF = convert_uint_sat_rte(v.sF);
+  return r;
+}
+
+/* int16 -> uint16: runs convert_uint_sat_rtz on every component of v. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(int16 v) {
+  uint16 r;
+  r.s0 = convert_uint_sat_rtz(v.s0);
+  r.s1 = convert_uint_sat_rtz(v.s1);
+  r.s2 = convert_uint_sat_rtz(v.s2);
+  r.s3 = convert_uint_sat_rtz(v.s3);
+  r.s4 = convert_uint_sat_rtz(v.s4);
+  r.s5 = convert_uint_sat_rtz(v.s5);
+  r.s6 = convert_uint_sat_rtz(v.s6);
+  r.s7 = convert_uint_sat_rtz(v.s7);
+  r.s8 = convert_uint_sat_rtz(v.s8);
+  r.s9 = convert_uint_sat_rtz(v.s9);
+  r.sA = convert_uint_sat_rtz(v.sA);
+  r.sB = convert_uint_sat_rtz(v.sB);
+  r.sC = convert_uint_sat_rtz(v.sC);
+  r.sD = convert_uint_sat_rtz(v.sD);
+  r.sE = convert_uint_sat_rtz(v.sE);
+  r.sF = convert_uint_sat_rtz(v.sF);
+  return r;
+}
+
+/* int16 -> uint16: runs convert_uint_sat_rtp on every component of v. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(int16 v) {
+  uint16 r;
+  r.s0 = convert_uint_sat_rtp(v.s0);
+  r.s1 = convert_uint_sat_rtp(v.s1);
+  r.s2 = convert_uint_sat_rtp(v.s2);
+  r.s3 = convert_uint_sat_rtp(v.s3);
+  r.s4 = convert_uint_sat_rtp(v.s4);
+  r.s5 = convert_uint_sat_rtp(v.s5);
+  r.s6 = convert_uint_sat_rtp(v.s6);
+  r.s7 = convert_uint_sat_rtp(v.s7);
+  r.s8 = convert_uint_sat_rtp(v.s8);
+  r.s9 = convert_uint_sat_rtp(v.s9);
+  r.sA = convert_uint_sat_rtp(v.sA);
+  r.sB = convert_uint_sat_rtp(v.sB);
+  r.sC = convert_uint_sat_rtp(v.sC);
+  r.sD = convert_uint_sat_rtp(v.sD);
+  r.sE = convert_uint_sat_rtp(v.sE);
+  r.sF = convert_uint_sat_rtp(v.sF);
+  return r;
+}
+
+/* int16 -> uint16: runs convert_uint_sat_rtn on every component of v. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(int16 v) {
+  uint16 r;
+  r.s0 = convert_uint_sat_rtn(v.s0);
+  r.s1 = convert_uint_sat_rtn(v.s1);
+  r.s2 = convert_uint_sat_rtn(v.s2);
+  r.s3 = convert_uint_sat_rtn(v.s3);
+  r.s4 = convert_uint_sat_rtn(v.s4);
+  r.s5 = convert_uint_sat_rtn(v.s5);
+  r.s6 = convert_uint_sat_rtn(v.s6);
+  r.s7 = convert_uint_sat_rtn(v.s7);
+  r.s8 = convert_uint_sat_rtn(v.s8);
+  r.s9 = convert_uint_sat_rtn(v.s9);
+  r.sA = convert_uint_sat_rtn(v.sA);
+  r.sB = convert_uint_sat_rtn(v.sB);
+  r.sC = convert_uint_sat_rtn(v.sC);
+  r.sD = convert_uint_sat_rtn(v.sD);
+  r.sE = convert_uint_sat_rtn(v.sE);
+  r.sF = convert_uint_sat_rtn(v.sF);
+  return r;
+}
+
+/* int16 -> short16: runs convert_short_sat_rte on every component of v. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(int16 v) {
+  short16 r;
+  r.s0 = convert_short_sat_rte(v.s0);
+  r.s1 = convert_short_sat_rte(v.s1);
+  r.s2 = convert_short_sat_rte(v.s2);
+  r.s3 = convert_short_sat_rte(v.s3);
+  r.s4 = convert_short_sat_rte(v.s4);
+  r.s5 = convert_short_sat_rte(v.s5);
+  r.s6 = convert_short_sat_rte(v.s6);
+  r.s7 = convert_short_sat_rte(v.s7);
+  r.s8 = convert_short_sat_rte(v.s8);
+  r.s9 = convert_short_sat_rte(v.s9);
+  r.sA = convert_short_sat_rte(v.sA);
+  r.sB = convert_short_sat_rte(v.sB);
+  r.sC = convert_short_sat_rte(v.sC);
+  r.sD = convert_short_sat_rte(v.sD);
+  r.sE = convert_short_sat_rte(v.sE);
+  r.sF = convert_short_sat_rte(v.sF);
+  return r;
+}
+
+/* int16 -> short16: runs convert_short_sat_rtz on every component of v. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(int16 v) {
+  short16 r;
+  r.s0 = convert_short_sat_rtz(v.s0);
+  r.s1 = convert_short_sat_rtz(v.s1);
+  r.s2 = convert_short_sat_rtz(v.s2);
+  r.s3 = convert_short_sat_rtz(v.s3);
+  r.s4 = convert_short_sat_rtz(v.s4);
+  r.s5 = convert_short_sat_rtz(v.s5);
+  r.s6 = convert_short_sat_rtz(v.s6);
+  r.s7 = convert_short_sat_rtz(v.s7);
+  r.s8 = convert_short_sat_rtz(v.s8);
+  r.s9 = convert_short_sat_rtz(v.s9);
+  r.sA = convert_short_sat_rtz(v.sA);
+  r.sB = convert_short_sat_rtz(v.sB);
+  r.sC = convert_short_sat_rtz(v.sC);
+  r.sD = convert_short_sat_rtz(v.sD);
+  r.sE = convert_short_sat_rtz(v.sE);
+  r.sF = convert_short_sat_rtz(v.sF);
+  return r;
+}
+
+/* int16 -> short16: runs convert_short_sat_rtp on every component of v. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(int16 v) {
+  short16 r;
+  r.s0 = convert_short_sat_rtp(v.s0);
+  r.s1 = convert_short_sat_rtp(v.s1);
+  r.s2 = convert_short_sat_rtp(v.s2);
+  r.s3 = convert_short_sat_rtp(v.s3);
+  r.s4 = convert_short_sat_rtp(v.s4);
+  r.s5 = convert_short_sat_rtp(v.s5);
+  r.s6 = convert_short_sat_rtp(v.s6);
+  r.s7 = convert_short_sat_rtp(v.s7);
+  r.s8 = convert_short_sat_rtp(v.s8);
+  r.s9 = convert_short_sat_rtp(v.s9);
+  r.sA = convert_short_sat_rtp(v.sA);
+  r.sB = convert_short_sat_rtp(v.sB);
+  r.sC = convert_short_sat_rtp(v.sC);
+  r.sD = convert_short_sat_rtp(v.sD);
+  r.sE = convert_short_sat_rtp(v.sE);
+  r.sF = convert_short_sat_rtp(v.sF);
+  return r;
+}
+
+/* int16 -> short16: runs convert_short_sat_rtn on every component of v. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(int16 v) {
+  short16 r;
+  r.s0 = convert_short_sat_rtn(v.s0);
+  r.s1 = convert_short_sat_rtn(v.s1);
+  r.s2 = convert_short_sat_rtn(v.s2);
+  r.s3 = convert_short_sat_rtn(v.s3);
+  r.s4 = convert_short_sat_rtn(v.s4);
+  r.s5 = convert_short_sat_rtn(v.s5);
+  r.s6 = convert_short_sat_rtn(v.s6);
+  r.s7 = convert_short_sat_rtn(v.s7);
+  r.s8 = convert_short_sat_rtn(v.s8);
+  r.s9 = convert_short_sat_rtn(v.s9);
+  r.sA = convert_short_sat_rtn(v.sA);
+  r.sB = convert_short_sat_rtn(v.sB);
+  r.sC = convert_short_sat_rtn(v.sC);
+  r.sD = convert_short_sat_rtn(v.sD);
+  r.sE = convert_short_sat_rtn(v.sE);
+  r.sF = convert_short_sat_rtn(v.sF);
+  return r;
+}
+
+/* int16 -> ushort16: runs convert_ushort_sat_rte on every component of v. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(int16 v) {
+  ushort16 r;
+  r.s0 = convert_ushort_sat_rte(v.s0);
+  r.s1 = convert_ushort_sat_rte(v.s1);
+  r.s2 = convert_ushort_sat_rte(v.s2);
+  r.s3 = convert_ushort_sat_rte(v.s3);
+  r.s4 = convert_ushort_sat_rte(v.s4);
+  r.s5 = convert_ushort_sat_rte(v.s5);
+  r.s6 = convert_ushort_sat_rte(v.s6);
+  r.s7 = convert_ushort_sat_rte(v.s7);
+  r.s8 = convert_ushort_sat_rte(v.s8);
+  r.s9 = convert_ushort_sat_rte(v.s9);
+  r.sA = convert_ushort_sat_rte(v.sA);
+  r.sB = convert_ushort_sat_rte(v.sB);
+  r.sC = convert_ushort_sat_rte(v.sC);
+  r.sD = convert_ushort_sat_rte(v.sD);
+  r.sE = convert_ushort_sat_rte(v.sE);
+  r.sF = convert_ushort_sat_rte(v.sF);
+  return r;
+}
+
+/* int16 -> ushort16: runs convert_ushort_sat_rtz on every component of v. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(int16 v) {
+  ushort16 r;
+  r.s0 = convert_ushort_sat_rtz(v.s0);
+  r.s1 = convert_ushort_sat_rtz(v.s1);
+  r.s2 = convert_ushort_sat_rtz(v.s2);
+  r.s3 = convert_ushort_sat_rtz(v.s3);
+  r.s4 = convert_ushort_sat_rtz(v.s4);
+  r.s5 = convert_ushort_sat_rtz(v.s5);
+  r.s6 = convert_ushort_sat_rtz(v.s6);
+  r.s7 = convert_ushort_sat_rtz(v.s7);
+  r.s8 = convert_ushort_sat_rtz(v.s8);
+  r.s9 = convert_ushort_sat_rtz(v.s9);
+  r.sA = convert_ushort_sat_rtz(v.sA);
+  r.sB = convert_ushort_sat_rtz(v.sB);
+  r.sC = convert_ushort_sat_rtz(v.sC);
+  r.sD = convert_ushort_sat_rtz(v.sD);
+  r.sE = convert_ushort_sat_rtz(v.sE);
+  r.sF = convert_ushort_sat_rtz(v.sF);
+  return r;
+}
+
+/* int16 -> ushort16: runs convert_ushort_sat_rtp on every component of v. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(int16 v) {
+  ushort16 r;
+  r.s0 = convert_ushort_sat_rtp(v.s0);
+  r.s1 = convert_ushort_sat_rtp(v.s1);
+  r.s2 = convert_ushort_sat_rtp(v.s2);
+  r.s3 = convert_ushort_sat_rtp(v.s3);
+  r.s4 = convert_ushort_sat_rtp(v.s4);
+  r.s5 = convert_ushort_sat_rtp(v.s5);
+  r.s6 = convert_ushort_sat_rtp(v.s6);
+  r.s7 = convert_ushort_sat_rtp(v.s7);
+  r.s8 = convert_ushort_sat_rtp(v.s8);
+  r.s9 = convert_ushort_sat_rtp(v.s9);
+  r.sA = convert_ushort_sat_rtp(v.sA);
+  r.sB = convert_ushort_sat_rtp(v.sB);
+  r.sC = convert_ushort_sat_rtp(v.sC);
+  r.sD = convert_ushort_sat_rtp(v.sD);
+  r.sE = convert_ushort_sat_rtp(v.sE);
+  r.sF = convert_ushort_sat_rtp(v.sF);
+  return r;
+}
+
+/* int16 -> ushort16: runs convert_ushort_sat_rtn on every component of v. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(int16 v) {
+  ushort16 r;
+  r.s0 = convert_ushort_sat_rtn(v.s0);
+  r.s1 = convert_ushort_sat_rtn(v.s1);
+  r.s2 = convert_ushort_sat_rtn(v.s2);
+  r.s3 = convert_ushort_sat_rtn(v.s3);
+  r.s4 = convert_ushort_sat_rtn(v.s4);
+  r.s5 = convert_ushort_sat_rtn(v.s5);
+  r.s6 = convert_ushort_sat_rtn(v.s6);
+  r.s7 = convert_ushort_sat_rtn(v.s7);
+  r.s8 = convert_ushort_sat_rtn(v.s8);
+  r.s9 = convert_ushort_sat_rtn(v.s9);
+  r.sA = convert_ushort_sat_rtn(v.sA);
+  r.sB = convert_ushort_sat_rtn(v.sB);
+  r.sC = convert_ushort_sat_rtn(v.sC);
+  r.sD = convert_ushort_sat_rtn(v.sD);
+  r.sE = convert_ushort_sat_rtn(v.sE);
+  r.sF = convert_ushort_sat_rtn(v.sF);
+  return r;
+}
+
+/* int16 -> char16: runs convert_char_sat_rte on every component of v. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(int16 v) {
+  char16 r;
+  r.s0 = convert_char_sat_rte(v.s0);
+  r.s1 = convert_char_sat_rte(v.s1);
+  r.s2 = convert_char_sat_rte(v.s2);
+  r.s3 = convert_char_sat_rte(v.s3);
+  r.s4 = convert_char_sat_rte(v.s4);
+  r.s5 = convert_char_sat_rte(v.s5);
+  r.s6 = convert_char_sat_rte(v.s6);
+  r.s7 = convert_char_sat_rte(v.s7);
+  r.s8 = convert_char_sat_rte(v.s8);
+  r.s9 = convert_char_sat_rte(v.s9);
+  r.sA = convert_char_sat_rte(v.sA);
+  r.sB = convert_char_sat_rte(v.sB);
+  r.sC = convert_char_sat_rte(v.sC);
+  r.sD = convert_char_sat_rte(v.sD);
+  r.sE = convert_char_sat_rte(v.sE);
+  r.sF = convert_char_sat_rte(v.sF);
+  return r;
+}
+
+/* int16 -> char16: runs convert_char_sat_rtz on every component of v. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(int16 v) {
+  char16 r;
+  r.s0 = convert_char_sat_rtz(v.s0);
+  r.s1 = convert_char_sat_rtz(v.s1);
+  r.s2 = convert_char_sat_rtz(v.s2);
+  r.s3 = convert_char_sat_rtz(v.s3);
+  r.s4 = convert_char_sat_rtz(v.s4);
+  r.s5 = convert_char_sat_rtz(v.s5);
+  r.s6 = convert_char_sat_rtz(v.s6);
+  r.s7 = convert_char_sat_rtz(v.s7);
+  r.s8 = convert_char_sat_rtz(v.s8);
+  r.s9 = convert_char_sat_rtz(v.s9);
+  r.sA = convert_char_sat_rtz(v.sA);
+  r.sB = convert_char_sat_rtz(v.sB);
+  r.sC = convert_char_sat_rtz(v.sC);
+  r.sD = convert_char_sat_rtz(v.sD);
+  r.sE = convert_char_sat_rtz(v.sE);
+  r.sF = convert_char_sat_rtz(v.sF);
+  return r;
+}
+
+/* int16 -> char16: runs convert_char_sat_rtp on every component of v. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(int16 v) {
+  char16 r;
+  r.s0 = convert_char_sat_rtp(v.s0);
+  r.s1 = convert_char_sat_rtp(v.s1);
+  r.s2 = convert_char_sat_rtp(v.s2);
+  r.s3 = convert_char_sat_rtp(v.s3);
+  r.s4 = convert_char_sat_rtp(v.s4);
+  r.s5 = convert_char_sat_rtp(v.s5);
+  r.s6 = convert_char_sat_rtp(v.s6);
+  r.s7 = convert_char_sat_rtp(v.s7);
+  r.s8 = convert_char_sat_rtp(v.s8);
+  r.s9 = convert_char_sat_rtp(v.s9);
+  r.sA = convert_char_sat_rtp(v.sA);
+  r.sB = convert_char_sat_rtp(v.sB);
+  r.sC = convert_char_sat_rtp(v.sC);
+  r.sD = convert_char_sat_rtp(v.sD);
+  r.sE = convert_char_sat_rtp(v.sE);
+  r.sF = convert_char_sat_rtp(v.sF);
+  return r;
+}
+
+/* Lane-wise int16 -> char16: each component of v goes through the scalar
+ * convert_char_sat_rtn and the 16 results form the returned vector. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(int16 v) {
+  const char16 lanes = (char16)(
+    convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3),
+    convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7),
+    convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB),
+    convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* Lane-wise int16 -> uchar16: each component of v goes through the scalar
+ * convert_uchar_sat_rte and the 16 results form the returned vector. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(int16 v) {
+  const uchar16 lanes = (uchar16)(
+    convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3),
+    convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7),
+    convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB),
+    convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+  return lanes;
+}
+
+/* Lane-wise int16 -> uchar16: each component of v goes through the scalar
+ * convert_uchar_sat_rtz and the 16 results form the returned vector. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(int16 v) {
+  const uchar16 lanes = (uchar16)(
+    convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3),
+    convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7),
+    convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB),
+    convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* Lane-wise int16 -> uchar16: each component of v goes through the scalar
+ * convert_uchar_sat_rtp and the 16 results form the returned vector. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(int16 v) {
+  const uchar16 lanes = (uchar16)(
+    convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3),
+    convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7),
+    convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB),
+    convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* Lane-wise int16 -> uchar16: each component of v goes through the scalar
+ * convert_uchar_sat_rtn and the 16 results form the returned vector. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(int16 v) {
+  const uchar16 lanes = (uchar16)(
+    convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3),
+    convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7),
+    convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB),
+    convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> long16: each component of v goes through the scalar
+ * convert_long_sat_rte and the 16 results form the returned vector. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(uint16 v) {
+  const long16 lanes = (long16)(
+    convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3),
+    convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7),
+    convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB),
+    convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> long16: each component of v goes through the scalar
+ * convert_long_sat_rtz and the 16 results form the returned vector. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(uint16 v) {
+  const long16 lanes = (long16)(
+    convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3),
+    convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7),
+    convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB),
+    convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> long16: each component of v goes through the scalar
+ * convert_long_sat_rtp and the 16 results form the returned vector. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(uint16 v) {
+  const long16 lanes = (long16)(
+    convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3),
+    convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7),
+    convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB),
+    convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> long16: each component of v goes through the scalar
+ * convert_long_sat_rtn and the 16 results form the returned vector. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(uint16 v) {
+  const long16 lanes = (long16)(
+    convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3),
+    convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7),
+    convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB),
+    convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> ulong16: each component of v goes through the scalar
+ * convert_ulong_sat_rte and the 16 results form the returned vector. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(uint16 v) {
+  const ulong16 lanes = (ulong16)(
+    convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3),
+    convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7),
+    convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB),
+    convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> ulong16: each component of v goes through the scalar
+ * convert_ulong_sat_rtz and the 16 results form the returned vector. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(uint16 v) {
+  const ulong16 lanes = (ulong16)(
+    convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3),
+    convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7),
+    convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB),
+    convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> ulong16: each component of v goes through the scalar
+ * convert_ulong_sat_rtp and the 16 results form the returned vector. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(uint16 v) {
+  const ulong16 lanes = (ulong16)(
+    convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3),
+    convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7),
+    convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB),
+    convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> ulong16: each component of v goes through the scalar
+ * convert_ulong_sat_rtn and the 16 results form the returned vector. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(uint16 v) {
+  const ulong16 lanes = (ulong16)(
+    convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3),
+    convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7),
+    convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB),
+    convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> int16: each component of v goes through the scalar
+ * convert_int_sat_rte and the 16 results form the returned vector. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(uint16 v) {
+  const int16 lanes = (int16)(
+    convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3),
+    convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7),
+    convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB),
+    convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> int16: each component of v goes through the scalar
+ * convert_int_sat_rtz and the 16 results form the returned vector. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(uint16 v) {
+  const int16 lanes = (int16)(
+    convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3),
+    convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7),
+    convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB),
+    convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> int16: each component of v goes through the scalar
+ * convert_int_sat_rtp and the 16 results form the returned vector. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(uint16 v) {
+  const int16 lanes = (int16)(
+    convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3),
+    convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7),
+    convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB),
+    convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> int16: each component of v goes through the scalar
+ * convert_int_sat_rtn and the 16 results form the returned vector. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(uint16 v) {
+  const int16 lanes = (int16)(
+    convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3),
+    convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7),
+    convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB),
+    convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> uint16: each component of v goes through the scalar
+ * convert_uint_sat_rte and the 16 results form the returned vector. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(uint16 v) {
+  const uint16 lanes = (uint16)(
+    convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3),
+    convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7),
+    convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB),
+    convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> uint16: each component of v goes through the scalar
+ * convert_uint_sat_rtz and the 16 results form the returned vector. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(uint16 v) {
+  const uint16 lanes = (uint16)(
+    convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3),
+    convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7),
+    convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB),
+    convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> uint16: each component of v goes through the scalar
+ * convert_uint_sat_rtp and the 16 results form the returned vector. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(uint16 v) {
+  const uint16 lanes = (uint16)(
+    convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3),
+    convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7),
+    convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB),
+    convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> uint16: each component of v goes through the scalar
+ * convert_uint_sat_rtn and the 16 results form the returned vector. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(uint16 v) {
+  const uint16 lanes = (uint16)(
+    convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3),
+    convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7),
+    convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB),
+    convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> short16: each component of v goes through the scalar
+ * convert_short_sat_rte and the 16 results form the returned vector. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(uint16 v) {
+  const short16 lanes = (short16)(
+    convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3),
+    convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7),
+    convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB),
+    convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> short16: each component of v goes through the scalar
+ * convert_short_sat_rtz and the 16 results form the returned vector. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(uint16 v) {
+  const short16 lanes = (short16)(
+    convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3),
+    convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7),
+    convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB),
+    convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> short16: each component of v goes through the scalar
+ * convert_short_sat_rtp and the 16 results form the returned vector. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(uint16 v) {
+  const short16 lanes = (short16)(
+    convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3),
+    convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7),
+    convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB),
+    convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> short16: each component of v goes through the scalar
+ * convert_short_sat_rtn and the 16 results form the returned vector. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(uint16 v) {
+  const short16 lanes = (short16)(
+    convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3),
+    convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7),
+    convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB),
+    convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> ushort16: each component of v goes through the scalar
+ * convert_ushort_sat_rte and the 16 results form the returned vector. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(uint16 v) {
+  const ushort16 lanes = (ushort16)(
+    convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3),
+    convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7),
+    convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB),
+    convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> ushort16: each component of v goes through the scalar
+ * convert_ushort_sat_rtz and the 16 results form the returned vector. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(uint16 v) {
+  const ushort16 lanes = (ushort16)(
+    convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3),
+    convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7),
+    convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB),
+    convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> ushort16: each component of v goes through the scalar
+ * convert_ushort_sat_rtp and the 16 results form the returned vector. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(uint16 v) {
+  const ushort16 lanes = (ushort16)(
+    convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3),
+    convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7),
+    convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB),
+    convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> ushort16: each component of v goes through the scalar
+ * convert_ushort_sat_rtn and the 16 results form the returned vector. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(uint16 v) {
+  const ushort16 lanes = (ushort16)(
+    convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3),
+    convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7),
+    convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB),
+    convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> char16: each component of v goes through the scalar
+ * convert_char_sat_rte and the 16 results form the returned vector. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(uint16 v) {
+  const char16 lanes = (char16)(
+    convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3),
+    convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7),
+    convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB),
+    convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> char16: each component of v goes through the scalar
+ * convert_char_sat_rtz and the 16 results form the returned vector. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(uint16 v) {
+  const char16 lanes = (char16)(
+    convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3),
+    convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7),
+    convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB),
+    convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> char16: each component of v goes through the scalar
+ * convert_char_sat_rtp and the 16 results form the returned vector. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(uint16 v) {
+  const char16 lanes = (char16)(
+    convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3),
+    convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7),
+    convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB),
+    convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> char16: each component of v goes through the scalar
+ * convert_char_sat_rtn and the 16 results form the returned vector. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(uint16 v) {
+  const char16 lanes = (char16)(
+    convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3),
+    convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7),
+    convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB),
+    convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> uchar16: each component of v goes through the scalar
+ * convert_uchar_sat_rte and the 16 results form the returned vector. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(uint16 v) {
+  const uchar16 lanes = (uchar16)(
+    convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3),
+    convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7),
+    convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB),
+    convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> uchar16: each component of v goes through the scalar
+ * convert_uchar_sat_rtz and the 16 results form the returned vector. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(uint16 v) {
+  const uchar16 lanes = (uchar16)(
+    convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3),
+    convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7),
+    convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB),
+    convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> uchar16: each component of v goes through the scalar
+ * convert_uchar_sat_rtp and the 16 results form the returned vector. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(uint16 v) {
+  const uchar16 lanes = (uchar16)(
+    convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3),
+    convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7),
+    convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB),
+    convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* Lane-wise uint16 -> uchar16: each component of v goes through the scalar
+ * convert_uchar_sat_rtn and the 16 results form the returned vector. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(uint16 v) {
+  const uchar16 lanes = (uchar16)(
+    convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3),
+    convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7),
+    convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB),
+    convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> long16: each component of v goes through the scalar
+ * convert_long_sat_rte and the 16 results form the returned vector. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(short16 v) {
+  const long16 lanes = (long16)(
+    convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3),
+    convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7),
+    convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB),
+    convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> long16: each component of v goes through the scalar
+ * convert_long_sat_rtz and the 16 results form the returned vector. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(short16 v) {
+  const long16 lanes = (long16)(
+    convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3),
+    convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7),
+    convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB),
+    convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> long16: each component of v goes through the scalar
+ * convert_long_sat_rtp and the 16 results form the returned vector. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(short16 v) {
+  const long16 lanes = (long16)(
+    convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3),
+    convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7),
+    convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB),
+    convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> long16: each component of v goes through the scalar
+ * convert_long_sat_rtn and the 16 results form the returned vector. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(short16 v) {
+  const long16 lanes = (long16)(
+    convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3),
+    convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7),
+    convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB),
+    convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> ulong16: each component of v goes through the scalar
+ * convert_ulong_sat_rte and the 16 results form the returned vector. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(short16 v) {
+  const ulong16 lanes = (ulong16)(
+    convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3),
+    convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7),
+    convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB),
+    convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> ulong16: each component of v goes through the scalar
+ * convert_ulong_sat_rtz and the 16 results form the returned vector. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(short16 v) {
+  const ulong16 lanes = (ulong16)(
+    convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3),
+    convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7),
+    convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB),
+    convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> ulong16: each component of v goes through the scalar
+ * convert_ulong_sat_rtp and the 16 results form the returned vector. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(short16 v) {
+  const ulong16 lanes = (ulong16)(
+    convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3),
+    convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7),
+    convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB),
+    convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> ulong16: each component of v goes through the scalar
+ * convert_ulong_sat_rtn and the 16 results form the returned vector. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(short16 v) {
+  const ulong16 lanes = (ulong16)(
+    convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3),
+    convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7),
+    convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB),
+    convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> int16: each component of v goes through the scalar
+ * convert_int_sat_rte and the 16 results form the returned vector. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(short16 v) {
+  const int16 lanes = (int16)(
+    convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3),
+    convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7),
+    convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB),
+    convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> int16: each component of v goes through the scalar
+ * convert_int_sat_rtz and the 16 results form the returned vector. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(short16 v) {
+  const int16 lanes = (int16)(
+    convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3),
+    convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7),
+    convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB),
+    convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> int16: each component of v goes through the scalar
+ * convert_int_sat_rtp and the 16 results form the returned vector. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(short16 v) {
+  const int16 lanes = (int16)(
+    convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3),
+    convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7),
+    convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB),
+    convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> int16: each component of v goes through the scalar
+ * convert_int_sat_rtn and the 16 results form the returned vector. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(short16 v) {
+  const int16 lanes = (int16)(
+    convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3),
+    convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7),
+    convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB),
+    convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> uint16: each component of v goes through the scalar
+ * convert_uint_sat_rte and the 16 results form the returned vector. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(short16 v) {
+  const uint16 lanes = (uint16)(
+    convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3),
+    convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7),
+    convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB),
+    convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> uint16: each component of v goes through the scalar
+ * convert_uint_sat_rtz and the 16 results form the returned vector. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(short16 v) {
+  const uint16 lanes = (uint16)(
+    convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3),
+    convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7),
+    convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB),
+    convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> uint16: each component of v goes through the scalar
+ * convert_uint_sat_rtp and the 16 results form the returned vector. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(short16 v) {
+  const uint16 lanes = (uint16)(
+    convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3),
+    convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7),
+    convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB),
+    convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> uint16: each component of v goes through the scalar
+ * convert_uint_sat_rtn and the 16 results form the returned vector. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(short16 v) {
+  const uint16 lanes = (uint16)(
+    convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3),
+    convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7),
+    convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB),
+    convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> short16: each component of v goes through the scalar
+ * convert_short_sat_rte and the 16 results form the returned vector. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(short16 v) {
+  const short16 lanes = (short16)(
+    convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3),
+    convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7),
+    convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB),
+    convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> short16: each component of v goes through the scalar
+ * convert_short_sat_rtz and the 16 results form the returned vector. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(short16 v) {
+  const short16 lanes = (short16)(
+    convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3),
+    convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7),
+    convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB),
+    convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> short16: each component of v goes through the scalar
+ * convert_short_sat_rtp and the 16 results form the returned vector. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(short16 v) {
+  const short16 lanes = (short16)(
+    convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3),
+    convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7),
+    convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB),
+    convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> short16: each component of v goes through the scalar
+ * convert_short_sat_rtn and the 16 results form the returned vector. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(short16 v) {
+  const short16 lanes = (short16)(
+    convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3),
+    convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7),
+    convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB),
+    convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> ushort16: each component of v goes through the scalar
+ * convert_ushort_sat_rte and the 16 results form the returned vector. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(short16 v) {
+  const ushort16 lanes = (ushort16)(
+    convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3),
+    convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7),
+    convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB),
+    convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte(v.sF));
+  return lanes;
+}
+
+/* Lane-wise short16 -> ushort16: each component of v goes through the scalar
+ * convert_ushort_sat_rtz and the 16 results form the returned vector. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(short16 v) {
+  const ushort16 lanes = (ushort16)(
+    convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3),
+    convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7),
+    convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB),
+    convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz(v.sF));
+  return lanes;
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(short16 v) {
+  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(short16 v) {
+  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(short16 v) {
+  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(short16 v) {
+  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(short16 v) {
+  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(short16 v) {
+  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(short16 v) {
+  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(short16 v) {
+  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(short16 v) {
+  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(short16 v) {
+  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(ushort16 v) {
+  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(ushort16 v) {
+  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(ushort16 v) {
+  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(ushort16 v) {
+  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(ushort16 v) {
+  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(ushort16 v) {
+  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(ushort16 v) {
+  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(ushort16 v) {
+  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(ushort16 v) {
+  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(ushort16 v) {
+  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(ushort16 v) {
+  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(ushort16 v) {
+  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(ushort16 v) {
+  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(ushort16 v) {
+  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(ushort16 v) {
+  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(ushort16 v) {
+  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(ushort16 v) {
+  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(ushort16 v) {
+  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(ushort16 v) {
+  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(ushort16 v) {
+  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(ushort16 v) {
+  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(ushort16 v) {
+  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(ushort16 v) {
+  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(ushort16 v) {
+  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(ushort16 v) {
+  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(ushort16 v) {
+  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(ushort16 v) {
+  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(ushort16 v) {
+  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(ushort16 v) {
+  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(ushort16 v) {
+  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(ushort16 v) {
+  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(ushort16 v) {
+  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(char16 v) {
+  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(char16 v) {
+  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(char16 v) {
+  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(char16 v) {
+  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(char16 v) {
+  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(char16 v) {
+  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(char16 v) {
+  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(char16 v) {
+  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(char16 v) {
+  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(char16 v) {
+  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(char16 v) {
+  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(char16 v) {
+  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(char16 v) {
+  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(char16 v) {
+  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(char16 v) {
+  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(char16 v) {
+  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+/* char16 -> short16: runs convert_short_sat_rte on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(char16 v) {
+  const short16 lanes = (short16)(
+    convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3),
+    convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7),
+    convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB),
+    convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+  return lanes;
+}
+
+/* char16 -> short16: runs convert_short_sat_rtz on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(char16 v) {
+  const short16 lanes = (short16)(
+    convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3),
+    convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7),
+    convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB),
+    convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* char16 -> short16: runs convert_short_sat_rtp on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(char16 v) {
+  const short16 lanes = (short16)(
+    convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3),
+    convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7),
+    convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB),
+    convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* char16 -> short16: runs convert_short_sat_rtn on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(char16 v) {
+  const short16 lanes = (short16)(
+    convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3),
+    convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7),
+    convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB),
+    convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* char16 -> ushort16: runs convert_ushort_sat_rte on the lanes of v, in lane order.
+ * NOTE(review): the return line is cut off ("[...]") in this archived copy after lane sE;
+ * the final argument is presumably v.sF -- confirm against the upstream file. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(char16 v) {
+  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
+}
+
+/* char16 -> ushort16: runs convert_ushort_sat_rtz on the lanes of v, in lane order.
+ * NOTE(review): the return line is cut off ("[...]") in this archived copy after lane sE;
+ * the final argument is presumably v.sF -- confirm against the upstream file. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(char16 v) {
+  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
+}
+
+/* char16 -> ushort16: runs convert_ushort_sat_rtp on the lanes of v, in lane order.
+ * NOTE(review): the return line is cut off ("[...]") in this archived copy after lane sE;
+ * the final argument is presumably v.sF -- confirm against the upstream file. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(char16 v) {
+  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
+}
+
+/* char16 -> ushort16: runs convert_ushort_sat_rtn on the lanes of v, in lane order.
+ * NOTE(review): the return line is cut off ("[...]") in this archived copy after lane sE;
+ * the final argument is presumably v.sF -- confirm against the upstream file. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(char16 v) {
+  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
+}
+
+/* char16 -> char16: runs convert_char_sat_rte on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(char16 v) {
+  const char16 lanes = (char16)(
+    convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3),
+    convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7),
+    convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB),
+    convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+  return lanes;
+}
+
+/* char16 -> char16: runs convert_char_sat_rtz on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(char16 v) {
+  const char16 lanes = (char16)(
+    convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3),
+    convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7),
+    convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB),
+    convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* char16 -> char16: runs convert_char_sat_rtp on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(char16 v) {
+  const char16 lanes = (char16)(
+    convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3),
+    convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7),
+    convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB),
+    convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* char16 -> char16: runs convert_char_sat_rtn on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(char16 v) {
+  const char16 lanes = (char16)(
+    convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3),
+    convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7),
+    convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB),
+    convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* char16 -> uchar16: runs convert_uchar_sat_rte on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(char16 v) {
+  const uchar16 lanes = (uchar16)(
+    convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3),
+    convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7),
+    convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB),
+    convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+  return lanes;
+}
+
+/* char16 -> uchar16: runs convert_uchar_sat_rtz on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(char16 v) {
+  const uchar16 lanes = (uchar16)(
+    convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3),
+    convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7),
+    convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB),
+    convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* char16 -> uchar16: runs convert_uchar_sat_rtp on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(char16 v) {
+  const uchar16 lanes = (uchar16)(
+    convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3),
+    convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7),
+    convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB),
+    convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* char16 -> uchar16: runs convert_uchar_sat_rtn on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(char16 v) {
+  const uchar16 lanes = (uchar16)(
+    convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3),
+    convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7),
+    convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB),
+    convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> long16: runs convert_long_sat_rte on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(uchar16 v) {
+  const long16 lanes = (long16)(
+    convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3),
+    convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7),
+    convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB),
+    convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> long16: runs convert_long_sat_rtz on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(uchar16 v) {
+  const long16 lanes = (long16)(
+    convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3),
+    convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7),
+    convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB),
+    convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> long16: runs convert_long_sat_rtp on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(uchar16 v) {
+  const long16 lanes = (long16)(
+    convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3),
+    convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7),
+    convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB),
+    convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> long16: runs convert_long_sat_rtn on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(uchar16 v) {
+  const long16 lanes = (long16)(
+    convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3),
+    convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7),
+    convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB),
+    convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> ulong16: runs convert_ulong_sat_rte on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(uchar16 v) {
+  const ulong16 lanes = (ulong16)(
+    convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3),
+    convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7),
+    convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB),
+    convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> ulong16: runs convert_ulong_sat_rtz on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(uchar16 v) {
+  const ulong16 lanes = (ulong16)(
+    convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3),
+    convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7),
+    convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB),
+    convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> ulong16: runs convert_ulong_sat_rtp on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(uchar16 v) {
+  const ulong16 lanes = (ulong16)(
+    convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3),
+    convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7),
+    convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB),
+    convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> ulong16: runs convert_ulong_sat_rtn on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(uchar16 v) {
+  const ulong16 lanes = (ulong16)(
+    convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3),
+    convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7),
+    convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB),
+    convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> int16: runs convert_int_sat_rte on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(uchar16 v) {
+  const int16 lanes = (int16)(
+    convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3),
+    convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7),
+    convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB),
+    convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> int16: runs convert_int_sat_rtz on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(uchar16 v) {
+  const int16 lanes = (int16)(
+    convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3),
+    convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7),
+    convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB),
+    convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> int16: runs convert_int_sat_rtp on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(uchar16 v) {
+  const int16 lanes = (int16)(
+    convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3),
+    convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7),
+    convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB),
+    convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> int16: runs convert_int_sat_rtn on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(uchar16 v) {
+  const int16 lanes = (int16)(
+    convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3),
+    convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7),
+    convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB),
+    convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> uint16: runs convert_uint_sat_rte on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(uchar16 v) {
+  const uint16 lanes = (uint16)(
+    convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3),
+    convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7),
+    convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB),
+    convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> uint16: runs convert_uint_sat_rtz on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(uchar16 v) {
+  const uint16 lanes = (uint16)(
+    convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3),
+    convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7),
+    convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB),
+    convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> uint16: runs convert_uint_sat_rtp on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(uchar16 v) {
+  const uint16 lanes = (uint16)(
+    convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3),
+    convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7),
+    convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB),
+    convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> uint16: runs convert_uint_sat_rtn on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(uchar16 v) {
+  const uint16 lanes = (uint16)(
+    convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3),
+    convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7),
+    convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB),
+    convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> short16: runs convert_short_sat_rte on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(uchar16 v) {
+  const short16 lanes = (short16)(
+    convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3),
+    convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7),
+    convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB),
+    convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> short16: runs convert_short_sat_rtz on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(uchar16 v) {
+  const short16 lanes = (short16)(
+    convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3),
+    convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7),
+    convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB),
+    convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> short16: runs convert_short_sat_rtp on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(uchar16 v) {
+  const short16 lanes = (short16)(
+    convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3),
+    convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7),
+    convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB),
+    convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> short16: runs convert_short_sat_rtn on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(uchar16 v) {
+  const short16 lanes = (short16)(
+    convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3),
+    convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7),
+    convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB),
+    convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> ushort16: runs convert_ushort_sat_rte on the lanes of v, in lane order.
+ * NOTE(review): the return line is cut off ("[...]") in this archived copy after lane sE;
+ * the final argument is presumably v.sF -- confirm against the upstream file. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(uchar16 v) {
+  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
+}
+
+/* uchar16 -> ushort16: runs convert_ushort_sat_rtz on the lanes of v, in lane order.
+ * NOTE(review): the return line is cut off ("[...]") in this archived copy after lane sE;
+ * the final argument is presumably v.sF -- confirm against the upstream file. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(uchar16 v) {
+  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
+}
+
+/* uchar16 -> ushort16: runs convert_ushort_sat_rtp on the lanes of v, in lane order.
+ * NOTE(review): the return line is cut off ("[...]") in this archived copy after lane sE;
+ * the final argument is presumably v.sF -- confirm against the upstream file. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(uchar16 v) {
+  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
+}
+
+/* uchar16 -> ushort16: runs convert_ushort_sat_rtn on the lanes of v, in lane order.
+ * NOTE(review): the return line is cut off ("[...]") in this archived copy after lane sE;
+ * the final argument is presumably v.sF -- confirm against the upstream file. */
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(uchar16 v) {
+  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
+}
+
+/* uchar16 -> char16: runs convert_char_sat_rte on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(uchar16 v) {
+  const char16 lanes = (char16)(
+    convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3),
+    convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7),
+    convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB),
+    convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> char16: runs convert_char_sat_rtz on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(uchar16 v) {
+  const char16 lanes = (char16)(
+    convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3),
+    convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7),
+    convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB),
+    convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> char16: runs convert_char_sat_rtp on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(uchar16 v) {
+  const char16 lanes = (char16)(
+    convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3),
+    convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7),
+    convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB),
+    convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> char16: runs convert_char_sat_rtn on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(uchar16 v) {
+  const char16 lanes = (char16)(
+    convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3),
+    convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7),
+    convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB),
+    convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> uchar16: runs convert_uchar_sat_rte on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(uchar16 v) {
+  const uchar16 lanes = (uchar16)(
+    convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3),
+    convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7),
+    convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB),
+    convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> uchar16: runs convert_uchar_sat_rtz on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(uchar16 v) {
+  const uchar16 lanes = (uchar16)(
+    convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3),
+    convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7),
+    convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB),
+    convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> uchar16: runs convert_uchar_sat_rtp on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(uchar16 v) {
+  const uchar16 lanes = (uchar16)(
+    convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3),
+    convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7),
+    convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB),
+    convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* uchar16 -> uchar16: runs convert_uchar_sat_rtn on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(uchar16 v) {
+  const uchar16 lanes = (uchar16)(
+    convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3),
+    convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7),
+    convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB),
+    convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* float16 -> long16: runs convert_long_sat_rte on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(float16 v) {
+  const long16 lanes = (long16)(
+    convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3),
+    convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7),
+    convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB),
+    convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+  return lanes;
+}
+
+/* float16 -> long16: runs convert_long_sat_rtz on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(float16 v) {
+  const long16 lanes = (long16)(
+    convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3),
+    convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7),
+    convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB),
+    convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* float16 -> long16: runs convert_long_sat_rtp on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(float16 v) {
+  const long16 lanes = (long16)(
+    convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3),
+    convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7),
+    convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB),
+    convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* float16 -> long16: runs convert_long_sat_rtn on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(float16 v) {
+  const long16 lanes = (long16)(
+    convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3),
+    convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7),
+    convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB),
+    convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* float16 -> ulong16: runs convert_ulong_sat_rte on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(float16 v) {
+  const ulong16 lanes = (ulong16)(
+    convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3),
+    convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7),
+    convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB),
+    convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+  return lanes;
+}
+
+/* float16 -> ulong16: runs convert_ulong_sat_rtz on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(float16 v) {
+  const ulong16 lanes = (ulong16)(
+    convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3),
+    convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7),
+    convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB),
+    convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* float16 -> ulong16: runs convert_ulong_sat_rtp on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(float16 v) {
+  const ulong16 lanes = (ulong16)(
+    convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3),
+    convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7),
+    convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB),
+    convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* float16 -> ulong16: runs convert_ulong_sat_rtn on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(float16 v) {
+  const ulong16 lanes = (ulong16)(
+    convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3),
+    convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7),
+    convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB),
+    convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+  return lanes;
+}
+
+/* float16 -> int16: runs convert_int_sat_rte on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(float16 v) {
+  const int16 lanes = (int16)(
+    convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3),
+    convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7),
+    convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB),
+    convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+  return lanes;
+}
+
+/* float16 -> int16: runs convert_int_sat_rtz on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(float16 v) {
+  const int16 lanes = (int16)(
+    convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3),
+    convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7),
+    convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB),
+    convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+  return lanes;
+}
+
+/* float16 -> int16: runs convert_int_sat_rtp on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(float16 v) {
+  const int16 lanes = (int16)(
+    convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3),
+    convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7),
+    convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB),
+    convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+  return lanes;
+}
+
+/* float16 -> int16: runs convert_int_sat_rtn on every lane of v, keeping lane order. */
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(float16 v) {
+  const int16 lanes = (int16)(
+    convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3),
+    convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7),
+    convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB),
+    convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+  return lanes;
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(float16 v) { // component-wise: convert_uint_sat_rte on v.s0..v.sF, results packed into a uint16
+  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(float16 v) { // component-wise: convert_uint_sat_rtz on v.s0..v.sF, results packed into a uint16
+  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(float16 v) { // component-wise: convert_uint_sat_rtp on v.s0..v.sF, results packed into a uint16
+  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(float16 v) { // component-wise: convert_uint_sat_rtn on v.s0..v.sF, results packed into a uint16
+  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(float16 v) { // component-wise: convert_short_sat_rte on v.s0..v.sF, results packed into a short16
+  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(float16 v) { // component-wise: convert_short_sat_rtz on v.s0..v.sF, results packed into a short16
+  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(float16 v) { // component-wise: convert_short_sat_rtp on v.s0..v.sF, results packed into a short16
+  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(float16 v) { // component-wise: convert_short_sat_rtn on v.s0..v.sF, results packed into a short16
+  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(float16 v) {
+  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(float16 v) {
+  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(float16 v) {
+  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(float16 v) {
+  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(float16 v) { // component-wise: convert_char_sat_rte on v.s0..v.sF, results packed into a char16
+  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(float16 v) { // component-wise: convert_char_sat_rtz on v.s0..v.sF, results packed into a char16
+  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(float16 v) { // component-wise: convert_char_sat_rtp on v.s0..v.sF, results packed into a char16
+  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(float16 v) { // component-wise: convert_char_sat_rtn on v.s0..v.sF, results packed into a char16
+  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(float16 v) { // component-wise: convert_uchar_sat_rte on v.s0..v.sF, results packed into a uchar16
+  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(float16 v) { // component-wise: convert_uchar_sat_rtz on v.s0..v.sF, results packed into a uchar16
+  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(float16 v) { // component-wise: convert_uchar_sat_rtp on v.s0..v.sF, results packed into a uchar16
+  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(float16 v) { // component-wise: convert_uchar_sat_rtn on v.s0..v.sF, results packed into a uchar16
+  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
diff --git a/backend/src/ocl_memcpy.ll b/backend/src/ocl_memcpy.ll
new file mode 100644
index 0000000..476033e
--- /dev/null
+++ b/backend/src/ocl_memcpy.ll
@@ -0,0 +1,336 @@
+;The memcpy's source code.
+; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, size_t size) {
+;   size_t index = 0;
+;   while((index + 4) <= size) {
+;     *((uint *)(dst + index)) = *((uint *)(src + index));
+;     index += 4;
+;   }
+;   while(index < size) {
+;     dst[index] = src[index];
+;     index++;
+;   }
+; }
+
+define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+entry:                                            ; copies %size bytes from %src to %dst: 32-bit words while a whole word fits, then single bytes
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ugt i32 %add, %size                 ; true once index + 4 > size, i.e. no whole word is left
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+  %1 = load i32 addrspace(1)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+  store i32 %1, i32 addrspace(1)* %2, align 4     ; one 32-bit word copied at byte offset %index.0
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size            ; byte loop finishes the remaining tail
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
+  %3 = load i8 addrspace(1)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+entry:                                            ; copies %size bytes from %src to %dst: 32-bit words while a whole word fits, then single bytes
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ugt i32 %add, %size                 ; true once index + 4 > size, i.e. no whole word is left
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
+  %1 = load i32 addrspace(0)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+  store i32 %1, i32 addrspace(1)* %2, align 4     ; one 32-bit word copied at byte offset %index.0
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size            ; byte loop finishes the remaining tail
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
+  %3 = load i8 addrspace(0)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+entry:                                            ; copies %size bytes from %src to %dst: 32-bit words while a whole word fits, then single bytes
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ugt i32 %add, %size                 ; true once index + 4 > size, i.e. no whole word is left
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+  %1 = load i32 addrspace(3)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+  store i32 %1, i32 addrspace(1)* %2, align 4     ; one 32-bit word copied at byte offset %index.0
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size            ; byte loop finishes the remaining tail
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
+  %3 = load i8 addrspace(3)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+entry:                                            ; copies %size bytes from %src to %dst: 32-bit words while a whole word fits, then single bytes
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ugt i32 %add, %size                 ; true once index + 4 > size, i.e. no whole word is left
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+  %1 = load i32 addrspace(1)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
+  store i32 %1, i32 addrspace(0)* %2, align 4     ; one 32-bit word copied at byte offset %index.0
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size            ; byte loop finishes the remaining tail
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
+  %3 = load i8 addrspace(1)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+entry:                                            ; copies %size bytes from %src to %dst: 32-bit words while a whole word fits, then single bytes
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ugt i32 %add, %size                 ; true once index + 4 > size, i.e. no whole word is left
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
+  %1 = load i32 addrspace(0)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
+  store i32 %1, i32 addrspace(0)* %2, align 4     ; one 32-bit word copied at byte offset %index.0
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size            ; byte loop finishes the remaining tail
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
+  %3 = load i8 addrspace(0)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+entry:                                            ; copies %size bytes from %src to %dst: 32-bit words while a whole word fits, then single bytes
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ugt i32 %add, %size                 ; true once index + 4 > size, i.e. no whole word is left
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+  %1 = load i32 addrspace(3)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
+  store i32 %1, i32 addrspace(0)* %2, align 4     ; one 32-bit word copied at byte offset %index.0
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size            ; byte loop finishes the remaining tail
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
+  %3 = load i8 addrspace(3)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+entry:                                            ; copies %size bytes from %src to %dst: 32-bit words while a whole word fits, then single bytes
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ugt i32 %add, %size                 ; true once index + 4 > size, i.e. no whole word is left
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+  %1 = load i32 addrspace(1)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
+  store i32 %1, i32 addrspace(3)* %2, align 4     ; one 32-bit word copied at byte offset %index.0
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size            ; byte loop finishes the remaining tail
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
+  %3 = load i8 addrspace(1)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+entry:                                            ; copies %size bytes from %src to %dst: 32-bit words while a whole word fits, then single bytes
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ugt i32 %add, %size                 ; true once index + 4 > size, i.e. no whole word is left
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
+  %1 = load i32 addrspace(0)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
+  store i32 %1, i32 addrspace(3)* %2, align 4     ; one 32-bit word copied at byte offset %index.0
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size            ; byte loop finishes the remaining tail
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
+  %3 = load i8 addrspace(0)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+entry:                                            ; copies %size bytes from %src to %dst: 32-bit words while a whole word fits, then single bytes
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ugt i32 %add, %size                 ; true once index + 4 > size, i.e. no whole word is left
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+  %1 = load i32 addrspace(3)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
+  store i32 %1, i32 addrspace(3)* %2, align 4     ; one 32-bit word copied at byte offset %index.0
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size            ; byte loop finishes the remaining tail
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
+  %3 = load i8 addrspace(3)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
diff --git a/backend/src/ocl_memset.ll b/backend/src/ocl_memset.ll
new file mode 100644
index 0000000..addf9f5
--- /dev/null
+++ b/backend/src/ocl_memset.ll
@@ -0,0 +1,127 @@
+;The memset's source code.
+; INLINE_OVERLOADABLE void __gen_memset(uchar* dst, uchar val, size_t size) {
+;   size_t index = 0;
+;   uint v = (val << 24) | (val << 16) | (val << 8) | val;
+;   while((index + 4) <= size) {
+;     *((uint *)(dst + index)) = v;
+;     index += 4;
+;   }
+;   while(index < size) {
+;     dst[index] = val;
+;     index++;
+;  }
+; }
+
+define void @__gen_memset_p(i8* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+entry:                                            ; fills %size bytes at %dst with %val: 32-bit stores while a whole word fits, then single bytes
+  %conv = zext i8 %val to i32
+  %shl = shl nuw i32 %conv, 24
+  %shl2 = shl nuw nsw i32 %conv, 16
+  %or = or i32 %shl, %shl2
+  %shl4 = shl nuw nsw i32 %conv, 8
+  %or5 = or i32 %or, %shl4
+  %or7 = or i32 %or5, %conv                       ; %val replicated into all four bytes of an i32
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ugt i32 %add, %size                 ; true once index + 4 > size, i.e. no whole word is left
+  br i1 %cmp, label %while.cond10, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8* %dst, i32 %index.0
+  %0 = bitcast i8* %add.ptr to i32*
+  store i32 %or7, i32* %0, align 4                ; one 32-bit word written at byte offset %index.0
+  br label %while.cond
+
+while.cond10:                                     ; preds = %while.cond, %while.body13
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
+  %cmp11 = icmp ult i32 %index.1, %size           ; byte loop finishes the remaining tail
+  br i1 %cmp11, label %while.body13, label %while.end14
+
+while.body13:                                     ; preds = %while.cond10
+  %arrayidx = getelementptr inbounds i8* %dst, i32 %index.1
+  store i8 %val, i8* %arrayidx, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond10
+
+while.end14:                                      ; preds = %while.cond10
+  ret void
+}
+
+define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+entry:                                            ; fills %size bytes at %dst with %val: 32-bit stores while a whole word fits, then single bytes
+  %conv = zext i8 %val to i32
+  %shl = shl nuw i32 %conv, 24
+  %shl2 = shl nuw nsw i32 %conv, 16
+  %or = or i32 %shl, %shl2
+  %shl4 = shl nuw nsw i32 %conv, 8
+  %or5 = or i32 %or, %shl4
+  %or7 = or i32 %or5, %conv                       ; %val replicated into all four bytes of an i32
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ugt i32 %add, %size                 ; true once index + 4 > size, i.e. no whole word is left
+  br i1 %cmp, label %while.cond10, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+  store i32 %or7, i32 addrspace(1)* %0, align 4   ; one 32-bit word written at byte offset %index.0
+  br label %while.cond
+
+while.cond10:                                     ; preds = %while.cond, %while.body13
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
+  %cmp11 = icmp ult i32 %index.1, %size           ; byte loop finishes the remaining tail
+  br i1 %cmp11, label %while.body13, label %while.end14
+
+while.body13:                                     ; preds = %while.cond10
+  %arrayidx = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+  store i8 %val, i8 addrspace(1)* %arrayidx, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond10
+
+while.end14:                                      ; preds = %while.cond10
+  ret void
+}
+
+define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+entry:                                            ; fills %size bytes at %dst with %val: 32-bit stores while a whole word fits, then single bytes
+  %conv = zext i8 %val to i32
+  %shl = shl nuw i32 %conv, 24
+  %shl2 = shl nuw nsw i32 %conv, 16
+  %or = or i32 %shl, %shl2
+  %shl4 = shl nuw nsw i32 %conv, 8
+  %or5 = or i32 %or, %shl4
+  %or7 = or i32 %or5, %conv                       ; %val replicated into all four bytes of an i32
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ugt i32 %add, %size                 ; true once index + 4 > size, i.e. no whole word is left
+  br i1 %cmp, label %while.cond10, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+  store i32 %or7, i32 addrspace(3)* %0, align 4   ; one 32-bit word written at byte offset %index.0
+  br label %while.cond
+
+while.cond10:                                     ; preds = %while.cond, %while.body13
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
+  %cmp11 = icmp ult i32 %index.1, %size           ; byte loop finishes the remaining tail
+  br i1 %cmp11, label %while.body13, label %while.end14
+
+while.body13:                                     ; preds = %while.cond10
+  %arrayidx = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+  store i8 %val, i8 addrspace(3)* %arrayidx, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond10
+
+while.end14:                                      ; preds = %while.cond10
+  ret void
+}
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
old mode 100644
new mode 100755
index 7948b7c..d191b8e
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -25,6 +25,8 @@
 #define PURE __attribute__((pure))
 #define CONST __attribute__((const))
 #define INLINE_OVERLOADABLE inline __attribute__((overloadable,always_inline))
+// FIXME: clang's OpenCL front end doesn't support static, so the keyword is defined away (it expands to nothing from here on).
+#define static
 
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL built-in scalar data types
@@ -85,7 +87,7 @@ struct _image2d_t;
 typedef __texture struct _image2d_t* __image2d_t;
 struct _image3d_t;
 typedef __texture struct _image3d_t* __image3d_t;
-typedef const uint __sampler_t;
+typedef const ushort __sampler_t;
 typedef size_t __event_t;
 #define image2d_t __image2d_t
 #define image3d_t __image3d_t
@@ -114,6 +116,14 @@ typedef size_t __event_t;
 #define __kernel_exec(X, TYPE) __kernel __attribute__((work_group_size_hint(X,1,1))) \
                                         __attribute__((vec_type_hint(TYPE)))
 #define kernel_exec(X, TYPE) __kernel_exec(X, TYPE)
+#define cl_khr_global_int32_base_atomics
+#define cl_khr_global_int32_extended_atomics
+#define cl_khr_local_int32_base_atomics
+#define cl_khr_local_int32_extended_atomics
+#define cl_khr_byte_addressable_store
+#define cl_khr_icd
+#define cl_khr_gl_sharing
+
 /////////////////////////////////////////////////////////////////////////////
 // OpenCL floating-point macros and pragmas
 /////////////////////////////////////////////////////////////////////////////
@@ -130,9 +140,50 @@ typedef size_t __event_t;
 #define FLT_EPSILON 0x1.0p-23f
 
 #define MAXFLOAT     3.40282347e38F
-#define HUGE_VALF    (__builtin_huge_valf())
-#define INFINITY     (__builtin_inff())
-#define NAN          (__builtin_nanf(""))
+INLINE_OVERLOADABLE float __ocl_inff(void) { /* builds +infinity from its bit pattern */
+  union { uint raw; float val; } pun;
+  pun.raw = 0x7F800000; /* exponent field all ones, mantissa zero, sign clear */
+  return pun.val;
+}
+INLINE_OVERLOADABLE float __ocl_nanf(void) { /* builds a quiet NaN from its bit pattern */
+  union { uint u; float f; } u;
+  u.u = 0x7FC00000; /* mantissa MSB (quiet bit) set; 0x7F800001 encodes a signaling NaN */
+  return u.f;
+}
+typedef union
+{
+  float value;
+  uint  word;
+} float_shape_type;
+
+/* Get a 32 bit int from a float.  */
+#ifndef GEN_OCL_GET_FLOAT_WORD
+# define GEN_OCL_GET_FLOAT_WORD(i,d)  \
+do {                                  \
+  float_shape_type gf_u;              \
+  gf_u.value = (d);                   \
+  (i) = gf_u.word;                    \
+} while (0)
+#endif
+/* Set a float from a 32 bit int.  */
+#ifndef GEN_OCL_SET_FLOAT_WORD
+# define GEN_OCL_SET_FLOAT_WORD(d,i)  \
+do {                                  \
+  float_shape_type sf_u;              \
+  sf_u.word = (i);                    \
+  (d) = sf_u.value;                   \
+} while (0)
+#endif
+
+INLINE_OVERLOADABLE int __ocl_finitef (float x){ /* nonzero when x is neither infinite nor NaN */
+  unsigned bits;
+  GEN_OCL_GET_FLOAT_WORD (bits, x);
+  return 0x7f800000 > (bits & 0x7fffffff); /* magnitude bits below the all-ones exponent */
+}
+
+#define HUGE_VALF    (__ocl_inff())
+#define INFINITY     (__ocl_inff())
+#define NAN          (__ocl_nanf())
 #define M_E_F        2.718281828459045F
 #define M_LOG2E_F    1.4426950408889634F
 #define M_LOG10E_F   0.43429448190325176F
@@ -219,17 +270,32 @@ UDEF(uint);
 UDEF(ulong);
 #undef UDEF
 
-INLINE_OVERLOADABLE int isfinite(float x) { return __builtin_isfinite(x); }
-INLINE_OVERLOADABLE int isinf(float x) { return __builtin_isinf(x); }
+INLINE_OVERLOADABLE int isfinite(float x) {
+  union { uint u; float f; } u;
+  u.f = x;
+  return (u.u & 0x7FFFFFFF) < 0x7F800000;
+}
+INLINE_OVERLOADABLE int isinf(float x) {
+  union { uint u; float f; } u;
+  u.f = x;
+  return (u.u & 0x7FFFFFFF) == 0x7F800000;
+}
 INLINE_OVERLOADABLE int isnan(float x) {
+  return x != x;
+}
+INLINE_OVERLOADABLE int isnormal(float x) {
   union { uint u; float f; } u;
   u.f = x;
-  return (u.u & 0x7FFFFFFF) > 0x7F800000;
+  u.u &= 0x7FFFFFFF;
+  return (u.u < 0x7F800000) && (u.u >= 0x800000);
 }
-INLINE_OVERLOADABLE int isnormal(float x) { return __builtin_isnormal(x); }
 INLINE_OVERLOADABLE int isordered(float x, float y) { return isequal(x, x) && isequal(y, y); }
 INLINE_OVERLOADABLE int isunordered(float x, float y) { return isnan(x) || isnan(y); }
-INLINE_OVERLOADABLE int signbit(float x) { return __builtin_signbit(x); }
+INLINE_OVERLOADABLE int signbit(float x) {
+  union { uint u; float f; } u;
+  u.f = x;
+  return u.u >> 31;
+}
 
 #define DEC1(type) INLINE_OVERLOADABLE int any(type a) { return a<0; }
 #define DEC2(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0; }
@@ -622,13 +688,795 @@ PURE CONST float __gen_ocl_cos(float x);
 PURE CONST float __gen_ocl_sqrt(float x);
 PURE CONST float __gen_ocl_rsqrt(float x);
 PURE CONST float __gen_ocl_log(float x);
+PURE CONST float __gen_ocl_exp(float x);
 PURE CONST float __gen_ocl_pow(float x, float y);
 PURE CONST float __gen_ocl_rcp(float x);
 PURE CONST float __gen_ocl_rndz(float x);
 PURE CONST float __gen_ocl_rnde(float x);
 PURE CONST float __gen_ocl_rndu(float x);
 PURE CONST float __gen_ocl_rndd(float x);
-INLINE_OVERLOADABLE float hypot(float x, float y) { return __gen_ocl_sqrt(x*x + y*y); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
+  /* Result carries the magnitude bits of x and the sign bit of y. */
+  union { unsigned u; float f; } mag, sgn, out;
+  mag.f = x; sgn.f = y;
+  out.u = (sgn.u & 0x80000000u) | (mag.u & 0x7fffffff);
+  return out.f;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_log(float x) {
+/*
+ *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  union { unsigned int i; float f; } u;
+  const float
+  ln2_hi =   6.9313812256e-01,  /* 0x3f317180 */
+  ln2_lo =   9.0580006145e-06,  /* 0x3717f7d1 */
+  two25 =    3.355443200e+07, /* 0x4c000000 */
+  Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
+  Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
+  Lg3 = 2.8571429849e-01, /* 3E924925 */
+  Lg4 = 2.2222198546e-01, /* 3E638E29 */
+  Lg5 = 1.8183572590e-01, /* 3E3A3325 */
+  Lg6 = 1.5313838422e-01, /* 3E1CD04F */
+  Lg7 = 1.4798198640e-01; /* 3E178897 */
+
+  const float zero   =  0.0;
+  float hfsq,f,s,z,R,w,t1,t2,dk;
+  int k,ix,i,j;
+
+  u.f = x;  ix = u.i;
+  k=0;
+  if (ix < 0x00800000) {      /* x < 2**-126  */
+      if ((ix&0x7fffffff)==0)
+    return -two25/zero;   /* log(+-0)=-inf */
+      if (ix<0) return (x-x)/zero;  /* log(-#) = NaN */
+      return -INFINITY;  /* Gen does not support subnormal number now */
+      //k -= 25; x *= two25; /* subnormal number, scale up x */
+      //u.f = x;  ix = u.i;
+  }
+  if (ix >= 0x7f800000) return x+x;
+  k += (ix>>23)-127;
+  ix &= 0x007fffff;
+  i = (ix+(0x95f64<<3))&0x800000;
+  u.i = ix|(i^0x3f800000); x = u.f;
+  k += (i>>23);
+  f = x-(float)1.0;
+  if((0x007fffff&(15+ix))<16) { /* |f| < 2**-20 */
+      if(f==zero) {
+        if(k==0) return zero;
+        else {
+          dk=(float)k; return dk*ln2_hi+dk*ln2_lo;
+        }
+      }
+      R = f*f*((float)0.5-(float)0.33333333333333333*f);
+      if(k==0)
+        return f-R;
+      else {
+        dk=(float)k;  return dk*ln2_hi-((R-dk*ln2_lo)-f);
+      }
+  }
+  s = f/((float)2.0+f);
+  dk = (float)k;
+  z = s*s;
+  i = ix-(0x6147a<<3);
+  w = z*z;
+  j = (0x6b851<<3)-ix;
+  t1= w*(Lg2+w*(Lg4+w*Lg6));
+  t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
+  i |= j;
+  R = t2+t1;
+  if(i>0) {
+      hfsq=(float)0.5*f*f;
+      if(k==0) return f-(hfsq-s*(hfsq+R)); else
+         return dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f);
+  } else {
+      if(k==0) return f-s*(f-R); else
+         return dk*ln2_hi-((s*(f-R)-dk*ln2_lo)-f);
+  }
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_log10(float x) { /* base-10 log of x, built on __gen_ocl_internal_log */
+/*
+ *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  union {float f; unsigned i; }u;
+  const float
+  zero       = 0.0,
+  two25      =  3.3554432000e+07, /* 0x4c000000 */
+  ivln10     =  4.3429449201e-01, /* 0x3ede5bd9 */
+  log10_2hi  =  3.0102920532e-01, /* 0x3e9a2080 */
+  log10_2lo  =  7.9034151668e-07; /* 0x355427db */
+
+  float y,z;
+  int i,k,hx;
+
+  u.f = x; hx = u.i;
+  k=0;
+  if (hx < 0x00800000) {                  /* x < 2**-126  */
+    if ((hx&0x7fffffff)==0)
+      return -two25/zero;             /* log(+-0)=-inf */
+    if (hx<0) return NAN;        /* log(-#) = NaN */
+    return -INFINITY;      /* Gen does not support subnormal now */
+    //k -= 25; x *= two25; /* subnormal number, scale up x */
+    //u.f = x; hx = u.i;
+  }
+  if (hx >= 0x7f800000) return x+x; /* +inf or NaN: x+x propagates it */
+  k += (hx>>23)-127; /* unbiased exponent of x */
+  i  = ((unsigned)k&0x80000000)>>31; /* i = 1 when k is negative, else 0 */
+  hx = (hx&0x007fffff)|((0x7f-i)<<23); /* keep the mantissa, force the biased exponent to 0x7f-i */
+  y  = (float)(k+i);
+  u.i = hx; x = u.f; /* x is now the rescaled argument */
+  z  = y*log10_2lo + ivln10*__gen_ocl_internal_log(x);
+  return  z+y*log10_2hi; /* add the high part of y*log10(2) last */
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_log2(float x) { /* base-2 log of x in single precision */
+/*
+ *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ *  adapted for log2 by Ulrich Drepper <drepper at cygnus.com>
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  const float zero   =  0.0,
+  ln2 = 0.69314718055994530942,
+  two25 =    3.355443200e+07, /** 0x4c000000 */
+  Lg1 = 6.6666668653e-01, /** 3F2AAAAB */
+  Lg2 = 4.0000000596e-01, /** 3ECCCCCD */
+  Lg3 = 2.8571429849e-01, /** 3E924925 */
+  Lg4 = 2.2222198546e-01, /** 3E638E29 */
+  Lg5 = 1.8183572590e-01, /** 3E3A3325 */
+  Lg6 = 1.5313838422e-01, /** 3E1CD04F */
+  Lg7 = 1.4798198640e-01; /** 3E178897 */
+
+  float hfsq,f,s,z,R,w,t1,t2,dk;
+  int k,ix,i,j;
+
+  union {float f; int i; }u;//GET_FLOAT_WORD(ix,x);
+  u.f = x; ix = u.i;
+
+  k=0;
+  if (ix < 0x00800000) {           /** x < 2**-126  */
+      if ((ix&0x7fffffff)==0)
+      return -two25/(x-x);        /** log(+-0)=-inf */
+
+      if (ix<0) return (x-x)/(x-x);    /** log(-#) = NaN */
+      return -INFINITY;  /** Gen does not support subnormal numbers now */
+      //k -= 25; x *= two25; /** subnormal number, scale up x (unreachable after the return above) */
+      //u.f = x; ix = u.i; //GET_FLOAT_WORD(ix,x);
+  }
+
+  if (ix >= 0x7f800000) return x+x; /** +inf or NaN: x+x propagates it */
+
+  k += (ix>>23)-127; /** unbiased exponent of x */
+  ix &= 0x007fffff; /** mantissa bits only */
+  i = (ix+(0x95f64<<3))&0x800000;
+
+  u.i = ix|(i^0x3f800000); x = u.f;//SET_FLOAT_WORD(x,ix|(i^0x3f800000));    /** normalize x or x/2 */
+  k += (i>>23);
+  dk = (float)k;
+  f = x-(float)1.0;
+
+  if((0x007fffff&(15+ix))<16) {    /** |f| < 2**-20 */
+      if(f==zero) return dk; /** x is an exact power of two */
+
+      R = f*f*((float)0.5-(float)0.33333333333333333*f);
+      return dk-(R-f)/ln2;
+  }
+
+  s = f/((float)2.0+f);
+  z = s*s;
+  i = ix-(0x6147a<<3);
+  w = z*z;
+  j = (0x6b851<<3)-ix;
+  t1= w*(Lg2+w*(Lg4+w*Lg6)); /** even-index polynomial terms */
+  t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7))); /** odd-index polynomial terms */
+  i |= j;
+  R = t2+t1;
+
+  if(i>0) {
+      hfsq=(float)0.5*f*f;
+      return dk-((hfsq-(s*(hfsq+R)))-f)/ln2;
+  } else {
+      return dk-((s*(f-R))-f)/ln2;
+  }
+}
+
+INLINE float __gen_ocl_scalbnf (float x, int n){
+  /* copy from fdlibm */
+  float two25 = 3.355443200e+07,	/* 0x4c000000 */
+  twom25 = 2.9802322388e-08,	        /* 0x33000000 */
+  huge = 1.0e+30,
+  tiny = 1.0e-30;
+  int k,ix;
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  k = (ix&0x7f800000)>>23; /* extract exponent */
+  if (k==0) {	/* 0 or subnormal x */
+    if ((ix&0x7fffffff)==0) return x; /* +-0 */
+    x *= two25;
+    GEN_OCL_GET_FLOAT_WORD(ix,x);
+    k = ((ix&0x7f800000)>>23) - 25;
+  }
+  if (k==0xff) return x+x;	/* NaN or Inf */
+  if (n< -50000)
+    return tiny*__gen_ocl_internal_copysign(tiny,x);	/*underflow*/
+  if (n> 50000 || k+n > 0xfe)
+    return huge*__gen_ocl_internal_copysign(huge,x); /* overflow  */
+  /* Now k and n are bounded we know that k = k+n does not overflow. */
+  k = k+n;
+  if (k > 0) { /* normal result */
+    GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
+    return x;
+  }
+  if (k <= -25)
+    return tiny*__gen_ocl_internal_copysign(tiny,x);	/*underflow*/
+  k += 25;				/* subnormal result */
+  GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
+  return x*twom25;
+}
+
+
+
+__constant const float PIo2[] = {
+  1.5703125000e+00, /* 0x3fc90000 */
+  4.5776367188e-04, /* 0x39f00000 */
+  2.5987625122e-05, /* 0x37da0000 */
+  7.5437128544e-08, /* 0x33a20000 */
+  6.0026650317e-11, /* 0x2e840000 */
+  7.3896444519e-13, /* 0x2b500000 */
+  5.3845816694e-15, /* 0x27c20000 */
+  5.6378512969e-18, /* 0x22d00000 */
+  8.3009228831e-20, /* 0x1fc40000 */
+  3.2756352257e-22, /* 0x1bc60000 */
+  6.3331015649e-25, /* 0x17440000 */
+};
+
+INLINE int __kernel_rem_pio2f(float *x, float *y, int e0, int nx, int prec, const __constant int *ipio2)
+{
+  /* copied from fdlibm */
+const float
+zero   = 0.0,
+one    = 1.0,
+two8   =  2.5600000000e+02, /* 0x43800000 */
+twon8  =  3.9062500000e-03; /* 0x3b800000 */
+
+  int init_jk[3]; /* initial value for jk */
+  int jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
+  float z,fw,f[20],fq[20],q[20];
+  init_jk[0] = 4; init_jk[1] = 7; init_jk[2] = 9;
+    /* initialize jk*/
+  jk = init_jk[prec];
+  jp = jk;
+
+    /* determine jx,jv,q0, note that 3>q0 */
+  jx =  nx-1;
+  jv = (e0-3)/8; if(jv<0) jv=0;
+  q0 =  e0-8*(jv+1);
+
+    /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
+  j = jv-jx; m = jx+jk;
+  for(i=0;i<=m;i++,j++) f[i] = (j<0)? zero : (float) ipio2[j];
+
+    /* compute q[0],q[1],...q[jk] */
+  for (i=0;i<=jk;i++) {
+      for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
+  }
+
+  jz = jk;
+recompute:
+    /* distill q[] into iq[] reversingly */
+  for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
+      fw    =  (float)((int)(twon8* z));
+      iq[i] =  (int)(z-two8*fw);
+      z     =  q[j-1]+fw;
+  }
+
+    /* compute n */
+  z  = __gen_ocl_scalbnf(z,q0);   /* actual value of z */
+  z -= (float)8.0*__gen_ocl_internal_floor(z*(float)0.125); /* trim off integer >= 8 */
+  n  = (int) z;
+  z -= (float)n;
+  ih = 0;
+  if(q0>0) {  /* need iq[jz-1] to determine n */
+      i  = (iq[jz-1]>>(8-q0)); n += i;
+      iq[jz-1] -= i<<(8-q0);
+      ih = iq[jz-1]>>(7-q0);
+  }
+  else if(q0==0) ih = iq[jz-1]>>8;
+  else if(z>=(float)0.5) ih=2;
+
+  if(ih>0) {  /* q > 0.5 */
+      n += 1; carry = 0;
+      for(i=0;i<jz ;i++) {  /* compute 1-q */
+    j = iq[i];
+    if(carry==0) {
+        if(j!=0) {
+      carry = 1; iq[i] = 0x100- j;
+        }
+    } else  iq[i] = 0xff - j;
+      }
+      if(q0>0) {    /* rare case: chance is 1 in 12 */
+          switch(q0) {
+          case 1:
+           iq[jz-1] &= 0x7f; break;
+        case 2:
+           iq[jz-1] &= 0x3f; break;
+          }
+      }
+      if(ih==2) {
+    z = one - z;
+    if(carry!=0) z -= __gen_ocl_scalbnf(one,q0);
+      }
+  }
+
+    /* check if recomputation is needed */
+  if(z==zero) {
+      j = 0;
+      for (i=jz-1;i>=jk;i--) j |= iq[i];
+      if(j==0) { /* need recomputation */
+    for(k=1;iq[jk-k]==0;k++);   /* k = no. of terms needed */
+
+    for(i=jz+1;i<=jz+k;i++) {   /* add q[jz+1] to q[jz+k] */
+        f[jx+i] = (float) ipio2[jv+i];
+        for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
+        q[i] = fw;
+    }
+    jz += k;
+    goto recompute;
+      }
+  }
+
+    /* chop off zero terms */
+  if(z==(float)0.0) {
+      jz -= 1; q0 -= 8;
+      while(iq[jz]==0) { jz--; q0-=8;}
+  } else { /* break z into 8-bit if necessary */
+      z = __gen_ocl_scalbnf(z,-q0);
+      if(z>=two8) {
+    fw = (float)((int)(twon8*z));
+    iq[jz] = (int)(z-two8*fw);
+    jz += 1; q0 += 8;
+    iq[jz] = (int) fw;
+      } else iq[jz] = (int) z ;
+  }
+
+    /* convert integer "bit" chunk to floating-point value */
+  fw = __gen_ocl_scalbnf(one,q0);
+  for(i=jz;i>=0;i--) {
+      q[i] = fw*(float)iq[i]; fw*=twon8;
+  }
+
+    /* compute PIo2[0,...,jp]*q[jz,...,0] */
+  for(i=jz;i>=0;i--) {
+      for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
+      fq[jz-i] = fw;
+  }
+
+    /* compress fq[] into y[] */
+  switch(prec) {
+      case 0:
+    fw = 0.0;
+    for (i=jz;i>=0;i--) fw += fq[i];
+    y[0] = (ih==0)? fw: -fw;
+    break;
+      case 1:
+      case 2:
+    fw = 0.0;
+    for (i=jz;i>=0;i--) fw += fq[i];
+    y[0] = (ih==0)? fw: -fw;
+    fw = fq[0]-fw;
+    for (i=1;i<=jz;i++) fw += fq[i];
+    y[1] = (ih==0)? fw: -fw;
+    break;
+      case 3: /* painful */
+    for (i=jz;i>0;i--) {
+        fw      = fq[i-1]+fq[i];
+        fq[i]  += fq[i-1]-fw;
+        fq[i-1] = fw;
+    }
+    for (i=jz;i>1;i--) {
+        fw      = fq[i-1]+fq[i];
+        fq[i]  += fq[i-1]-fw;
+        fq[i-1] = fw;
+    }
+    for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
+    if(ih==0) {
+        y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
+    } else {
+        y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
+    }
+  }
+  return n&7;
+
+}
+__constant const int npio2_hw[32] = {
+0x3fc90f00, 0x40490f00, 0x4096cb00, 0x40c90f00, 0x40fb5300, 0x4116cb00,
+0x412fed00, 0x41490f00, 0x41623100, 0x417b5300, 0x418a3a00, 0x4196cb00,
+0x41a35c00, 0x41afed00, 0x41bc7e00, 0x41c90f00, 0x41d5a000, 0x41e23100,
+0x41eec200, 0x41fb5300, 0x4203f200, 0x420a3a00, 0x42108300, 0x4216cb00,
+0x421d1400, 0x42235c00, 0x4229a500, 0x422fed00, 0x42363600, 0x423c7e00,
+0x4242c700, 0x42490f00
+};
+
+__constant const int two_over_pi[22*9] = {
+0xA2, 0xF9, 0x83, 0x6E, 0x4E, 0x44, 0x15, 0x29, 0xFC,
+0x27, 0x57, 0xD1, 0xF5, 0x34, 0xDD, 0xC0, 0xDB, 0x62,
+0x95, 0x99, 0x3C, 0x43, 0x90, 0x41, 0xFE, 0x51, 0x63,
+0xAB, 0xDE, 0xBB, 0xC5, 0x61, 0xB7, 0x24, 0x6E, 0x3A,
+0x42, 0x4D, 0xD2, 0xE0, 0x06, 0x49, 0x2E, 0xEA, 0x09,
+0xD1, 0x92, 0x1C, 0xFE, 0x1D, 0xEB, 0x1C, 0xB1, 0x29,
+0xA7, 0x3E, 0xE8, 0x82, 0x35, 0xF5, 0x2E, 0xBB, 0x44,
+0x84, 0xE9, 0x9C, 0x70, 0x26, 0xB4, 0x5F, 0x7E, 0x41,
+0x39, 0x91, 0xD6, 0x39, 0x83, 0x53, 0x39, 0xF4, 0x9C,
+0x84, 0x5F, 0x8B, 0xBD, 0xF9, 0x28, 0x3B, 0x1F, 0xF8,
+0x97, 0xFF, 0xDE, 0x05, 0x98, 0x0F, 0xEF, 0x2F, 0x11,
+0x8B, 0x5A, 0x0A, 0x6D, 0x1F, 0x6D, 0x36, 0x7E, 0xCF,
+0x27, 0xCB, 0x09, 0xB7, 0x4F, 0x46, 0x3F, 0x66, 0x9E,
+0x5F, 0xEA, 0x2D, 0x75, 0x27, 0xBA, 0xC7, 0xEB, 0xE5,
+0xF1, 0x7B, 0x3D, 0x07, 0x39, 0xF7, 0x8A, 0x52, 0x92,
+0xEA, 0x6B, 0xFB, 0x5F, 0xB1, 0x1F, 0x8D, 0x5D, 0x08,
+0x56, 0x03, 0x30, 0x46, 0xFC, 0x7B, 0x6B, 0xAB, 0xF0,
+0xCF, 0xBC, 0x20, 0x9A, 0xF4, 0x36, 0x1D, 0xA9, 0xE3,
+0x91, 0x61, 0x5E, 0xE6, 0x1B, 0x08, 0x65, 0x99, 0x85,
+0x5F, 0x14, 0xA0, 0x68, 0x40, 0x8D, 0xFF, 0xD8, 0x80,
+0x4D, 0x73, 0x27, 0x31, 0x06, 0x06, 0x15, 0x56, 0xCA,
+0x73, 0xA8, 0xC9, 0x60, 0xE2, 0x7B, 0xC0, 0x8C, 0x6B,
+};
+
+
+
+INLINE int __ieee754_rem_pio2f(float x, float *y) {
+  /* copied from fdlibm */
+  float z,w,t,r,fn;
+  float tx[3];
+
+const float half_value = 5.0000000e-1;
+const float zero =  0.0000000000;
+const float two8 =  2.5600000000e+02;
+const float invpio2 =  6.3661980629e-01;
+const float pio2_1  =  1.5707855225e+00;
+const float pio2_1t =  1.0804334124e-05;
+const float pio2_2  =  1.0804273188e-05;
+const float pio2_2t =  6.0770999344e-11;
+const float pio2_3  =  6.0770943833e-11;
+const float pio2_3t =  6.1232342629e-17;
+  int e0,i,j,nx,n,ix,hx;
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff;
+  if(ix<=0x3f490fd8)   /* |x| ~<= pi/4 , no need for reduction */
+      {y[0] = x; y[1] = 0; return 0;}
+  if(ix<0x4016cbe4) {  /* |x| < 3pi/4, special case with n=+-1 */
+      if(hx>0) {
+    z = x - pio2_1;
+    if((ix&0xfffffff0)!=0x3fc90fd0) { /* 24+24 bit pi OK */
+        y[0] = z - pio2_1t;
+        y[1] = (z-y[0])-pio2_1t;
+    } else {    /* near pi/2, use 24+24+24 bit pi */
+        z -= pio2_2;
+        y[0] = z - pio2_2t;
+        y[1] = (z-y[0])-pio2_2t;
+    }
+    return 1;
+      } else {  /* negative x */
+    z = x + pio2_1;
+    if((ix&0xfffffff0)!=0x3fc90fd0) { /* 24+24 bit pi OK */
+        y[0] = z + pio2_1t;
+        y[1] = (z-y[0])+pio2_1t;
+    } else {    /* near pi/2, use 24+24+24 bit pi */
+        z += pio2_2;
+        y[0] = z + pio2_2t;
+        y[1] = (z-y[0])+pio2_2t;
+    }
+    return -1;
+      }
+  }
+  if(ix<=0x43490f80) { /* |x| ~<= 2^7*(pi/2), medium size */
+      t  = __gen_ocl_fabs(x);
+      n  = (int) (t*invpio2+half_value);
+      fn = (float)n;
+      r  = t-fn*pio2_1;
+      w  = fn*pio2_1t;  /* 1st round good to 40 bit */
+      if(n<32&&(ix&0xffffff00)!=npio2_hw[n-1]) {
+    y[0] = r-w; /* quick check no cancellation */
+      } else {
+          uint high;
+          j  = ix>>23;
+          y[0] = r-w;
+    GEN_OCL_GET_FLOAT_WORD(high,y[0]);
+          i = j-((high>>23)&0xff);
+          if(i>8) {  /* 2nd iteration needed, good to 57 */
+        t  = r;
+        w  = fn*pio2_2;
+        r  = t-w;
+        w  = fn*pio2_2t-((t-r)-w);
+        y[0] = r-w;
+        GEN_OCL_GET_FLOAT_WORD(high,y[0]);
+        i = j-((high>>23)&0xff);
+        if(i>25)  { /* 3rd iteration need, 74 bits acc */
+          t  = r; /* will cover all possible cases */
+          w  = fn*pio2_3;
+          r  = t-w;
+          w  = fn*pio2_3t-((t-r)-w);
+          y[0] = r-w;
+        }
+    }
+      }
+      y[1] = (r-y[0])-w;
+      if(hx<0)  {y[0] = -y[0]; y[1] = -y[1]; return -n;}
+      else   return n;
+  }
+    /*
+     * all other (large) arguments
+     */
+  if(ix>=0x7f800000) {    /* x is inf or NaN */
+      y[0]=y[1]=x-x; return 0;
+  }
+    /* set z = scalbn(|x|,ilogb(x)-7) */
+  e0  = (ix>>23)-134;   /* e0 = ilogb(z)-7; */
+  GEN_OCL_SET_FLOAT_WORD(z, ix - ((int)(e0<<23)));
+  for(i=0;i<2;i++) {
+    tx[i] = (float)((int)(z));
+    z     = (z-tx[i])*two8;
+  }
+  tx[2] = z;
+  nx = 3;
+  while(tx[nx-1]==zero) nx--; /* skip zero term */
+  n  =  __kernel_rem_pio2f(tx,y,e0,nx,2,two_over_pi);
+  if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
+  return n;
+}
+
+INLINE_OVERLOADABLE float __kernel_sinf(float x, float y, int iy)
+{
+  /* Polynomial sine kernel copied from fdlibm: x is the argument, y its low-order tail; y is ignored when iy==0. */
+const float
+half_value =  5.0000000000e-01,/* 0x3f000000 */
+S1  = -1.6666667163e-01, /* 0xbe2aaaab */
+S2  =  8.3333337680e-03, /* 0x3c088889 */
+S3  = -1.9841270114e-04, /* 0xb9500d01 */
+S4  =  2.7557314297e-06, /* 0x3638ef1b */
+S5  = -2.5050759689e-08, /* 0xb2d72f34 */
+S6  =  1.5896910177e-10; /* 0x2f2ec9d3 */
+  float z,r,v;
+  int ix;
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  ix &= 0x7fffffff;     /* bit pattern of |x| (sign bit cleared) */
+  if(ix<0x32000000)     /* |x| < 2**-27 */
+     {if((int)x==0) return x;}    /* generate inexact */
+  z =  x*x;
+  v =  z*x; /* v = x**3 */
+  r =  S2+z*(S3+z*(S4+z*(S5+z*S6)));
+  if(iy==0) return x+v*(S1+z*r); /* no tail to fold in */
+  else      return x-((z*(half_value*y-v*r)-y)-v*S1); /* same polynomial with the tail y folded in */
+}
+
+INLINE  float __kernel_cosf(float x, float y)
+{
+  /* Polynomial cosine kernel copied from fdlibm: x is the argument, y its low-order tail. */
+  const float
+  one =  1.0000000000e+00, /* 0x3f800000 */
+  C1  =  4.1666667908e-02, /* 0x3d2aaaab */
+  C2  = -1.3888889225e-03, /* 0xbab60b61 */
+  C3  =  2.4801587642e-05, /* 0x37d00d01 */
+  C4  = -2.7557314297e-07, /* 0xb493f27c */
+  C5  =  2.0875723372e-09, /* 0x310f74f6 */
+  C6  = -1.1359647598e-11; /* 0xad47d74e */
+  const float pio2_hi = 0x1.92p0, pio2_mid = 0x1.fb4p-12, pio2_low = 0x1.4442d2p-24; /* pi/2 split into three parts */
+  float a,hz,z,r,qx;
+  int ix;
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  ix &= 0x7fffffff;     /* ix = bit pattern of |x| */
+  if(ix<0x32000000) {     /* if |x| < 2**-27 */
+      if(((int)x)==0) return one;   /* generate inexact */
+  }
+
+  if(x < 0.0f) { x= -x; y = -y; } /* cosine is even: continue with |x| */
+  if(ix > 0x3f490fdb) { /* |x|>pi/4*/
+    return -__kernel_sinf(x-pio2_hi-pio2_mid-pio2_low, y, 1); /* cos(x) = -sin(x - pi/2) */
+  }
+  z  = x*x;
+  r  = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
+  if(ix < 0x3e99999a)       /* if |x| < 0.3 */
+      return one - ((float)0.5*z - (z*r - x*y));
+  else {
+      GEN_OCL_SET_FLOAT_WORD(qx,ix-0x01000000); /* x/4 */
+      hz = (float)0.5*z-qx;
+      a  = one-qx;
+      return a - (hz - (z*r-x*y));
+  }
+}
+
+INLINE_OVERLOADABLE  float sin(float x) {
+  /* Sine of x, following fdlibm: kernel call for small |x|, NaN for Inf/NaN, reduction otherwise. */
+  float rem[2], tail = 0.0;
+  int quadrant, absbits;
+
+  GEN_OCL_GET_FLOAT_WORD(absbits,x);
+  absbits &= 0x7fffffff;
+
+  /* |x| ~< pi/4: no reduction needed */
+  if(absbits <= 0x3f490fd8)
+    return __kernel_sinf(x,tail,0);
+
+  /* sin(Inf or NaN) is NaN */
+  if(absbits >= 0x7f800000)
+    return x-x;
+
+  /* reduce modulo pi/2; the low two bits of the quotient select kernel and sign */
+  quadrant = __ieee754_rem_pio2f(x,rem) & 3;
+  if(quadrant == 0)
+    return  __kernel_sinf(rem[0],rem[1],1);
+  if(quadrant == 1)
+    return  __kernel_cosf(rem[0],rem[1]);
+  if(quadrant == 2)
+    return -__kernel_sinf(rem[0],rem[1],1);
+  return -__kernel_cosf(rem[0],rem[1]);
+}
+INLINE_OVERLOADABLE  float cos(float x) {
+  /* Cosine of x, following fdlibm: kernel call for small |x|, NaN for Inf/NaN, reduction otherwise. */
+  float rem[2], tail = 0.0;
+  int quadrant, absbits;
+
+  GEN_OCL_GET_FLOAT_WORD(absbits,x);
+  absbits &= 0x7fffffff;
+
+  /* |x| ~< pi/4: no reduction needed */
+  if(absbits <= 0x3f490fd8)
+    return __kernel_cosf(x,tail);
+
+  /* cos(Inf or NaN) is NaN */
+  if(absbits >= 0x7f800000)
+    return x-x;
+
+  /* reduce modulo pi/2; the low two bits of the quotient select kernel and sign */
+  quadrant = __ieee754_rem_pio2f(x,rem) & 3;
+  if(quadrant == 0)
+    return  __kernel_cosf(rem[0],rem[1]);
+  if(quadrant == 1)
+    return  -__kernel_sinf(rem[0],rem[1],1);
+  if(quadrant == 2)
+    return -__kernel_cosf(rem[0],rem[1]);
+  return __kernel_sinf(rem[0],rem[1],1);
+}
+
+INLINE float __kernel_tanf(float x, float y, int iy)
+{
+  /* copied from fdlibm */
+        float z,r,v,w,s;
+        int ix,hx;
+        const float
+        one   =  1.0000000000e+00, /* 0x3f800000 */
+        pio4  =  7.8539812565e-01, /* 0x3f490fda */
+        pio4lo=  3.7748947079e-08; /* 0x33222168 */
+        float T[13];// =  {
+         T[0] = 3.3333334327e-01; /* 0x3eaaaaab */
+         T[1] = 1.3333334029e-01; /* 0x3e088889 */
+         T[2] = 5.3968254477e-02; /* 0x3d5d0dd1 */
+         T[3] = 2.1869488060e-02; /* 0x3cb327a4 */
+         T[4] = 8.8632395491e-03; /* 0x3c11371f */
+         T[5] = 3.5920790397e-03; /* 0x3b6b6916 */
+         T[6] = 1.4562094584e-03; /* 0x3abede48 */
+         T[7] = 5.8804126456e-04; /* 0x3a1a26c8 */
+         T[8] = 2.4646313977e-04; /* 0x398137b9 */
+         T[9] = 7.8179444245e-05; /* 0x38a3f445 */
+         T[10] = 7.1407252108e-05; /* 0x3895c07a */
+         T[11] = -1.8558637748e-05; /* 0xb79bae5f */
+         T[12] = 2.5907305826e-05; /* 0x37d95384 */
+
+
+        GEN_OCL_GET_FLOAT_WORD(hx,x);
+        ix = hx&0x7fffffff;     /* high word of |x| */
+        if(ix<0x31800000)                       /* x < 2**-28 */
+            {if((int)x==0) {                    /* generate inexact */
+                if((ix|(iy+1))==0) return one/__gen_ocl_fabs(x);
+                else return (iy==1)? x: -one/x;
+            }
+            }
+        if(ix>=0x3f2ca140) {                    /* |x|>=0.6744 */
+            if(hx<0) {x = -x; y = -y;}
+
+
+            z = pio4-x;
+            w = pio4lo-y;
+            x = z+w; y = 0.0;
+        }
+        z       =  x*x;
+        w       =  z*z;
+    /* Break x^5*(T[1]+x^2*T[2]+...) into
+     *    x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
+     *    x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
+     */
+        r = T[1]+w*(T[3]+w*(T[5]+w*(T[7]+w*(T[9]+w*T[11]))));
+        v = z*(T[2]+w*(T[4]+w*(T[6]+w*(T[8]+w*(T[10]+w*T[12])))));
+        s = z*x;
+        r = y + z*(s*(r+v)+y);
+        r += T[0]*s;
+        w = x+r;
+        if(ix>=0x3f2ca140) {
+            v = (float)iy;
+            return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r)));
+        }
+        if(iy==1) return w;
+        else {          /* if allow error up to 2 ulp
+                           simply return -1.0/(x+r) here */
+     /*  compute -1.0/(x+r) accurately */
+            float a,t;
+            int i;
+            z  = w;
+            GEN_OCL_GET_FLOAT_WORD(i,z);
+            GEN_OCL_SET_FLOAT_WORD(z,i&0xfffff000);
+            v  = r-(z - x);     /* z+v = r+x */
+            t = a  = -(float)1.0/w;     /* a = -1.0/w */
+            GEN_OCL_GET_FLOAT_WORD(i,t);
+            GEN_OCL_SET_FLOAT_WORD(t,i&0xfffff000);
+            s  = (float)1.0+t*z;
+            return t+a*(s+t*v);
+        }
+}
+
+INLINE_OVERLOADABLE float tan(float x)
+{
+  /* Tangent of x, adapted from fdlibm: small |x| goes straight to __kernel_tanf, larger |x| is reduced by __ieee754_rem_pio2f first. */
+        const float pio2_hi = 0x1.92p-0, pio2_mid = 0x1.fb4p-12, pio2_low = 0x1.4442d2p-24; /* pi/2 split into three parts */
+        const float pio4  =  7.8539812565e-01;
+        float y[2],z=0.0;
+        int n, ix;
+
+        GEN_OCL_GET_FLOAT_WORD(ix,x);
+
+    /* |x| ~< pi/4 */
+        ix &= 0x7fffffff;
+        if(ix <= 0x3f490fda) return __kernel_tanf(x,z,1);
+
+    /* tan(Inf or NaN) is NaN */
+        else if (ix>=0x7f800000) return x-x;            /* NaN */
+
+    /* argument reduction needed */
+      else {
+        n = __ieee754_rem_pio2f(x,y);
+
+        x = y[0];
+        float m = y[1];
+        int iy = 1-((n&1)<<1); /* +1 when n is even, -1 when n is odd */
+        GEN_OCL_GET_FLOAT_WORD(ix,x);
+        float sign = 1.0f;
+        if(ix < 0) { /* reduced argument is negative: work on its magnitude and remember the sign */
+          x = -x; m = -m;
+          sign = -1.0f;
+        }
+
+        if(x > pio4) {/* reduce x to less than pi/4 through (pi/2-x) */
+          float t = __kernel_tanf(pio2_hi-x+pio2_mid+pio2_low, -m, 1);
+          if(iy == -1) return sign*(-t); else return sign*1/t; /* t = tan(pi/2-x): odd n needs -1/tan(x) = -t, even n needs tan(x) = 1/t */
+        } else
+            return __kernel_tanf(y[0],y[1],1-((n&1)<<1)); /*   1 -- n even
+                                                              -1 -- n odd */
+      }
+}
+
 INLINE_OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
 INLINE_OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
   return __gen_ocl_cos(x * M_PI_F);
@@ -1266,9 +2114,129 @@ INLINE_OVERLOADABLE float lgamma_r(float x, private int *signgamp) { BODY; }
 INLINE_OVERLOADABLE float native_log10(float x) {
   return native_log2(x) * 0.3010299956f;
 }
-INLINE_OVERLOADABLE float log1p(float x) { return native_log(x + 1); }
-INLINE_OVERLOADABLE float logb(float x) { return __gen_ocl_rndd(native_log2(x)); }
-INLINE_OVERLOADABLE int ilogb(float x) { return __gen_ocl_rndd(native_log2(x)); }
+INLINE_OVERLOADABLE float log1p(float x) { /* log(1+x) in single precision */
+/*
+ *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  const float
+  ln2_hi =   6.9313812256e-01,  /* 0x3f317180 */
+  ln2_lo =   9.0580006145e-06,  /* 0x3717f7d1 */
+  two25 =    3.355443200e+07, /* 0x4c000000 */
+  Lp1 = 6.6666668653e-01, /* 3F2AAAAB */
+  Lp2 = 4.0000000596e-01, /* 3ECCCCCD */
+  Lp3 = 2.8571429849e-01, /* 3E924925 */
+  Lp4 = 2.2222198546e-01, /* 3E638E29 */
+  Lp5 = 1.8183572590e-01, /* 3E3A3325 */
+  Lp6 = 1.5313838422e-01, /* 3E1CD04F */
+  Lp7 = 1.4798198640e-01; /* 3E178897 */
+  const float zero = 0.0;
+  float hfsq,f,c,s,z,R,u;
+  int k,hx,hu,ax;
+  union {float f; unsigned i;} un;
+  un.f = x;  hx = un.i;
+  ax = hx&0x7fffffff;
+
+  k = 1; /* k != 0 marks that 1+x still has to be normalized below */
+  if (hx < 0x3ed413d7) {      /* x < 0.41422  */
+      if(ax>=0x3f800000) {    /* x <= -1.0 */
+    if(x==(float)-1.0) return -two25/zero; /* log1p(-1)=-inf */
+    else return (x-x)/(x-x);  /* log1p(x<-1)=NaN */
+      }
+      if(ax<0x31000000) {     /* |x| < 2**-29 */
+    if(two25+x>zero     /* raise inexact */
+              &&ax<0x24800000)    /* |x| < 2**-54 */
+        return x;
+    else
+        return x - x*x*(float)0.5;
+      }
+      if(hx>0||hx<=((int)0xbe95f61f)) {
+    k=0;f=x;hu=1;}  /* -0.2929<x<0.41422 */
+  }
+  if (hx >= 0x7f800000) return x+x; /* +inf or NaN: x+x propagates it */
+  if(k!=0) {
+      if(hx<0x5a000000) {
+    u  = (float)1.0+x;
+
+    un.f = u; hu = un.i;
+          k  = (hu>>23)-127;
+    /* correction term */
+          c  = (k>0)? (float)1.0-(u-x):x-(u-(float)1.0);
+    c /= u;
+      } else {
+    u  = x;
+    un.f = u; hu = un.i;
+          k  = (hu>>23)-127;
+    c  = 0;
+      }
+      hu &= 0x007fffff;
+      if(hu<0x3504f7) {
+          un.i = hu|0x3f800000; u = un.f;/* normalize u */
+      } else {
+          k += 1;
+          un.i = hu|0x3f000000; u = un.f;  /* normalize u/2 */
+          hu = (0x00800000-hu)>>2;
+      }
+      f = u-(float)1.0;
+  }
+  hfsq=(float)0.5*f*f;
+  if(hu==0) { /* |f| < 2**-20 */
+      if(f==zero) { if(k==0) return zero;
+      else {c += k*ln2_lo; return k*ln2_hi+c;} }
+      R = hfsq*((float)1.0-(float)0.66666666666666666*f);
+      if(k==0) return f-R; else
+             return k*ln2_hi-((R-(k*ln2_lo+c))-f);
+  }
+  s = f/((float)2.0+f);
+  z = s*s;
+  R = z*(Lp1+z*(Lp2+z*(Lp3+z*(Lp4+z*(Lp5+z*(Lp6+z*Lp7))))));
+  if(k==0) return f-(hfsq-s*(hfsq+R)); else
+     return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f);
+
+}
+INLINE_OVERLOADABLE float logb(float x) {
+  /* Exponent of x as a float, read from the biased exponent field. */
+  union {float f; unsigned i;} pun;
+  int biased;
+  pun.f = x;
+  biased = (int)((pun.i >> 23) & 0xff);
+  /* zero and subnormal inputs both report -infinity */
+  if(biased == 0)
+    return -INFINITY;
+  /* infinity gives +infinity, NaN stays NaN */
+  if(biased == 0xff)
+    return x*x;
+  return (float)(biased-127);
+}
+#define FP_ILOGB0 (-0x7FFFFFFF-1)
+#define FP_ILOGBNAN FP_ILOGB0
+INLINE_OVERLOADABLE int ilogb(float x) {
+  /* Integer exponent of x; subnormals are normalized by shifting. */
+  union { int i; float f; } pun;
+  int mag, frac, expo;
+  if (isnan(x))
+    return FP_ILOGBNAN;
+  if (isinf(x))
+    return 0x7FFFFFFF;
+  pun.f = x;
+  mag = pun.i & 0x7fffffff;          /* drop the sign bit */
+  if (mag == 0)
+    return FP_ILOGB0;
+  if (mag >= 0x800000)               /* normal number */
+    return (mag >> 23) - 127;
+  /* subnormal: shift the fraction up until bit 23 is set, counting the exponent down */
+  frac = mag & 0x7FFFFF;
+  for (expo = -126; frac < 0x800000; expo--)
+    frac <<= 1;
+  return expo;
+}
 INLINE_OVERLOADABLE float nan(uint code) {
   return NAN;
 }
@@ -1280,116 +2248,212 @@ INLINE_OVERLOADABLE float native_tan(float x) {
 INLINE_OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
   return native_tan(x * M_PI_F);
 }
-INLINE_OVERLOADABLE float native_exp(float x) { return __gen_ocl_pow(M_E_F, x); }
+INLINE_OVERLOADABLE float native_exp(float x) { return __gen_ocl_exp(x); }
 INLINE_OVERLOADABLE float native_exp2(float x) { return __gen_ocl_pow(2, x); }
 INLINE_OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) { return __gen_ocl_pow(M_E_F, x) - 1; }
 INLINE_OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
-  return __gen_ocl_pow(x, 0.3333333333f);
+  /* Cube root of x, copied from fdlibm; NaN/inf give x+x, 0 gives x, denormals a signed 0. */
+  const unsigned
+  B1 = 709958130, /* B1 = (84+2/3-0.03306235651)*2**23 */
+  B2 = 642849266; /* B2 = (76+2/3-0.03306235651)*2**23 */
+
+  const float
+  C =  5.4285717010e-01, /* 19/35     = 0x3f0af8b0 */
+  D = -7.0530611277e-01, /* -864/1225 = 0xbf348ef1 */
+  E =  1.4142856598e+00, /* 99/70     = 0x3fb50750 */
+  F =  1.6071428061e+00, /* 45/28     = 0x3fcdb6db */
+  G =  3.5714286566e-01; /* 5/14      = 0x3eb6db6e */
+
+  float r,s,t, w;
+  int hx;
+  uint sign;
+  uint high;
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  sign=hx&0x80000000;     /* sign= sign(x) */
+  hx  ^=sign;
+  if(hx>=0x7f800000) return(x+x); /* cbrt(NaN,INF) is itself */
+  if(hx==0)
+      return(x);    /* cbrt(0) is itself */
+
+  GEN_OCL_SET_FLOAT_WORD(x,hx); /* x <- |x| */
+    /* rough cbrt to 5 bits */
+  if(hx<0x00800000)     /* subnormal number */
+    {
+    /* fdlibm rescales denormals by 2**24 (using B2) here; this port skips that */
+    /* and returns a zero carrying the sign of x. Must compare, not assign.     */
+      t = (sign == 0) ? 0.0f : -0.0f;
+      return t;
+    }
+  else
+    GEN_OCL_SET_FLOAT_WORD(t,hx/3+B1);
+
+
+    /* new cbrt to 23 bits */
+  r=t*t/x;
+  s=C+r*t;
+  t*=G+F/(s+E+D/s);
+    /* one step newton iteration to 53 bits with error less than 0.667 ulps */
+  s=t*t;    /* t*t is exact */
+  r=x/s;
+  w=t+t;
+  r=(r-t)/(w+r);  /* r-s is exact */
+  t=t+t*r;
+
+    /* restore the sign bit */
+  GEN_OCL_GET_FLOAT_WORD(high,t);
+  GEN_OCL_SET_FLOAT_WORD(t,high|sign);
+  return(t);
 }
+
 #define BODY \
-  *cosval = native_cos(x); \
-  return native_sin(x);
+  *cosval = cos(x); /* the cosine is stored through the pointer argument */ \
+  return sin(x); /* the sine is the return value */
 INLINE_OVERLOADABLE float sincos(float x, global float *cosval) { BODY; }
 INLINE_OVERLOADABLE float sincos(float x, local float *cosval) { BODY; }
 INLINE_OVERLOADABLE float sincos(float x, private float *cosval) { BODY; }
 #undef BODY
 
-INLINE_OVERLOADABLE float __gen_ocl_internal_sinh(float x) {
-  return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
-  return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
-  float y = native_exp(-2 * x);
-  return (1 - y) / (1 + y);
+INLINE float __gen_ocl_asin_util(float x) { /* returns x + x*p(t)/q(t) with t = x*x */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  float
+  pS0 =  1.66666666666666657415e-01,
+  pS1 = -3.25565818622400915405e-01,
+  pS2 =  2.01212532134862925881e-01,
+  pS3 = -4.00555345006794114027e-02,
+  pS4 =  7.91534994289814532176e-04,
+  pS5 =  3.47933107596021167570e-05,
+  qS1 = -2.40339491173441421878e+00,
+  qS2 =  2.02094576023350569471e+00,
+  qS3 = -6.88283971605453293030e-01,
+  qS4 =  7.70381505559019352791e-02;
+
+  float t = x*x;
+  float p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*(pS4+t*pS5))))); /* numerator polynomial (pS*) */
+  float q = 1.0+t*(qS1+t*(qS2+t*(qS3+t*qS4))); /* denominator polynomial (qS*) */
+  float w = p / q;
+  return x + x*w;
 }
 
-typedef union
-{
-  float value;
-  int word;
-} ieee_float_shape_type;
-
-#ifndef GET_FLOAT_WORD
-#define GET_FLOAT_WORD(i,d)         \
-do {                                \
-  ieee_float_shape_type gf_u;       \
-  gf_u.value = (d);                 \
-  (i) = gf_u.word;                  \
-} while (0)
-#endif
-
 INLINE_OVERLOADABLE float __gen_ocl_internal_asin(float x) {
-  int hx, ix;
-  GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff;
+  uint ix; /* bit pattern of |x| */
+  union { uint i; float f; } u; /* used to read the raw bits of x */
+  u.f = x;
+  ix = u.i & 0x7fffffff;
   if(ix == 0x3f800000) {
     return x * M_PI_2_F;  /* asin(|1|)=+-pi/2 with inexact */
   }
   if(ix > 0x3f800000) {            /* |x|>= 1 */
-    return (x-x) / (x-x);          /* asin(|x|>1) is NaN */
+    return  NAN;          /* asin(|x|>1) is NaN */
   }
+
   if(ix < 0x32000000) {            /* if |x| < 2**-27 */
     if(HUGE_VALF + x > FLT_ONE) return x;   /* return x with inexact if x!=0*/
   }
-  /* 1 > |x| >= 2**-27 */
-  float sum = x, c = x, m = 1.0;
-  int n = 1;
-  do
-  {
-    c *= (2 * n - 1) * x * x;
-    m *= (2 * n);
-    sum += ( c / m / (2 * n + 1));
-    n++;
-  }while( n < 30);
-  return sum;
+
+  if(x < -0.5) { /* -1 < x < -0.5: helper is evaluated at sqrt((1+x)/2) */
+    return 2 * __gen_ocl_asin_util(native_sqrt((1+x) / 2)) - M_PI_2_F;
+  } else if(x > 0.5) { /* 0.5 < x < 1: helper is evaluated at sqrt((1-x)/2) */
+    return M_PI_2_F - 2 * __gen_ocl_asin_util(native_sqrt((1-x) / 2));
+  } else { /* |x| <= 0.5: helper is applied to x directly */
+    return __gen_ocl_asin_util(x);
+  }
 }
 INLINE_OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
   return __gen_ocl_internal_asin(x) / M_PI_F;
 }
 INLINE_OVERLOADABLE float __gen_ocl_internal_acos(float x) {
-  return M_PI_2_F - __gen_ocl_internal_asin(x);
+  if(x > 0.5) /* x > 0.5: computed directly as 2*asin_util(sqrt((1-x)/2)) */
+    return 2 * __gen_ocl_asin_util(native_sqrt((1-x)/2));
+  else /* otherwise (and for NaN): pi/2 - asin(x) */
+    return M_PI_2_F - __gen_ocl_internal_asin(x);
 }
 INLINE_OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
   return __gen_ocl_internal_acos(x) / M_PI_F;
 }
 INLINE_OVERLOADABLE float __gen_ocl_internal_atan(float x) {
-  float a = 0, c = 1;
-  if (x <= -1) {
-    a = - M_PI_2_F;
-    x = 1 / x;
-    c = -1;
-  }
-  if (x >= 1) {
-    a = M_PI_2_F;
-    x = 1 / x;
-    c = -1;
-  }
-  a += c*x;
-  int i;
-  int sign;
-  for(i=3, sign=-1; i<63; i+=2, sign=-sign) {
-    a += c*sign*__gen_ocl_pow(x,i)/i;
+  /* atan(x), copied from fdlibm: reduce |x| against a table entry, then a polynomial */
+  float atanhi[4];
+  atanhi[0] = 4.6364760399e-01; /* atan(0.5)hi 0x3eed6338 */
+  atanhi[1] = 7.8539812565e-01; /* atan(1.0)hi 0x3f490fda */
+  atanhi[2] = 9.8279368877e-01; /* atan(1.5)hi 0x3f7b985e */
+  atanhi[3] = 1.5707962513e+00; /* atan(inf)hi 0x3fc90fda */
+
+  float atanlo[4];
+  atanlo[0] = 5.0121582440e-09; /* atan(0.5)lo 0x31ac3769 */
+  atanlo[1] =  3.7748947079e-08; /* atan(1.0)lo 0x33222168 */
+  atanlo[2] =  3.4473217170e-08; /* atan(1.5)lo 0x33140fb4 */
+  atanlo[3] =  7.5497894159e-08; /* atan(inf)lo 0x33a22168 */
+
+  float aT[11];
+  aT[0] = 3.3333334327e-01; /* 0x3eaaaaaa */
+  aT[1] =  -2.0000000298e-01; /* 0xbe4ccccd */
+  aT[2] =   1.4285714924e-01; /* 0x3e124925 */
+  aT[3] =  -1.1111110449e-01; /* 0xbde38e38 */
+  aT[4] =   9.0908870101e-02; /* 0x3dba2e6e */
+  aT[5] =  -7.6918758452e-02; /* 0xbd9d8795 */
+  aT[6] =   6.6610731184e-02; /* 0x3d886b35 */
+  aT[7] =  -5.8335702866e-02; /* 0xbd6ef16b */
+  aT[8] =   4.9768779427e-02; /* 0x3d4bda59 */
+  aT[9] =  -3.6531571299e-02; /* 0xbd15a221 */
+  aT[10] =   1.6285819933e-02; /* 0x3c8569d7 */
+  const float one = 1.0, huge = 1.0e30;
+
+  float w,s1,s2,z;
+  int ix,hx,id;
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff;
+  if(ix>=0x50800000) {  /* if |x| >= 2^34 */
+      if(ix>0x7f800000)
+    return x+x;   /* NaN */
+      if(hx>0) return  atanhi[3]+atanlo[3];
+      else     return -atanhi[3]-atanlo[3];
+  } if (ix < 0x3ee00000) {  /* |x| < 0.4375 */
+      if (ix < 0x31000000) {  /* |x| < 2^-29 */
+    if(huge+x>one) return x;  /* raise inexact */
+      }
+      id = -1;
+  } else {
+  x = __gen_ocl_fabs(x);
+  if (ix < 0x3f980000) {    /* |x| < 1.1875 */
+      if (ix < 0x3f300000) {  /* 7/16 <=|x|<11/16 */
+    id = 0; x = ((float)2.0*x-one)/((float)2.0+x);
+      } else {      /* 11/16<=|x|< 19/16 */
+    id = 1; x  = (x-one)/(x+one);
+      }
+  } else {
+      if (ix < 0x401c0000) {  /* |x| < 2.4375 */
+    id = 2; x  = (x-(float)1.5)/(one+(float)1.5*x);
+      } else {      /* 2.4375 <= |x| < 2^34 */
+    id = 3; x  = -(float)1.0/x;
+      }
+  }}
+    /* end of argument reduction */
+  z = x*x;
+  w = z*z;
+    /* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
+  s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*(aT[6]+w*(aT[8]+w*aT[10])))));
+  s2 = w*(aT[1]+w*(aT[3]+w*(aT[5]+w*(aT[7]+w*aT[9]))));
+  if (id<0) return x - x*(s1+s2);
+  else {
+      z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
+      return (hx<0)? -z:z;
   }
-  return a;
+
 }
 INLINE_OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
   return __gen_ocl_internal_atan(x) / M_PI_F;
 }
-INLINE_OVERLOADABLE float __gen_ocl_internal_asinh(float x) {
-  return native_log(x + native_sqrt(x * x + 1));
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
-  return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
-  return 0.5f * native_sqrt((1 + x) / (1 - x));
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
-  return x * y < 0 ? -x : x;
-}
 INLINE_OVERLOADABLE float __gen_ocl_internal_erf(float x) {
   return M_2_SQRTPI_F * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 10 - __gen_ocl_pow(x, 7) / 42 + __gen_ocl_pow(x, 9) / 216);
 }
@@ -1401,25 +2465,86 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
 #define sqrt native_sqrt
 INLINE_OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
 INLINE_OVERLOADABLE float __gen_ocl_internal_atan2(float y, float x) {
-  uint hx = *(uint *)(&x), ix = hx & 0x7FFFFFFF;
-  uint hy = *(uint *)(&y), iy = hy & 0x7FFFFFFF;
-  if (ix > 0x7F800000 || iy > 0x7F800000)
-    return nan(0u);
-  if (ix == 0) {
-    if (y > 0)
-      return M_PI_2_F;
-    if (y < 0)
-      return - M_PI_2_F;
-    return nan(0u);
-  } else {
-    float z = __gen_ocl_internal_atan(y / x);
-    if (x > 0)
-      return z;
-    if (y >= 0)
-      return M_PI_F + z;
-    return - M_PI_F + z;
+  /* atan2(y,x), copied from fdlibm: special cases first, then atan(|y/x|) placed by quadrant */
+  float z;
+  int k,m,hx,hy,ix,iy;
+  const float
+  tiny  = 1.0e-30,
+  zero  = 0.0,
+  pi_o_4  = 7.8539818525e-01, /* 0x3f490fdb */
+  pi_o_2  = 1.5707963705e+00, /* 0x3fc90fdb */
+  pi      = 3.1415927410e+00, /* 0x40490fdb */
+  pi_lo   = -8.7422776573e-08; /* 0xb3bbbd2e */
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff;
+  GEN_OCL_GET_FLOAT_WORD(hy,y);
+  iy = hy&0x7fffffff;
+
+  if((ix>0x7f800000)||
+     (iy>0x7f800000)) /* x or y is NaN */
+     return x+y;
+  if(hx==0x3f800000) return z=__gen_ocl_internal_atan(y);   /* x=1.0 */
+  m = ((hy>>31)&1)|((hx>>30)&2);  /* 2*sign(x)+sign(y) */
+
+    /* when y = 0 */
+  if(iy==0) {
+      switch(m) {
+    case 0:
+    case 1: return y;   /* atan(+-0,+anything)=+-0 */
+    case 2: return  pi+tiny;/* atan(+0,-anything) = pi */
+    case 3: return -pi-tiny;/* atan(-0,-anything) =-pi */
+      }
+  }
+    /* when x = 0 */
+  if(ix==0) return (hy<0)?  -pi_o_2-tiny: pi_o_2+tiny;
+
+  /* both are denorms. Gen does not support denorms, so convert them to normal floats (the ratio y/x is kept) */
+  if(ix <= 0x7fffff && iy <= 0x7fffff) {
+    x = (float)(ix) * (1.0f - ((hx>>30) & 0x2));
+    y = (float)(iy) * (1.0f - ((hy>>30) & 0x2));
+  }
+
+    /* when x is INF */
+  if(ix==0x7f800000) {
+      if(iy==0x7f800000) {
+    switch(m) {
+        case 0: return  pi_o_4+tiny;/* atan(+INF,+INF) */
+        case 1: return -pi_o_4-tiny;/* atan(-INF,+INF) */
+        case 2: return  (float)3.0*pi_o_4+tiny;/*atan(+INF,-INF)*/
+        case 3: return (float)-3.0*pi_o_4-tiny;/*atan(-INF,-INF)*/
+    }
+      } else {
+    switch(m) {
+        case 0: return  zero  ; /* atan(+...,+INF) */
+        case 1: return -zero  ; /* atan(-...,+INF) */
+        case 2: return  pi+tiny  ;  /* atan(+...,-INF) */
+        case 3: return -pi-tiny  ;  /* atan(-...,-INF) */
+    }
+      }
+  }
+    /* when y is INF */
+  if(iy==0x7f800000) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
+
+    /* compute y/x */
+  k = (iy-ix)>>23; /* difference of the two exponent fields */
+  if(k > 60) z=pi_o_2+(float)0.5*pi_lo;   /* |y/x| >  2**60 */
+  else if(hx<0&&k<-60) z=0.0;   /* |y|/x < -2**60 */
+  else z=__gen_ocl_internal_atan(__gen_ocl_fabs(y/x)); /* safe to do y/x */
+  switch (m) {
+      case 0: return       z  ; /* atan(+,+) */
+      case 1: {
+              uint zh;
+          GEN_OCL_GET_FLOAT_WORD(zh,z);
+          GEN_OCL_SET_FLOAT_WORD(z,zh ^ 0x80000000);
+        }
+        return       z  ; /* atan(-,+) */
+      case 2: return  pi-(z-pi_lo);/* atan(+,-) */
+      default: /* case 3 */
+            return  (z-pi_lo)-pi;/* atan(-,-) */
   }
 }
+
 INLINE_OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) {
   uint ix = as_uint(x), iy = as_uint(y),
        pos_zero = 0, neg_zero = 0x80000000u,
@@ -1482,33 +2607,553 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) {
 }
 INLINE_OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); }
 INLINE_OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_round(float x) { return __gen_ocl_rnde(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_round(float x) { /* halves move away from zero */
+  const float truncated = __gen_ocl_rndz(x);
+  const float frac = __gen_ocl_fabs(x - truncated);
+  /* A fractional part of at least one half moves the result one unit in x's direction. */
+  return (frac >= 0.5f) ? truncated + __gen_ocl_internal_copysign(1.f, x) : truncated;
+}
 INLINE_OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_log(float x)   { return native_log(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_log2(float x)  { return native_log2(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_log10(float x) { return native_log10(x); }
-INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x)   { return native_exp(x); }
 INLINE_OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
-INLINE_OVERLOADABLE float fmod(float x, float y) { return x-y*__gen_ocl_rndz(x/y); }
-INLINE_OVERLOADABLE float remainder(float x, float y) { return x-y*__gen_ocl_rnde(x/y); }
 INLINE_OVERLOADABLE float __gen_ocl_internal_rint(float x) {
-  return 2 * __gen_ocl_internal_round(x / 2);
+  return __gen_ocl_rnde(x); /* NOTE(review): rnde presumably rounds half to even - confirm */
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x) {
+  //exp(x): for |x| > 128 use the native instruction, which has enough precision there
+  if (x > 128 || x < -128)
+  {
+    return native_exp(x);
+  }
+
+  float o_threshold = 8.8721679688e+01,  /* 0x42b17180 */
+  u_threshold = -1.0397208405e+02,  /* 0xc2cff1b5 */
+  twom100 = 7.8886090522e-31, 	 /* 2**-100=0x0d800000 */
+  ivln2	 =	1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  one = 1.0,
+  huge = 1.0e+30,
+  P1 = 1.6666667163e-01, /* 0x3e2aaaab */
+  P2 = -2.7777778450e-03, /* 0xbb360b61 */
+  P3 = 6.6137559770e-05, /* 0x388ab355 */
+  P4 = -1.6533901999e-06, /* 0xb5ddea0e */
+  P5 =	4.1381369442e-08; /* 0x3331bb4c */
+  float ln2HI[2],ln2LO[2],halF[2];
+  float y,hi=0.0,lo=0.0,c,t;
+  int k=0,xsb;
+  unsigned hx;
+  ln2HI[0] = 6.9313812256e-01;	/* 0x3f317180 */
+  ln2HI[1] = -6.9313812256e-01;	/* 0xbf317180 */
+  ln2LO[0] = 9.0580006145e-06;  	/* 0x3717f7d1 */
+  ln2LO[1] = -9.0580006145e-06; /* 0xb717f7d1 */
+  halF[0] = 0.5;
+  halF[1] =	-0.5;
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  xsb = (hx>>31)&1;		/* sign bit of x */
+  hx &= 0x7fffffff;		/* high word of |x| */
+
+  /* filter out non-finite argument */
+  if(hx >= 0x42b17218) {			/* if |x|>=88.721... */
+    if(hx>0x7f800000)
+      return x+x;			/* NaN */
+    if(hx==0x7f800000)
+      return (xsb==0)? x:0.0; 	/* exp(+-inf)={inf,0} */
+    if(x > o_threshold) return huge*huge; /* overflow */
+    if(x < u_threshold) return twom100*twom100; /* underflow */
+  }
+  /* argument reduction */
+  if(hx > 0x3eb17218) {		/* if  |x| > 0.5 ln2 */
+    if(hx < 0x3F851592) {	/* and |x| < 1.5 ln2 */
+      hi = x-ln2HI[xsb]; lo=ln2LO[xsb]; k = 1-xsb-xsb;
+    } else {
+      k  = ivln2*x+halF[xsb];
+      t  = k;
+      hi = x - t*ln2HI[0];	/* t*ln2HI is exact here */
+      lo = t*ln2LO[0];
+    }
+    x  = hi - lo;
+  }
+  else if(hx < 0x31800000)  { /* when |x|<2**-28 */
+    if(huge+x>one) return one+x;/* trigger inexact */
+  }
+  else k = 0;
+
+  /* x is now in primary range */
+  t  = x*x;
+  c  = x - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+  if(k==0)
+    return one-((x*c)/(c-(float)2.0)-x);
+  else
+    y = one-((lo-(x*c)/((float)2.0-c))-hi);
+  if(k >= -125) {
+    unsigned hy;
+    GEN_OCL_GET_FLOAT_WORD(hy,y);
+    GEN_OCL_SET_FLOAT_WORD(y,hy+(k<<23));	/* add k to y's exponent */
+    return y;
+  } else {
+    unsigned hy;
+    GEN_OCL_GET_FLOAT_WORD(hy,y);
+    GEN_OCL_SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k+100 to y's exponent; twom100 below takes the 100 back */
+    return y*twom100;
+  }
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_fmod (float x, float y) {
+  /* fmod(x,y) worked out on the raw bit patterns, result keeps x's sign; replaces x-y*__gen_ocl_rndz(x/y) */
+  float one = 1.0;
+  float Zero[2];
+  int n,hx,hy,hz,ix,iy,sx,i;
+  Zero[0] = 0.0;
+  Zero[1] = -0.0;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_GET_FLOAT_WORD(hy,y);
+  sx = hx&0x80000000;		/* sign of x */
+  hx ^=sx;		/* |x| */
+  hy &= 0x7fffffff;	/* |y| */
+  /* purge off exception values */
+  if(hy==0||(hx>=0x7f800000)||		/* y=0,or x not finite */
+  (hy>0x7f800000))			/* or y is NaN */
+    return (x*y)/(x*y);
+  if(hx<hy) return x;			/* |x|<|y| return x */
+  if(hx==hy)
+    return Zero[(unsigned)sx>>31];	/* |x|=|y| return x*0*/
+
+  /* determine ix = ilogb(x) */
+  if(hx<0x00800000) {	/* subnormal x */
+    for (ix = -126,i=(hx<<8); i>0; i<<=1) ix -=1;
+  } else ix = (hx>>23)-127;
+
+  /* determine iy = ilogb(y) */
+  if(hy<0x00800000) {	/* subnormal y */
+    for (iy = -126,i=(hy<<8); i>=0; i<<=1) iy -=1;
+  } else iy = (hy>>23)-127;
+
+  /* set up {hx,lx}, {hy,ly} and align y to x */
+  if(ix >= -126)
+    hx = 0x00800000|(0x007fffff&hx);
+  else {		/* subnormal x, shift x to normal */
+    n = -126-ix;
+    hx = hx<<n;
+  }
+  if(iy >= -126)
+    hy = 0x00800000|(0x007fffff&hy);
+  else {		/* subnormal y, shift y to normal */
+    n = -126-iy;
+    hy = hy<<n;
+  }
+  /* fix point fmod */
+  n = ix - iy;
+  while(n--) {
+    hz=hx-hy;
+    if(hz<0){hx = hx+hx;}
+    else {
+      if(hz==0)		/* return sign(x)*0 */
+        return Zero[(unsigned)sx>>31];
+      hx = hz+hz;
+    }
+  }
+  hz=hx-hy;
+  if(hz>=0) {hx=hz;}
+
+    /* convert back to floating value and restore the sign */
+  if(hx==0)			/* return sign(x)*0 */
+    return Zero[(unsigned)sx>>31];
+  while(hx<0x00800000) {		/* normalize x */
+    hx = hx+hx;
+    iy -= 1;
+  }
+  if(iy>= -126) {		/* normalize output */
+    hx = ((hx-0x00800000)|((iy+127)<<23));
+	GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
+   } else {		/* subnormal output */
+     n = -126 - iy;
+     hx >>= n;
+     GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
+     x *= one;		/* create necessary signal */
+  }
+  return x;		/* exact output */
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
+  /* exp(x)-1 using a ln2 argument reduction; replaces __gen_ocl_pow(M_E_F, x) - 1 */
+  float	Q1 = -3.3333335072e-02, /* 0xbd088889 */
+  ln2_hi = 6.9313812256e-01,	/* 0x3f317180 */
+  ln2_lo = 9.0580006145e-06,	/* 0x3717f7d1 */
+  Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
+  Q3 = -7.9365076090e-05, /* 0xb8a670cd */
+  Q4 = 4.0082177293e-06, /* 0x36867e54 */
+  Q5 = -2.0109921195e-07, /* 0xb457edbb */
+  huge = 1.0e30,
+  tiny = 1.0e-30,
+  ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  one	=  1.0,
+  o_threshold=  8.8721679688e+01;  /* 0x42b17180 */
+  float y,hi,lo,c,t,e,hxs,hfx,r1;
+  int k,xsb;
+  int hx;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  xsb = hx&0x80000000;
+  /* sign bit of x */
+  //if(xsb==0)
+  //y=x;
+  //else
+  //y= -x; /* y = |x| */
+  y = __gen_ocl_internal_fabs(x);
+  hx &= 0x7fffffff;		/* high word of |x| */
+  /* filter out huge and non-finite argument */
+  if(hx >= 0x4195b844) {			/* if |x|>=27*ln2 */
+    if(hx >= 0x42b17218) {		/* if |x|>=88.721... */
+      if(hx>0x7f800000)
+        return x+x; 	 /* NaN */
+      if(hx==0x7f800000)
+        return (xsb==0)? x:-1.0;/* exp(+-inf)={inf,-1} */
+      if(x > o_threshold)
+        return huge*huge; /* overflow */
+    }
+    if(xsb!=0) { /* x < -27*ln2, return -1.0 with inexact */
+      if(x+tiny<(float)0.0)	/* raise inexact */
+        return tiny-one;	/* return -1 */
+    }
+  }
+  /* argument reduction */
+  if(hx > 0x3eb17218) {/* if  |x| > 0.5 ln2 */
+    if(hx < 0x3F851592) {/* and |x| < 1.5 ln2 */
+      if(xsb==0){
+        hi = x - ln2_hi; lo = ln2_lo;  k =  1;
+      }	else {
+        hi = x + ln2_hi; lo = -ln2_lo;  k = -1;
+      }
+    } else {
+      k  = ivln2*x+((xsb==0)?(float)0.5:(float)-0.5);
+      t  = k;
+      hi = x - t*ln2_hi;/* t*ln2_hi is exact here */
+      lo = t*ln2_lo;
+    }
+    x  = hi - lo;
+    c  = (hi-x)-lo;
+  } else if(hx < 0x33000000) {	/* when |x|<2**-25, return x */
+    //t = huge+x; /* return x with inexact flags when x!=0 */
+    //return x - (t-(huge+x));
+    return x;
+  } else k = 0;
+  /* x is now in primary range */
+  hfx = (float)0.5*x;
+  hxs = x*hfx;
+  r1 = one+hxs*(Q1+hxs*(Q2+hxs*(Q3+hxs*(Q4+hxs*Q5))));
+  t = (float)3.0-r1*hfx;
+  e = hxs*((r1-t)/((float)6.0 - x*t));
+  if(k==0)
+    return x - (x*e-hxs);		/* c is 0 */
+  else{
+    e = (x*(e-c)-c);
+    e -= hxs;
+    if(k== -1)return (float)0.5*(x-e)-(float)0.5;
+    if(k==1){
+      if(x < (float)-0.25)
+        return -(float)2.0*(e-(x+(float)0.5));
+      else
+        return  (one+(float)2.0*(x-e));
+    }
+    if (k <= -2 || k>56) {	 /* suffice to return exp(x)-1 */
+      int i;
+      y = one-(e-x);
+      GEN_OCL_GET_FLOAT_WORD(i,y);
+      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
+      return y-one;
+    }
+    t = one;
+    if(k<23) {
+      int i;
+      GEN_OCL_SET_FLOAT_WORD(t,0x3f800000 - (0x1000000>>k)); /* t=1-2^-k */
+      y = t-(e-x);
+      GEN_OCL_GET_FLOAT_WORD(i,y);
+      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
+    } else {
+      int i;
+      GEN_OCL_SET_FLOAT_WORD(t,((0x7f-k)<<23));	/* 2^-k */
+      y = x-(e+t);
+      y += one;
+      GEN_OCL_GET_FLOAT_WORD(i,y);
+      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
+    }
+  }
+  return y;
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
+  /* acosh(x) with a separate formula per range of x; replaces the native_log/native_sqrt one-liner */
+  float one	= 1.0,
+  ln2	= 6.9314718246e-01;/* 0x3f317218 */
+  float t;
+  int hx;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  if(hx<0x3f800000) {	/* x < 1 */
+    return (x-x)/(x-x);
+  } else if(hx >=0x4d800000) {	/* x > 2**28 */
+    if(hx >=0x7f800000) {/* x is inf or NaN */
+      return x+x;
+    } else
+      return __gen_ocl_internal_log(x)+ln2;/* acosh(huge)=log(2x) */
+  } else if (hx==0x3f800000) {
+    return 0.0;			/* acosh(1) = 0 */
+  } else if (hx > 0x40000000) {	/* 2**28 > x > 2 */
+    t=x*x;
+    return __gen_ocl_internal_log((float)2.0*x-one/(x+__gen_ocl_sqrt(t-one)));			
+  } else {			/* 1<x<2 */
+    t = x-one;
+    return log1p(t+__gen_ocl_sqrt((float)2.0*t+t*t));
+  }
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_asinh(float x){
+  /* asinh(x): computed on |x| per range, sign of x copied onto the result; replaces the native_log form */
+  float one =  1.0000000000e+00, /* 0x3F800000 */
+  ln2 =  6.9314718246e-01, /* 0x3f317218 */
+  huge=  1.0000000000e+30;
+  float w;
+  int hx,ix;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff;
+  if(ix< 0x38000000) {	/* |x|<2**-14 */
+    if(huge+x>one) return x;	/* return x inexact except 0 */
+  }
+  if(ix>0x47000000) {/* |x| > 2**14 */
+    if(ix>=0x7f800000) return x+x;/* x is inf or NaN */
+    w = __gen_ocl_internal_log(__gen_ocl_internal_fabs(x))+ln2;
+  } else {
+    float xa = __gen_ocl_internal_fabs(x);
+    if (ix>0x40000000) {/* 2**14 > |x| > 2.0 */
+      w = __gen_ocl_internal_log(2.0f*xa+one/(__gen_ocl_sqrt(xa*xa+one)+xa));
+    } else {		/* 2.0 > |x| > 2**-14 */
+      float t = xa*xa;
+      w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
+    }
+  }
+  return __gen_ocl_internal_copysign(w, x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_sinh(float x){
+  /* sinh(x) built from expm1/exp of |x| per range, h carries the sign; replaces the native_exp quotient */
+  float one = 1.0,
+  shuge = 1.0e37;
+  float t,w,h;
+  int ix,jx;
+  GEN_OCL_GET_FLOAT_WORD(jx,x);
+  ix = jx&0x7fffffff;
+  /* x is INF or NaN */
+  if(ix>=0x7f800000) return x+x;
+  h = 0.5;
+  if (jx<0) h = -h;
+  /* |x| in [0,22], return sign(x)*0.5*(E+E/(E+1))) */
+  if (ix < 0x41b00000) {		/* |x|<22 */
+    if (ix<0x31800000)	/* |x|<2**-28 */
+      if(shuge+x>one) return x;/* sinh(tiny) = tiny with inexact */
+    t = __gen_ocl_internal_expm1(__gen_ocl_internal_fabs(x));
+    if(ix<0x3f800000) return h*((float)2.0*t-t*t/(t+one));
+      return h*(t+t/(t+one));
+  }
+  /* |x| in [22, log(maxdouble)] return 0.5*exp(|x|) */
+  if (ix < 0x42b17180)  return h*__gen_ocl_internal_exp(__gen_ocl_internal_fabs(x));
+  /* |x| in [log(maxdouble), overflowthreshold] */
+  if (ix<=0x42b2d4fc) {
+    w = __gen_ocl_internal_exp((float)0.5*__gen_ocl_internal_fabs(x));
+    t = h*w;
+    return t*w;
+  }
+  /* |x| > overflowthreshold, sinh(x) overflow */
+  return x*shuge;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
+  /* tanh(x): built from expm1 for |x| < 22 and +-(1 - tiny) beyond that; replaces */
+  /* the earlier (1 - y) / (1 + y) form with y = native_exp(-2 * x).              */
+  float one=1.0, two=2.0, tiny = 1.0e-30;
+  float t,z;
+  int jx,ix;
+  GEN_OCL_GET_FLOAT_WORD(jx,x);
+  ix = jx&0x7fffffff;
+  /* x is INF or NaN */
+  if(ix>=0x7f800000) {
+    if (jx>=0)
+      return one/x+one; /* tanh(+-inf)=+-1 */
+    else
+      return one/x-one; /* tanh(NaN) = NaN */
+  }
+
+  if (ix < 0x41b00000) { /* |x|<22 */
+    if (ix == 0)
+      return x;		/* x == +-0 */
+    if (ix<0x24000000) 	/* |x|<2**-55 */
+      return x*(one+x);    	/* tanh(small) = small */
+    if (ix>=0x3f800000) {	/* |x|>=1  */
+      t = __gen_ocl_internal_expm1(two*__gen_ocl_internal_fabs(x));
+      z = one - two/(t+two);
+    } else {
+      t = __gen_ocl_internal_expm1(-two*__gen_ocl_internal_fabs(x));
+      z= -t/(t+two);
+    }
+  } else { /* |x| > 22, return +-1 */
+    z = one - tiny;		/* raised inexact flag */
+  }
+  return (jx>=0)? z: -z;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
+  /* cosh(x) built from expm1/exp of |x| per range; replaces the native_exp quotient */
+  float halF = 0.5,
+  huge = 1.0e+30,
+  tiny = 1.0e-30,
+  one = 1.0;
+  float t,w;
+  int ix;
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  ix &= 0x7fffffff;
+  /* |x| in [0,22] */
+  if (ix < 0x41b00000) {
+    /* |x| in [0,0.5*ln2], return 1+expm1(|x|)^2/(2*exp(|x|)) */
+    if(ix<0x3eb17218) {
+      t = __gen_ocl_internal_expm1(__gen_ocl_fabs(x));
+      w = one+t;
+      if (ix<0x24000000) return w;	/* cosh(tiny) = 1 */
+      return one+(t*t)/(w+w);
+    }
+    /* |x| in [0.5*ln2,22], return (exp(|x|)+1/exp(|x|))/2; */
+    t = __gen_ocl_internal_exp(__gen_ocl_fabs(x));
+    return halF*t+halF/t;
+  }
+  /* |x| in [22, log(maxdouble)] return half*exp(|x|) */
+  if (ix < 0x42b17180)  return halF*__gen_ocl_internal_exp(__gen_ocl_fabs(x));
+  /* |x| in [log(maxdouble), overflowthreshold] */
+  if (ix<=0x42b2d4fc) {
+    w = __gen_ocl_internal_exp(halF*__gen_ocl_fabs(x));
+    t = halF*w;
+    return t*w;
+  }
+  /* x is INF or NaN */
+  if(ix>=0x7f800000) return x*x;
+  /* |x| > overflowthreshold, cosh(x) overflow */
+  return huge*huge;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_remainder(float x, float p){
+  /* remainder(x,p): fmod by 2p, subtract |p| up to twice, then reapply x's sign; replaces x-y*rnde(x/y) */
+  float zero = 0.0;
+  int hx,hp;
+  unsigned sx;
+  float p_half;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_GET_FLOAT_WORD(hp,p);
+  sx = hx&0x80000000;
+  hp &= 0x7fffffff;
+  hx &= 0x7fffffff;
+  /* purge off exception values */
+  if(hp==0) return (x*p)/(x*p);	        /* p = 0 */
+  if((hx>=0x7f800000)||               /* x not finite */
+    ((hp>0x7f800000)))	               /* p is NaN */
+    return (x*p)/(x*p);
+  if (hp<=0x7effffff) x = __gen_ocl_internal_fmod(x,p+p); /* now x < 2p */
+  if ((hx-hp)==0) return zero*x;
+  x = __gen_ocl_fabs(x);
+  p = __gen_ocl_fabs(p);
+  if (hp<0x01000000) {
+    if(x+x>p) {
+      x-=p;
+      if(x+x>=p) x -= p;
+    }
+  } else {
+    p_half = (float)0.5*p;
+    if(x>p_half) {
+      x-=p;
+      if(x>=p_half) x -= p;
+    }
+  }
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_SET_FLOAT_WORD(x,hx^sx);
+  return x;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_ldexp(float x, int n) {
+  if(!__ocl_finitef(x)||x==(float)0.0) return x; /* x rejected by __ocl_finitef, or zero, is returned unchanged */
+  x = __gen_ocl_scalbnf(x,n); /* NOTE(review): scalbnf presumably scales x by 2**n - confirm */
+  return x;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
+  /* atanh(x) via log1p on |x|, with the sign of x copied onto the result; replaces the native_sqrt form */
+  float xa = __gen_ocl_fabs (x);
+  float t;
+  if (isless (xa, 0.5f)){
+    if (xa < 0x1.0p-28f) return x;
+    t = xa + xa;
+    t = 0.5f * log1p (t + t * xa / (1.0f - xa));
+  } else if (isless (xa, 1.0f)){
+    t = 0.5f * log1p ((xa + xa) / (1.0f - xa));
+  } else{
+    if (isgreater (xa, 1.0f)) return (x - x) / (x - x);
+    return x / 0.0f;
+  }
+  return __gen_ocl_internal_copysign(t, x);
 }
+
+/* 10**x for float x: split off a power of two, then evaluate a degree-5
+ * polynomial on the reduced argument. Returns +inf on overflow and 0 when
+ * x < -MAXL10 (including -inf). */
+INLINE_OVERLOADABLE float __gen_ocl_internal_exp10(float x){
+  float px, qx,ans;
+  short n;
+  int i;
+  float MAXL10 = 38.230809449325611792;
+  float LOG210 = 3.32192809488736234787e0;
+  float LG102A = 3.00781250000000000000E-1;
+  float LG102B = 2.48745663981195213739E-4;
+  float P[6];
+  P[0] = 2.063216740311022E-001;
+  P[1] = 5.420251702225484E-001;
+  P[2] = 1.171292686296281E+000;
+  P[3] = 2.034649854009453E+000;
+  P[4] = 2.650948748208892E+000;
+  P[5] = 2.302585167056758E+000;
+  /* Certain overflow (including +inf). The bound is 2*MAXL10 rather than MAXL10 so
+   * results between 2**127 and FLT_MAX are still computed; it also keeps n in
+   * short range. -inf falls through to the underflow test below and yields 0
+   * (the previous isinf() test returned +inf for it). */
+  if( x > 2.0f * MAXL10 )
+    return INFINITY;
+
+  if( x < -MAXL10 )return 0.0;
+  /* The following is necessary because range reduction blows up: */
+  if( x == 0 )return 1.0;
+
+  /* Express 10**x = 10**g 2**n
+    *	 = 10**g 10**( n log10(2) )
+    *	 = 10**( g + n log10(2) )
+    */
+  px = x * LOG210;
+  qx = __gen_ocl_internal_floor( px + 0.5 );
+  n = qx;
+  x -= qx * LG102A;
+  x -= qx * LG102B;
+
+  /* polynomial approximation of the fractional part:
+    * 10**x = 1 + x*(P[0]*x**5 + ... + P[5]), evaluated by Horner's rule */
+  ans = P[0];
+  for( i = 1; i < 6; i++ )
+    ans = ans * x  +  P[i];
+  px = 1.0 + x * ans;
+
+  /* multiply by power of 2 */
+  x = __gen_ocl_internal_ldexp( px, n );
+  return x;
+}
+
 // TODO use llvm intrinsics definitions
-#define cos native_cos
 #define cospi __gen_ocl_internal_cospi
 #define cosh __gen_ocl_internal_cosh
 #define acos __gen_ocl_internal_acos
 #define acospi __gen_ocl_internal_acospi
 #define acosh __gen_ocl_internal_acosh
-#define sin native_sin
 #define sinpi __gen_ocl_internal_sinpi
 #define sinh __gen_ocl_internal_sinh
 #define asin __gen_ocl_internal_asin
 #define asinpi __gen_ocl_internal_asinpi
 #define asinh __gen_ocl_internal_asinh
-#define tan native_tan
 #define tanpi __gen_ocl_internal_tanpi
 #define tanh __gen_ocl_internal_tanh
 #define atan __gen_ocl_internal_atan
@@ -1522,9 +3167,12 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_rint(float x) {
 #define copysign __gen_ocl_internal_copysign
 #define erf __gen_ocl_internal_erf
 #define erfc __gen_ocl_internal_erfc
-
+#define fmod __gen_ocl_internal_fmod
+#define remainder __gen_ocl_internal_remainder
+#define ldexp __gen_ocl_internal_ldexp
+PURE CONST float __gen_ocl_mad(float a, float b, float c);
 INLINE_OVERLOADABLE float mad(float a, float b, float c) {
-  return a*b+c;
+  return __gen_ocl_mad(a, b, c); /* forwards to the __gen_ocl_mad declaration above instead of a*b+c */
 }
 
 #define DEF(TYPE1, TYPE2) \
@@ -1568,7 +3216,6 @@ INLINE_OVERLOADABLE TYPE min(TYPE a, TYPE b) { \
 INLINE_OVERLOADABLE TYPE clamp(TYPE v, TYPE l, TYPE u) { \
   return max(min(v, u), l); \
 }
-DECL_MIN_MAX_CLAMP(float)
 DECL_MIN_MAX_CLAMP(int)
 DECL_MIN_MAX_CLAMP(short)
 DECL_MIN_MAX_CLAMP(char)
@@ -1578,37 +3225,77 @@ DECL_MIN_MAX_CLAMP(unsigned char)
 DECL_MIN_MAX_CLAMP(long)
 DECL_MIN_MAX_CLAMP(ulong)
 #undef DECL_MIN_MAX_CLAMP
+INLINE_OVERLOADABLE float max(float a, float b) {
+  if(isnan(b))
+    return a;
+  return a > b ? a : b;
+}
+INLINE_OVERLOADABLE float min(float a, float b) {
+  if(isnan(b))
+    return a;
+  return a < b ? a : b;
+}
+INLINE_OVERLOADABLE float clamp(float v, float l, float u) {
+  return max(min(v, u), l);
+}
 
 #define BODY \
-  uint u = as_uint(x); \
-  if ((u & 0x7FFFFFFFu) == 0) { \
+  if (isnan(x) || isinf(x)) { \
     *exp = 0; \
     return x; \
   } \
-  int e = (u >> 23) & 255; \
-  if (e == 255) \
+  uint u = as_uint(x); \
+  uint a = u & 0x7FFFFFFFu; \
+  if (a == 0) { \
+    *exp = 0; \
     return x; \
-  *exp = e - 126; \
-  u = (u & (0x807FFFFFu)) | 0x3F000000; \
-  return as_float(u);
+  } \
+  if (a >= 0x800000) { \
+    *exp = (a >> 23) - 126; \
+    return as_float((u & (0x807FFFFFu)) | 0x3F000000); \
+  } \
+  int e = -126; \
+  while (a < 0x400000) { \
+    e --; \
+    a <<= 1; \
+  } \
+  a <<= 1; \
+  *exp = e; \
+  return as_float((a & (0x807FFFFFu)) | (u & 0x80000000u) | 0x3F000000);
 INLINE_OVERLOADABLE float frexp(float x, global int *exp) { BODY; }
 INLINE_OVERLOADABLE float frexp(float x, local int *exp) { BODY; }
 INLINE_OVERLOADABLE float frexp(float x, private int *exp) { BODY; }
 #undef BODY
 
 INLINE_OVERLOADABLE float nextafter(float x, float y) {
-  uint hx = as_uint(x), ix = hx & 0x7FFFFFFF;
-  uint hy = as_uint(y), iy = hy & 0x7FFFFFFF;
-  if (ix > 0x7F800000 || iy > 0x7F800000)
-    return nan(0u);
-  if (hx == hy)
-    return x;
-  if (ix == 0)
-    return as_float((hy & 0x80000000u) | 1);
-  if (((0 == (hx & 0x80000000u)) && y > x) || ((hx & 0x80000000u) && y < x))
-    hx ++;
-  else
-    hx --;
+  int hx, hy, ix, iy;
+  hx = as_int(x);
+  hy = as_int(y);
+  ix = hx & 0x7fffffff;
+  iy = hy & 0x7fffffff;
+  if(ix>0x7f800000 || iy>0x7f800000)
+    return x+y;
+  if(hx == hy)
+    return y;
+  if(ix == 0) {
+    if(iy == 0)
+      return y;
+    else
+      return as_float((hy&0x80000000) | 1);
+  }
+  if(hx >= 0) {
+    if(hx > hy) {
+      hx -= 1;
+    } else {
+      hx += 1;
+    }
+  } else {
+    if(hy >= 0 || hx > hy){
+      hx -= 1;
+    } else {
+      hx += 1;
+    }
+  }
   return as_float(hx);
 }
 
@@ -1658,10 +3345,46 @@ INLINE_OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
 }
 INLINE_OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}
 INLINE_OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
-  return __gen_ocl_internal_fmax(x, y) - y;
+  if(isnan(x))
+    return x;
+  if(isnan(y))
+    return y;
+  return x > y ? (x - y) : +0.f;
 }
+INLINE_OVERLOADABLE float hypot(float x, float y) {
+  //return __gen_ocl_sqrt(x*x + y*y);
+  float a,b,an,bn,cn;
+  int e;
+  if (isfinite (x) && isfinite (y)){      /* Determine absolute values.  */
+  x = __gen_ocl_fabs (x);
+  y = __gen_ocl_fabs (y);
+  /* Find the bigger and the smaller one.  */
+  a = max(x,y);
+  b = min(x,y);
+  /* Now 0 <= b <= a.  */
+  /* Write a = an * 2^e, b = bn * 2^e with 0 <= bn <= an < 1.  */
+  an = frexp (a, &e);
+  bn = ldexp (b, - e);
+  /* Through the normalization, no unneeded overflow or underflow will occur here.  */
+  cn = __gen_ocl_sqrt (an * an + bn * bn);
+  return ldexp (cn, e);
+  }else{
+    if (isinf (x) || isinf (y))  /* x or y is infinite.  Return +Infinity.  */    
+      return INFINITY;
+    else        /* x or y is NaN.  Return NaN.  */
+      return x + y;
+  }
+}
+
 #define BODY \
+  if (isnan(x)) { \
+    *p = x; \
+    return x; \
+  } \
   *p = __gen_ocl_internal_floor(x); \
+  if (isinf(x)) { \
+    return x > 0 ? +0. : -0.; \
+  } \
   return __gen_ocl_internal_fmin(x - *p, 0x1.FFFFFep-1F);
 INLINE_OVERLOADABLE float fract(float x, global float *p) { BODY; }
 INLINE_OVERLOADABLE float fract(float x, local float *p) { BODY; }
@@ -1669,25 +3392,89 @@ INLINE_OVERLOADABLE float fract(float x, private float *p) { BODY; }
 #undef BODY
 
 #define BODY \
-  uint hx = as_uint(x), ix = hx & 0x7FFFFFFF, hy = as_uint(y), iy = hy & 0x7FFFFFFF; \
-  if (ix > 0x7F800000 || iy > 0x7F800000 || ix == 0x7F800000 || iy == 0) \
-    return nan(0u); \
-  float k = x / y; \
-  int q =  __gen_ocl_rnde(k); \
-  *quo = q >= 0 ? (q & 127) : (q | 0xFFFFFF80u); \
-  float r = x - q * y; \
-  uint hr = as_uint(r), ir = hr & 0x7FFFFFFF; \
-  if (ir == 0) \
-    hr = ir | (hx & 0x80000000u); \
-  return as_float(hr);
-INLINE_OVERLOADABLE float remquo(float x, float y, global int *quo) { BODY; }
+  float Zero[2]; \
+  int n,hx,hy,hz,ix,iy,sx,i,sy; \
+  uint q,sxy; \
+  Zero[0] = 0.0;Zero[1] = -0.0; \
+  GEN_OCL_GET_FLOAT_WORD(hx,x);GEN_OCL_GET_FLOAT_WORD(hy,y); \
+  sxy = (hx ^ hy) & 0x80000000;sx = hx&0x80000000;sy = hy&0x80000000; \
+  hx ^=sx; hy &= 0x7fffffff; \
+  if (hx < 0x00800000)hx = 0;if (hy < 0x00800000)hy = 0; \
+  if(hy==0||hx>=0x7f800000||hy>0x7f800000){ \
+    *quo = 0;return NAN; \
+  } \
+  if( hy == 0x7F800000 || hx == 0 ) { \
+    *quo = 0;return x; \
+  } \
+  if( hx == hy ) { \
+    *quo = (x == y) ? 1 : -1; \
+    return sx ? -0.0 : 0.0; \
+  } \
+  if(hx<hy) { \
+    q = 0; \
+    goto fixup; \
+  } else if(hx==hy) { \
+    *quo = (sxy ? -1 : 1); \
+    return Zero[(uint)sx>>31]; \
+  } \
+  ix = (hx>>23)-127; \
+  iy = (hy>>23)-127; \
+  hx = 0x00800000|(0x007fffff&hx); \
+  hy = 0x00800000|(0x007fffff&hy); \
+  n = ix - iy; \
+  q = 0; \
+  while(n--) { \
+    hz=hx-hy; \
+    if(hz<0) hx = hx << 1; \
+    else {hx = hz << 1; q++;} \
+    q <<= 1; \
+  } \
+  hz=hx-hy; \
+  if(hz>=0) {hx=hz;q++;} \
+  if(hx==0) { \
+    q &= 0x0000007f; \
+    *quo = (sxy ? -q : q); \
+    return Zero[(uint)sx>>31]; \
+  } \
+  while(hx<0x00800000) { \
+    hx <<= 1;iy -= 1; \
+  } \
+  if(iy>= -126) { \
+    hx = ((hx-0x00800000)|((iy+127)<<23)); \
+  } else {\
+    n = -126 - iy; \
+    hx >>= n; \
+  } \
+fixup: \
+  GEN_OCL_SET_FLOAT_WORD(x,hx); \
+  if(hx<0x00800000){ \
+    GEN_OCL_GET_FLOAT_WORD(hy,y); \
+    hy &= 0x7fffffff; \
+    if(hx+hx > hy ||(hx+hx==hy && (q & 1)))q++; \
+    x = 0; \
+  }else{ \
+    y = __gen_ocl_fabs(y); \
+    if (y < 0x1p-125f) { \
+      if (x+x>y || (x+x==y && (q & 1))) { \
+        q++;x-=y; \
+      } \
+    }else if (x>0.5f*y || (x==0.5f*y && (q & 1))) { \
+      q++;x-=y; \
+    } \
+    GEN_OCL_GET_FLOAT_WORD(hx,x);GEN_OCL_SET_FLOAT_WORD(x,hx^sx); \
+  } \
+  int sign = sx==sy?0:1; \
+  q &= 0x0000007f; \
+  *quo = (sign ? -q : q); \
+  return x;
+
+INLINE_OVERLOADABLE float remquo(float x, float y, global int *quo) {
+	BODY;
+}
 INLINE_OVERLOADABLE float remquo(float x, float y, local int *quo) { BODY; }
 INLINE_OVERLOADABLE float remquo(float x, float y, private int *quo) { BODY; }
 #undef BODY
 INLINE_OVERLOADABLE float native_divide(float x, float y) { return x/y; }
-INLINE_OVERLOADABLE float ldexp(float x, int n) {
-  return __gen_ocl_pow(2, n) * x;
-}
 INLINE_OVERLOADABLE float pown(float x, int n) {
   if (x == 0 && n == 0)
     return 1;
@@ -1700,64 +3487,85 @@ INLINE_OVERLOADABLE float rootn(float x, int n) {
 /////////////////////////////////////////////////////////////////////////////
 // Geometric functions (see 6.11.5 of OCL 1.1 spec)
 /////////////////////////////////////////////////////////////////////////////
+INLINE_OVERLOADABLE float dot(float p0, float p1) {
+  return p0 * p1;
+}
 INLINE_OVERLOADABLE float dot(float2 p0, float2 p1) {
-  return mad(p0.x,p1.x,p0.y*p1.y);
+  return p0.x * p1.x + p0.y * p1.y;
 }
 INLINE_OVERLOADABLE float dot(float3 p0, float3 p1) {
-  return mad(p0.x,p1.x,mad(p0.z,p1.z,p0.y*p1.y));
+  return p0.x * p1.x + p0.y * p1.y + p0.z * p1.z;
 }
 INLINE_OVERLOADABLE float dot(float4 p0, float4 p1) {
-  return mad(p0.x,p1.x,mad(p0.w,p1.w,mad(p0.z,p1.z,p0.y*p1.y)));
+  return p0.x * p1.x + p0.y * p1.y + p0.z * p1.z + p0.w * p1.w;
 }
-
-INLINE_OVERLOADABLE float dot(float8 p0, float8 p1) {
-  return mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,
-         mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))));
+INLINE_OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); }
+#define BODY \
+  if(m == 0) \
+    return 0; \
+  if(isinf(m)) \
+    return INFINITY; \
+  if(m < 1) \
+    m = 1; \
+  x /= m; \
+  return m * sqrt(dot(x,x));
+INLINE_OVERLOADABLE float length(float2 x) {
+  float m = max(__gen_ocl_fabs(x.s0), __gen_ocl_fabs(x.s1));
+  BODY;
 }
-INLINE_OVERLOADABLE float dot(float16 p0, float16 p1) {
-  return mad(p0.sc,p1.sc,mad(p0.sd,p1.sd,mad(p0.se,p1.se,mad(p0.sf,p1.sf,
-         mad(p0.s8,p1.s8,mad(p0.s9,p1.s9,mad(p0.sa,p1.sa,mad(p0.sb,p1.sb,
-         mad(p0.x,p1.x,mad(p0.s7,p1.s7, mad(p0.s6,p1.s6,mad(p0.s5,p1.s5,
-         mad(p0.s4,p1.s4,mad(p0.w,p1.w, mad(p0.z,p1.z,p0.y*p1.y)))))))))))))));
+INLINE_OVERLOADABLE float length(float3 x) {
+  float m = max(__gen_ocl_fabs(x.s0), max(__gen_ocl_fabs(x.s1), __gen_ocl_fabs(x.s2)));
+  BODY;
 }
-
-INLINE_OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); }
-INLINE_OVERLOADABLE float length(float2 x) { return sqrt(dot(x,x)); }
-INLINE_OVERLOADABLE float length(float3 x) { return sqrt(dot(x,x)); }
-INLINE_OVERLOADABLE float length(float4 x) { return sqrt(dot(x,x)); }
-INLINE_OVERLOADABLE float length(float8 x) { return sqrt(dot(x,x)); }
-INLINE_OVERLOADABLE float length(float16 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float length(float4 x) {
+  float m = max(__gen_ocl_fabs(x.s0), max(__gen_ocl_fabs(x.s1), max(__gen_ocl_fabs(x.s2), __gen_ocl_fabs(x.s3))));
+  BODY;
+}
+#undef BODY
 INLINE_OVERLOADABLE float distance(float x, float y) { return length(x-y); }
 INLINE_OVERLOADABLE float distance(float2 x, float2 y) { return length(x-y); }
 INLINE_OVERLOADABLE float distance(float3 x, float3 y) { return length(x-y); }
 INLINE_OVERLOADABLE float distance(float4 x, float4 y) { return length(x-y); }
-INLINE_OVERLOADABLE float distance(float8 x, float8 y) { return length(x-y); }
-INLINE_OVERLOADABLE float distance(float16 x, float16 y) { return length(x-y); }
-INLINE_OVERLOADABLE float normalize(float x) { return 1.f; }
-INLINE_OVERLOADABLE float2 normalize(float2 x) { return x * rsqrt(dot(x, x)); }
-INLINE_OVERLOADABLE float3 normalize(float3 x) { return x * rsqrt(dot(x, x)); }
-INLINE_OVERLOADABLE float4 normalize(float4 x) { return x * rsqrt(dot(x, x)); }
-INLINE_OVERLOADABLE float8 normalize(float8 x) { return x * rsqrt(dot(x, x)); }
-INLINE_OVERLOADABLE float16 normalize(float16 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float normalize(float x) {
+  union { float f; unsigned u; } u;
+  u.f = x;
+  if(u.u == 0)
+    return 0.f;
+  if(isnan(x))
+    return NAN;
+  return u.u < 0x7fffffff ? 1.f : -1.f;
+}
+INLINE_OVERLOADABLE float2 normalize(float2 x) {
+  float m = length(x);
+  if(m == 0)
+    return 0;
+  return x / m;
+}
+INLINE_OVERLOADABLE float3 normalize(float3 x) {
+  float m = length(x);
+  if(m == 0)
+    return 0;
+  return x / m;
+}
+INLINE_OVERLOADABLE float4 normalize(float4 x) {
+  float m = length(x);
+  if(m == 0)
+    return 0;
+  return x / m;
+}
 
 INLINE_OVERLOADABLE float fast_length(float x) { return __gen_ocl_fabs(x); }
 INLINE_OVERLOADABLE float fast_length(float2 x) { return sqrt(dot(x,x)); }
 INLINE_OVERLOADABLE float fast_length(float3 x) { return sqrt(dot(x,x)); }
 INLINE_OVERLOADABLE float fast_length(float4 x) { return sqrt(dot(x,x)); }
-INLINE_OVERLOADABLE float fast_length(float8 x) { return sqrt(dot(x,x)); }
-INLINE_OVERLOADABLE float fast_length(float16 x) { return sqrt(dot(x,x)); }
 INLINE_OVERLOADABLE float fast_distance(float x, float y) { return length(x-y); }
 INLINE_OVERLOADABLE float fast_distance(float2 x, float2 y) { return length(x-y); }
 INLINE_OVERLOADABLE float fast_distance(float3 x, float3 y) { return length(x-y); }
 INLINE_OVERLOADABLE float fast_distance(float4 x, float4 y) { return length(x-y); }
-INLINE_OVERLOADABLE float fast_distance(float8 x, float8 y) { return length(x-y); }
-INLINE_OVERLOADABLE float fast_distance(float16 x, float16 y) { return length(x-y); }
-INLINE_OVERLOADABLE float fast_normalize(float x) { return 1.f; }
+INLINE_OVERLOADABLE float fast_normalize(float x) { return x > 0 ? 1.f : (x < 0 ? -1.f : 0.f); }
 INLINE_OVERLOADABLE float2 fast_normalize(float2 x) { return x * rsqrt(dot(x, x)); }
 INLINE_OVERLOADABLE float3 fast_normalize(float3 x) { return x * rsqrt(dot(x, x)); }
 INLINE_OVERLOADABLE float4 fast_normalize(float4 x) { return x * rsqrt(dot(x, x)); }
-INLINE_OVERLOADABLE float8 fast_normalize(float8 x) { return x * rsqrt(dot(x, x)); }
-INLINE_OVERLOADABLE float16 fast_normalize(float16 x) { return x * rsqrt(dot(x, x)); }
 
 INLINE_OVERLOADABLE float3 cross(float3 v0, float3 v1) {
    return v0.yzx*v1.zxy-v0.zxy*v1.yzx;
@@ -1781,6 +3589,11 @@ INLINE_OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p)
   *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \
 }
 
+#define DECL_UNTYPED_RD_SPACE_N(TYPE, DIM, SPACE) \
+INLINE_OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
+  return *(SPACE TYPE##DIM *) (p + DIM * offset); \
+}
+
 #define DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
 INLINE_OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
   *(p + 3 * offset) = v.s0; \
@@ -1791,6 +3604,11 @@ INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
   return *(SPACE TYPE##3 *) (p + 3 * offset); \
 }
 
+#define DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
+INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+  return *(SPACE TYPE##3 *) (p + 3 * offset); \
+}
+
 #define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
   DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
   DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
@@ -1798,10 +3616,17 @@ INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
   DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \
   DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)
 
+#define DECL_UNTYPED_RD_ALL_SPACE(TYPE, SPACE) \
+  DECL_UNTYPED_RD_SPACE_N(TYPE, 2, SPACE) \
+  DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
+  DECL_UNTYPED_RD_SPACE_N(TYPE, 4, SPACE) \
+  DECL_UNTYPED_RD_SPACE_N(TYPE, 8, SPACE) \
+  DECL_UNTYPED_RD_SPACE_N(TYPE, 16, SPACE)
+
 #define DECL_UNTYPED_RW_ALL(TYPE) \
   DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \
   DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \
-  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __constant) \
+  DECL_UNTYPED_RD_ALL_SPACE(TYPE, __constant) \
   DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)
 
 DECL_UNTYPED_RW_ALL(char)
@@ -1817,7 +3642,149 @@ DECL_UNTYPED_RW_ALL(double)
 
 #undef DECL_UNTYPED_RW_ALL
 #undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_UNTYPED_RD_ALL_SPACE
 #undef DECL_UNTYPED_RW_SPACE_N
+#undef DECL_UNTYPED_RD_SPACE_N
+#undef DECL_UNTYPED_V3_SPACE
+#undef DECL_UNTYPED_RDV3_SPACE
+
+PURE CONST float __gen_ocl_f16to32(short h);
+PURE CONST short __gen_ocl_f32to16(float f);
+
+INLINE_OVERLOADABLE short f32to16_rtp(float f) {
+  short s = __gen_ocl_f32to16(f);
+  float con = __gen_ocl_f16to32(s);
+  //if(isinf(con)) return s;
+  if (f > con)
+    return s - signbit(f) * 2 + 1;
+  else
+    return s;
+}
+
+INLINE_OVERLOADABLE short f32to16_rtn(float f) {
+  short s = __gen_ocl_f32to16(f);
+  float con = __gen_ocl_f16to32(s);
+  //if(isinf(con)) return s;
+  if (con > f)
+    return s + signbit(f) * 2 - 1;
+  else
+    return s;
+}
+
+INLINE_OVERLOADABLE short f32to16_rtz(float f) {
+  short s = __gen_ocl_f32to16(f);
+  float con = __gen_ocl_f16to32(s);
+  //if(isinf(con)) return s;
+  if (((con > f) && !signbit(f)) ||
+      ((con < f) && signbit(f)))
+    return s - 1;
+  else
+    return s;
+}
+
+#define DECL_HALF_LD_SPACE(SPACE) \
+INLINE_OVERLOADABLE float vload_half(size_t offset, const SPACE half *p) { \
+  return __gen_ocl_f16to32(*(SPACE short *)(p + offset)); \
+} \
+INLINE_OVERLOADABLE float2 vload_half2(size_t offset, const SPACE half *p) { \
+  return (float2)(vload_half(offset*2, p), \
+                  vload_half(offset*2 + 1, p)); \
+} \
+INLINE_OVERLOADABLE float3 vload_half3(size_t offset, const SPACE half *p) { \
+  return (float3)(vload_half(offset*3, p), \
+                  vload_half(offset*3 + 1, p), \
+                  vload_half(offset*3 + 2, p)); \
+} \
+INLINE_OVERLOADABLE float3 vloada_half3(size_t offset, const SPACE half *p) { \
+  return (float3)(vload_half(offset*4, p), \
+                  vload_half(offset*4 + 1, p), \
+                  vload_half(offset*4 + 2, p)); \
+} \
+INLINE_OVERLOADABLE float4 vload_half4(size_t offset, const SPACE half *p) { \
+  return (float4)(vload_half2(offset*2, p), \
+                  vload_half2(offset*2 + 1, p)); \
+} \
+INLINE_OVERLOADABLE float8 vload_half8(size_t offset, const SPACE half *p) { \
+  return (float8)(vload_half4(offset*2, p), \
+                  vload_half4(offset*2 + 1, p)); \
+} \
+INLINE_OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p) { \
+  return (float16)(vload_half8(offset*2, p), \
+                   vload_half8(offset*2 + 1, p)); \
+}
+
+#define DECL_HALF_ST_SPACE_ROUND(SPACE, ROUND, FUNC) \
+INLINE_OVERLOADABLE void vstore_half##ROUND(float data, size_t offset, SPACE half *p) { \
+  *(SPACE short *)(p + offset) = FUNC(data); \
+} \
+INLINE_OVERLOADABLE void vstorea_half##ROUND(float data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data, offset, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data.lo, offset*2, p); \
+  vstore_half##ROUND(data.hi, offset*2 + 1, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
+  vstore_half2##ROUND(data, offset, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data.s0, offset*3, p); \
+  vstore_half##ROUND(data.s1, offset*3 + 1, p); \
+  vstore_half##ROUND(data.s2, offset*3 + 2, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data.s0, offset*4, p); \
+  vstore_half##ROUND(data.s1, offset*4 + 1, p); \
+  vstore_half##ROUND(data.s2, offset*4 + 2, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
+  vstore_half2##ROUND(data.lo, offset*2, p); \
+  vstore_half2##ROUND(data.hi, offset*2 + 1, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
+  vstore_half4##ROUND(data, offset, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
+  vstore_half4##ROUND(data.lo, offset*2, p); \
+  vstore_half4##ROUND(data.hi, offset*2 + 1, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
+  vstore_half8##ROUND(data, offset, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \
+  vstore_half8##ROUND(data.lo, offset*2, p); \
+  vstore_half8##ROUND(data.hi, offset*2 + 1, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \
+  vstore_half16##ROUND(data, offset, p); \
+}
+
+#define DECL_HALF_ST_SPACE(SPACE) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE,  , __gen_ocl_f32to16) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rte, __gen_ocl_f32to16) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtz, f32to16_rtz) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtp, f32to16_rtp) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtn, f32to16_rtn) \
+
+DECL_HALF_LD_SPACE(__global)
+DECL_HALF_LD_SPACE(__local)
+DECL_HALF_LD_SPACE(__constant)
+DECL_HALF_LD_SPACE(__private)
+
+DECL_HALF_ST_SPACE(__global)
+DECL_HALF_ST_SPACE(__local)
+DECL_HALF_ST_SPACE(__private)
+
+//#undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_HALF_LD_SPACE
+#undef DECL_HALF_ST_SPACE
+#undef DECL_HALF_ST_SPACE_ROUND
+
+#define vloada_half vload_half
+#define vloada_half2 vload_half2
+#define vloada_half4 vload_half4
+#define vloada_half8 vload_half8
+#define vloada_half16 vload_half16
 
 // XXX workaround ptx profile
 #define fabs __gen_ocl_internal_fabs
@@ -1830,7 +3797,7 @@ DECL_UNTYPED_RW_ALL(double)
 #define log10 __gen_ocl_internal_log10
 #define exp __gen_ocl_internal_exp
 #define exp2 native_exp2
-#define exp10 native_exp10
+#define exp10 __gen_ocl_internal_exp10
 #define expm1 __gen_ocl_internal_expm1
 #define fmin __gen_ocl_internal_fmin
 #define fmax __gen_ocl_internal_fmax
@@ -1842,16 +3809,16 @@ DECL_UNTYPED_RW_ALL(double)
 /////////////////////////////////////////////////////////////////////////////
 // Miscellaneous Vector Functions (see 6.11.12 of OCL 1.1 spec)
 /////////////////////////////////////////////////////////////////////////////
-#define DEC2(TYPE, XTYPE) \
-  INLINE_OVERLOADABLE TYPE##2 shuffle(XTYPE x, uint2 mask) { \
+#define DEC2(TYPE, XTYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##2 shuffle(XTYPE x, MASKTYPE##2 mask) { \
     TYPE##2 y; \
     y.s0 = ((TYPE *) &x)[mask.s0 & (vec_step(x) - 1)]; \
     y.s1 = ((TYPE *) &x)[mask.s1 & (vec_step(x) - 1)]; \
     return y; \
   }
 
-#define DEC4(TYPE, XTYPE) \
-  INLINE_OVERLOADABLE TYPE##4 shuffle(XTYPE x, uint4 mask) { \
+#define DEC4(TYPE, XTYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##4 shuffle(XTYPE x, MASKTYPE##4 mask) { \
     TYPE##4 y; \
     y.s0 = ((TYPE *) &x)[mask.s0 & (vec_step(x) - 1)]; \
     y.s1 = ((TYPE *) &x)[mask.s1 & (vec_step(x) - 1)]; \
@@ -1860,8 +3827,8 @@ DECL_UNTYPED_RW_ALL(double)
     return y; \
   }
 
-#define DEC8(TYPE, XTYPE) \
-  INLINE_OVERLOADABLE TYPE##8 shuffle(XTYPE x, uint8 mask) { \
+#define DEC8(TYPE, XTYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##8 shuffle(XTYPE x, MASKTYPE##8 mask) { \
     TYPE##8 y; \
     y.s0 = ((TYPE *) &x)[mask.s0 & (vec_step(x) - 1)]; \
     y.s1 = ((TYPE *) &x)[mask.s1 & (vec_step(x) - 1)]; \
@@ -1874,8 +3841,8 @@ DECL_UNTYPED_RW_ALL(double)
     return y; \
   }
 
-#define DEC16(TYPE, XTYPE) \
-  INLINE_OVERLOADABLE TYPE##16 shuffle(XTYPE x, uint16 mask) { \
+#define DEC16(TYPE, XTYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##16 shuffle(XTYPE x, MASKTYPE##16 mask) { \
     TYPE##16 y; \
     y.s0 = ((TYPE *) &x)[mask.s0 & (vec_step(x) - 1)]; \
     y.s1 = ((TYPE *) &x)[mask.s1 & (vec_step(x) - 1)]; \
@@ -1896,11 +3863,18 @@ DECL_UNTYPED_RW_ALL(double)
     return y; \
   }
 
+#define DEFMASK(TYPE, MASKTYPE) \
+  DEC2(TYPE, TYPE##2, MASKTYPE); DEC2(TYPE, TYPE##4, MASKTYPE); DEC2(TYPE, TYPE##8, MASKTYPE); DEC2(TYPE, TYPE##16, MASKTYPE) \
+  DEC4(TYPE, TYPE##2, MASKTYPE); DEC4(TYPE, TYPE##4, MASKTYPE); DEC4(TYPE, TYPE##8, MASKTYPE); DEC4(TYPE, TYPE##16, MASKTYPE) \
+  DEC8(TYPE, TYPE##2, MASKTYPE); DEC8(TYPE, TYPE##4, MASKTYPE); DEC8(TYPE, TYPE##8, MASKTYPE); DEC8(TYPE, TYPE##16, MASKTYPE) \
+  DEC16(TYPE, TYPE##2, MASKTYPE); DEC16(TYPE, TYPE##4, MASKTYPE); DEC16(TYPE, TYPE##8, MASKTYPE); DEC16(TYPE, TYPE##16, MASKTYPE)
+
 #define DEF(TYPE) \
-  DEC2(TYPE, TYPE##2); DEC2(TYPE, TYPE##4); DEC2(TYPE, TYPE##8); DEC2(TYPE, TYPE##16) \
-  DEC4(TYPE, TYPE##2); DEC4(TYPE, TYPE##4); DEC4(TYPE, TYPE##8); DEC4(TYPE, TYPE##16) \
-  DEC8(TYPE, TYPE##2); DEC8(TYPE, TYPE##4); DEC8(TYPE, TYPE##8); DEC8(TYPE, TYPE##16) \
-  DEC16(TYPE, TYPE##2); DEC16(TYPE, TYPE##4); DEC16(TYPE, TYPE##8); DEC16(TYPE, TYPE##16)
+  DEFMASK(TYPE, uchar) \
+  DEFMASK(TYPE, ushort) \
+  DEFMASK(TYPE, uint) \
+  DEFMASK(TYPE, ulong)
+
 DEF(char)
 DEF(uchar)
 DEF(short)
@@ -1911,31 +3885,32 @@ DEF(float)
 DEF(long)
 DEF(ulong)
 #undef DEF
+#undef DEFMASK
 #undef DEC2
 #undef DEC4
 #undef DEC8
 #undef DEC16
 
-#define DEC2(TYPE, ARGTYPE, TEMPTYPE) \
-  INLINE_OVERLOADABLE TYPE##2 shuffle2(ARGTYPE x, ARGTYPE y, uint2 mask) { \
+#define DEC2(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##2 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##2 mask) { \
     return shuffle((TEMPTYPE)(x, y), mask); \
   }
 
-#define DEC2X(TYPE) \
-  INLINE_OVERLOADABLE TYPE##2 shuffle2(TYPE##16 x, TYPE##16 y, uint2 mask) { \
+#define DEC2X(TYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##2 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##2 mask) { \
     TYPE##2 z; \
     z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
     z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
     return z; \
   }
 
-#define DEC4(TYPE, ARGTYPE, TEMPTYPE) \
-  INLINE_OVERLOADABLE TYPE##4 shuffle2(ARGTYPE x, ARGTYPE y, uint4 mask) { \
+#define DEC4(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##4 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##4 mask) { \
     return shuffle((TEMPTYPE)(x, y), mask); \
   }
 
-#define DEC4X(TYPE) \
-  INLINE_OVERLOADABLE TYPE##4 shuffle2(TYPE##16 x, TYPE##16 y, uint4 mask) { \
+#define DEC4X(TYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##4 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##4 mask) { \
     TYPE##4 z; \
     z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
     z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
@@ -1944,13 +3919,13 @@ DEF(ulong)
     return z; \
   }
 
-#define DEC8(TYPE, ARGTYPE, TEMPTYPE) \
-  INLINE_OVERLOADABLE TYPE##8 shuffle2(ARGTYPE x, ARGTYPE y, uint8 mask) { \
+#define DEC8(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##8 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##8 mask) { \
     return shuffle((TEMPTYPE)(x, y), mask); \
   }
 
-#define DEC8X(TYPE) \
-  INLINE_OVERLOADABLE TYPE##8 shuffle2(TYPE##16 x, TYPE##16 y, uint8 mask) { \
+#define DEC8X(TYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##8 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##8 mask) { \
     TYPE##8 z; \
     z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
     z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
@@ -1963,13 +3938,13 @@ DEF(ulong)
     return z; \
   }
 
-#define DEC16(TYPE, ARGTYPE, TEMPTYPE) \
-  INLINE_OVERLOADABLE TYPE##16 shuffle2(ARGTYPE x, ARGTYPE y, uint16 mask) { \
+#define DEC16(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##16 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##16 mask) { \
     return shuffle((TEMPTYPE)(x, y), mask); \
   }
 
-#define DEC16X(TYPE) \
-  INLINE_OVERLOADABLE TYPE##16 shuffle2(TYPE##16 x, TYPE##16 y, uint16 mask) { \
+#define DEC16X(TYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##16 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##16 mask) { \
     TYPE##16 z; \
     z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
     z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
@@ -1990,23 +3965,29 @@ DEF(ulong)
     return z; \
   }
 
+#define DEFMASK(TYPE, MASKTYPE) \
+  DEC2(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+  DEC2(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+  DEC2(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+  DEC2X(TYPE, MASKTYPE) \
+  DEC4(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+  DEC4(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+  DEC4(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+  DEC4X(TYPE, MASKTYPE) \
+  DEC8(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+  DEC8(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+  DEC8(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+  DEC8X(TYPE, MASKTYPE) \
+  DEC16(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+  DEC16(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+  DEC16(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+  DEC16X(TYPE, MASKTYPE)
+
 #define DEF(TYPE) \
-  DEC2(TYPE, TYPE##2, TYPE##4) \
-  DEC2(TYPE, TYPE##4, TYPE##8) \
-  DEC2(TYPE, TYPE##8, TYPE##16) \
-  DEC2X(TYPE) \
-  DEC4(TYPE, TYPE##2, TYPE##4) \
-  DEC4(TYPE, TYPE##4, TYPE##8) \
-  DEC4(TYPE, TYPE##8, TYPE##16) \
-  DEC4X(TYPE) \
-  DEC8(TYPE, TYPE##2, TYPE##4) \
-  DEC8(TYPE, TYPE##4, TYPE##8) \
-  DEC8(TYPE, TYPE##8, TYPE##16) \
-  DEC8X(TYPE) \
-  DEC16(TYPE, TYPE##2, TYPE##4) \
-  DEC16(TYPE, TYPE##4, TYPE##8) \
-  DEC16(TYPE, TYPE##8, TYPE##16) \
-  DEC16X(TYPE)
+  DEFMASK(TYPE, uchar) \
+  DEFMASK(TYPE, ushort) \
+  DEFMASK(TYPE, uint) \
+  DEFMASK(TYPE, ulong)
 
 DEF(char)
 DEF(uchar)
@@ -2018,6 +3999,7 @@ DEF(float)
 DEF(long)
 DEF(ulong)
 #undef DEF
+#undef DEFMASK
 #undef DEC2
 #undef DEC2X
 #undef DEC4
@@ -2037,14 +4019,7 @@ void __gen_ocl_barrier_global(void);
 void __gen_ocl_barrier_local_and_global(void);
 
 typedef uint cl_mem_fence_flags;
-INLINE void barrier(cl_mem_fence_flags flags) {
-  if (flags == (CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE))
-    __gen_ocl_barrier_local_and_global();
-  else if (flags == CLK_LOCAL_MEM_FENCE)
-    __gen_ocl_barrier_local();
-  else if (flags == CLK_GLOBAL_MEM_FENCE)
-    __gen_ocl_barrier_global();
-}
+void barrier(cl_mem_fence_flags flags);
 
 INLINE void mem_fence(cl_mem_fence_flags flags) {
 }
@@ -2259,19 +4234,19 @@ int __gen_ocl_force_simd16(void);
 // Image access functions
 /////////////////////////////////////////////////////////////////////////////
 
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, uint sampler_offset);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
 
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, int u, int v, int w, uint sampler_offset);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, uint sampler, float u, float v, float w, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, int u, int v, int w, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, uint sampler, float u, float v, float w, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, int u, int v, int w, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, uint sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
 
 OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);
 OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, float u, float v, int4 color);
@@ -2291,7 +4266,7 @@ int __gen_ocl_get_image_height(uint surface_id);
 int __gen_ocl_get_image_channel_data_type(uint surface_id);
 int __gen_ocl_get_image_channel_order(uint surface_id);
 int __gen_ocl_get_image_depth(uint surface_id);
-ushort __gen_ocl_get_sampler_info(uint sampler_id);
+ushort __gen_ocl_get_sampler_info(sampler_t sampler);
 
 #define GET_IMAGE(cl_image, surface_id) \
     uint surface_id = (uint)cl_image
@@ -2411,7 +4386,7 @@ DECL_IMAGE(0, image2d_t, float4, f, 2)
       tmpCoord.s1 += -0x1p-9;                                   \
     if (tmpCoord.s2 < 0 && tmpCoord.s2 > -0x1p-20)              \
       tmpCoord.s2 += -0x1p-9;                                   \
-  } 
+  }
 
 DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 4)
 DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 4)
diff --git a/backend/src/update_blob_ocl_header.py b/backend/src/update_blob_ocl_header.py
index 197f16c..7d6907a 100755
--- a/backend/src/update_blob_ocl_header.py
+++ b/backend/src/update_blob_ocl_header.py
@@ -21,8 +21,8 @@ import sys
 import os
 
 if len(sys.argv) != 3:
-    print "Invalid argument {}".format(sys.argv)
-    print "use {} tmpl_file_name output_file_name".format(sys.argv[0])
+    print "Invalid argument {0}".format(sys.argv)
+    print "use {0} tmpl_file_name output_file_name".format(sys.argv[0])
     raise
 
 def safeUnlink(filename):
@@ -46,7 +46,7 @@ for tline in tmplFile:
     if matched_header == "":
         blob.write(tline)
         for header in header_segments:
-            if tline.strip() == '// ##BEGIN_{}##'.format(header.upper()) :
+            if tline.strip() == '// ##BEGIN_{0}##'.format(header.upper()) :
                 hFile = open(path + '/ocl_' + header + '.h', 'r')
                 lineNr = 0
                 for hline in hFile:
@@ -56,7 +56,7 @@ for tline in tmplFile:
                 hFile.close()
                 matched_header = header
     else:
-        if tline.strip() == '// ##END_{}##'.format(matched_header.upper()) :
+        if tline.strip() == '// ##END_{0}##'.format(matched_header.upper()) :
             blob.write(tline)
             matched_header = "";
 
diff --git a/docs/Beignet.mdwn b/docs/Beignet.mdwn
index 97b568b..7870c12 100644
--- a/docs/Beignet.mdwn
+++ b/docs/Beignet.mdwn
@@ -1,17 +1,80 @@
 Beignet
 =======
 
-Beignet is an open source implementaion of the OpenCL specification - a generic
+Beignet is an open source implementation of the OpenCL specification - a generic
 compute oriented API. This code base contains the code to run OpenCL programs on
-Intel GPUs which bsically defines and implements the OpenCL host functions
+Intel GPUs which basically defines and implements the OpenCL host functions
 required to initialize the device, create the command queues, the kernels and
 the programs and run them on the GPU. The code base also contains the compiler
 part of the stack which is included in `backend/`. For more specific information
 about the compiler, please refer to `backend/README.md`
 
-How to build
+Prerequisite
 ------------
 
+The project depends on the following external libraries:
+
+- Several X components (XLib, Xfixes, Xext)
+- libdrm libraries (libdrm and libdrm\_intel)
+- Various LLVM components
+- The compiler backend itself (libgbe)
+- Mesa git master version built with gbm enabled to support extension cl\_khr\_gl\_sharing.
+
+And if you want to work with the standard ICD libOpenCL.so, then you need
+two more packages (the following package name is for Ubuntu):
+
+- ocl-icd-dev
+- ocl-icd-libopencl1
+
+If you don't want to enable ICD, or your system doesn't have ICD OpenCL support,
+you can still link to the beignet OpenCL library. You can find the beignet/libcl.so
+in your system's library installation directories.
+
+Note that the compiler depends on LLVM (Low-Level Virtual Machine project).
+Right now, the code has been compiled with LLVM 3.3/3.4. It will not compile
+with anything older.
+
+[http://llvm.org/releases/](http://llvm.org/releases/)
+
+LLVM 3.3, 3.4 and 3.5 are supported. For now, the recommended LLVM version is 3.3.
+There are some severe OpenCL-related regressions in the current clang 3.4/3.5 versions.
+
+**Note about LLVM 3.4**
+
+* If you want to try Clang/LLVM 3.4, you need to disable terminfo:
+--disable-terminfo. It's a llvm 3.4 bug.
+
+**Note about LLVM 3.5**
+
+* If you want to try Clang/LLVM 3.5, you need to build the clang/llvm with cxx11 enabled:
+--enable-cxx11.
+
+**Note about OpenCV support**
+
+* We only fully tested the OpenCV 2.4 branch with beignet. And the pass rate is about 99%
+  for beignet 0.8.0. The preferred LLVM/Clang version is 3.3. One OpenCV patch is needed
+  to work with LLVM/clang, the patch is already submitted to the OpenCV upstream 2.4 repo
+  and is waiting for review: [pull request](https://github.com/Itseez/opencv/pull/2318).
+  Before it is merged, you need to apply that patch manually to OpenCV 2.4 branch.
+* As some OpenCL kernels (in the OpenCV 2.4 OCL test suite) run for more than 10 seconds, they may
+  be reset by the Linux kernel, as the kernel has a GPU hangcheck mechanism. You can disable the
+  hangcheck by invoking the following command on an Ubuntu system:
+
+  `# echo -n 0 > /sys/module/i915/parameters/enable_hangcheck`
+
+  But this command is a little bit dangerous: if your kernel hangs, then the GPU will hang
+  forever.
+* For the OpenCV 3.0 branch, the pass rate may be a little bit lower than for the 2.4 branch.
+
+Also note that the code was compiled on GCC 4.6, GCC 4.7 and GCC 4.8. Since the code uses
+really recent C++11 features, you may expect problems with older compilers. Last
+time I tried, the code breaks ICC 12 and Clang with internal compiler errors
+while compiling anonymous nested lambda functions.
+
+
+How to build and install
+------------------------
+
 The project uses CMake with three profiles:
 
 1. Debug (-g)
@@ -26,41 +89,28 @@ Basically, from the root directory of the project
 
 `> cmake ../ # to configure`
 
-Choose whatever you want for the build.
-
-Then press 'c' to configure and 'g' to generate the code.
+CMake will check the dependencies and will complain if it does not find them.
 
 `> make`
 
-The project depends on several external libraries:
-
-- Several X components (XLib, Xfixes, Xext)
-- libdrm libraries (libdrm and libdrm\_intel)
-- Various LLVM components
-- The compiler backend itself (libgbe)
-- Mesa git master version built with gbm enabled to support extension cl\_khr\_gl\_sharing.
-
-CMake will check the dependencies and will complain if it does not find them.
-
-The cmake will also build the backend project. Please refer to:
+CMake will build the backend first. Please refer to:
 [[OpenCL Gen Backend|Beignet/Backend]] to get more dependencies.
 
 Once built, the run-time produces a shared object libcl.so which basically
 directly implements the OpenCL API. A set of tests are also produced. They may
 be found in `utests/`.
 
-Note that the compiler depends on LLVM (Low-Level Virtual Machine project).
-Right now, the code has been compiled with LLVM 3.1/3.2. It will not compile
-with any thing older.
-
-[http://llvm.org/releases/](http://llvm.org/releases/)
+Simply invoke:
+`> make install`
 
-LLVM 3.1,3.2,3.3 and 3.4 are supported.
+It installs the following files to the beignet/ directory relative to
+your library installation directory.
+- libcl.so
+- ocl\_stdlib.h, ocl\_stdlib.h.pch
+- beignet.bc
 
-Also note that the code was compiled on GCC 4.6 and GCC 4.7. Since the code uses
-really recent C++11 features, you may expect problems with older compilers. Last
-time I tried, the code breaks ICC 12 and Clang with internal compiler errors
-while compiling anonymous nested lambda functions.
+It installs the OCL icd vendor files to /etc/OpenCL/vendors, if the system supports ICD.
+- intel-beignet.icd
 
 How to run
 ----------
@@ -70,8 +120,10 @@ this code also produces various tests to ensure the compiler and the run-time
 consistency. This small test framework uses a simple c++ registration system to
 register all the unit tests.
 
-You need to set the variable `OCL_KERNEL_PATH` to locate the OCL kernels. They
-are with the run-time in `./kernels`.
+You first need to source setenv.sh in the utests/ directory to set some environment
+variables, as shown below:
+
+`> . setenv.sh`
 
 Then in `utests/`:
 
@@ -86,27 +138,38 @@ will only run `some_unit_test0` and `some_unit_test1` tests
 Supported Hardware
 ------------------
 
-The code was tested on IVB GT2 with ubuntu and fedora core distribution.
-Currently Only IVB is supported right now. Actually, the code was only run on IVB GT2. You
-may expect some issues with IVB GT1.
+The code was tested on IVB GT2 with the Ubuntu and Fedora Core distributions. The recommended
+kernel version is 3.11 or newer. Currently, only IVB is supported.
+The code was run on IVB GT2/GT1, and both systems are well supported now.
 
 TODO
 ----
 
-The run-time is far from being complete. Most of the pieces have been put
-together to test and develop the OpenCL compiler. A partial list of things to
-do:
+In terms of the OpenCL 1.1 spec, beignet is quite complete now. We can pass almost
+all the piglit OpenCL test cases now. And the pass rate for the OpenCV test suite
+is also good. There are still some remaining work items, listed below; most of them
+are related to extension support and performance.
+
+- Performance tuning. Until now, the focus of the beignet project has been to implement all
+  the mandatory functions/features specified by the OpenCL spec. There are plenty
+  of things that need to be done for performance tuning. For example, the extremely slow
+  software-based sin/cos/... math functions, used because the native math instructions lack
+  the necessary precision. Also, all the code is inlined, which increases the icache miss rate
+  significantly. Many other items are partially listed
+  [[here|Beignet/Backend/TODO]]. We will focus on performance tuning after version 0.8.
 
 - Complete cl\_khr\_gl\_sharing support. We lack of some APIs implementation such
   as clCreateFromGLBuffer,clCreateFromGLRenderbuffer,clGetGLObjectInfo... Currently,
-  the working APIs are clCreateFromGLTexture,clCreateFromGLTexture2D.
+  the working APIs are clCreateFromGLTexture,clCreateFromGLTexture2D. This work
+  highly depends on mesa support. It seems that mesa will not provide this type
+  of extension, so we may have to hack the mesa source code to support this extension.
 
 - Check that NDRangeKernels can be pushed into _different_ queues from several
   threads.
 
 - No state tracking at all. One batch buffer is created at each "draw call"
   (i.e. for each NDRangeKernels). This is really inefficient since some
-  expensive pipe controls are issued for each batch buffer
+  expensive pipe controls are issued for each batch buffer.
 
 - Valgrind reports some leaks in libdrm. It sounds like a false positive but it
   has to be checked. Idem for LLVM. There is one leak here to check.
@@ -117,12 +180,15 @@ does not comply with the standard or it is just missing)
 
 Project repository
 ------------------
-Right now, we host our project on fdo at: git://anongit.freedesktop.org/beignet.
+Right now, we host our project on fdo at:
+[http://cgit.freedesktop.org/beignet/](http://cgit.freedesktop.org/beignet/).
+And the intel 01.org:
+[https://01.org/beignet](https://01.org/beignet)
 
 The team
 --------
-This project was created by Ben Segovia when he was working for Intel. Now we
-have a team in China OTC graphics department continue to work on this project.
+The Beignet project was created by Ben Segovia. Since 2013, the
+Intel China OTC graphics team has continued to work on this project.
 The official contact for this project is: Zou Nanhai (<nanhai.zou at intel.com>).
 
 How to contribute
diff --git a/docs/Beignet/Backend/TODO.mdwn b/docs/Beignet/Backend/TODO.mdwn
index adc7fd2..7728d6a 100644
--- a/docs/Beignet/Backend/TODO.mdwn
+++ b/docs/Beignet/Backend/TODO.mdwn
@@ -1,14 +1,15 @@
 TODO
 ====
 
-The compiler is far from complete. Even if the skeleton is now done and should
-be solid, There are a _lot_ of things to do from trivial to complex.
+The compiler is quite complete now in terms of functionality. It can pass
+almost all of the piglit OCL test cases and the pass rate for the OpenCV test
+suite is also quite good now. But there are plenty of things to do for the
+final performance tuning.
 
 OpenCL standard library
 -----------------------
 
-Today we define the OpenCL API in header file `src/ocl_stdlib.h`. This file is
-from being complete.
+Today we define the OpenCL API in header file `src/ocl_stdlib.h`.
 
 By the way, one question remains: do we want to implement
 the high-precision functions as _inline_ functions or as external functions to
@@ -19,23 +20,36 @@ do both actually.
 LLVM front-end
 --------------
 
-The code is defined in `src/llvm`.  We used the PTX ABI and the OpenCL profile
+The code is defined in `src/llvm`.  We used the SPIR and the OpenCL profile
 to compile the code. Therefore, a good part of the job is already done. However,
 many things must be implemented:
 
-- Lowering down of various intrinsics like `llvm.memcpy`
-
 - Better resolving of the PHI functions. Today, we always generate MOV
   instructions at the end of each basic block . They can be easily optimized.
 
 - From LLVM 3.3, we use SPIR IR. We need to use the compiler defined type to
   represent sampler_t/image2d_t/image1d_t/....
 
+- Consider using libclc in our project to avoid using the PCH, which is not
+  compatible across different clang versions. We may also contribute what we have done in
+  ocl_stdlib.h to libclc if possible.
+
+- Optimize math functions. If the native math instructions don't comply with the
+  OCL spec, we implement those math functions purely in software, which
+  is extremely slow. For example, cos and sin on the HD4000 platform are very slow.
+  Some applications may not need such highly accurate results, so we may
+  provide a mechanism to use native_xxx functions instead of the extremely slow
+  version.
+
 Gen IR
 ------
 
 The code is defined in `src/ir`. Main things to do are:
 
+- Implement llvm.memset/llvm.memcpy more efficiently. Currently, we lower
+  them as normal memcpy at the llvm module level, without taking into account that
+  the intrinsics all have a constant data length.
+
 - Finishing the handling of function arguments (see the [[IR
   description|gen_ir]] for more details)
 
@@ -54,6 +68,11 @@ The code is defined in `src/ir`. Main things to do are:
   This will obviously impact both instruction selection and the register
   allocation.
 
+- Implement a fast path for small local variables. When the kernel only defines
+  a small local array/variable, there is a good chance to allocate the local
+  array/variable in register space rather than system memory. This will eliminate
+  a lot of memory loads/stores from system memory.
+
 Backend
 -------
 
@@ -64,7 +83,14 @@ The code is defined in `src/backend`. Main things to do are:
 - Implementing proper instruction selection. A "simple" tree matching algorithm
   should provide good results for Gen
 
-- Improving the instruction scheduling pass
+- Improving the instruction scheduling pass. The current scheduling code has some bugs,
+  so it is currently disabled by default. We need to fix them in the future.
+
+- Some instructions are introduced in the last code generation stage. We need to
+  introduce a pass after that to eliminate dead instruction or duplicate MOVs and
+  some instructions with zero operands.
+
+- leverage the structured if/endif for branching processing ?
 
 General plumbing
 ----------------
diff --git a/docs/Beignet/Backend/compiler_backend.mdwn b/docs/Beignet/Backend/compiler_backend.mdwn
index 32028b6..3c489b2 100644
--- a/docs/Beignet/Backend/compiler_backend.mdwn
+++ b/docs/Beignet/Backend/compiler_backend.mdwn
@@ -83,22 +83,25 @@ file is very flexible i.e. it can (almost) be freely partitioned. To handle this
 peculiarity, we simply implemented a free list based generic memory allocator as
 done with `RegisterFilePartitioner` in `src/backend/context.cpp`.
 
-We then simply implemented a linear scan allocator (see
-`gen_reg_allocation.cpp`). The spilling is not implemented and is still a work
-in progress. The thing is that spilling must be specifically handled with Gen.
-Indeed:
-
-1. Bad point. Spilling is expensive and require to assemble messages for it
+We provide two directions of memory allocation. The tail-to-head direction is
+used for normal registers, and the head-to-tail direction is used for curbe payload
+register allocation.
 
-2. Good point. Gen is able to spill up to 256 _contiguous_ bytes in one message.
-This must be used for high performance spilling and this may require to reorder
-properly registers to spill.
+We then simply implemented a linear scan allocator (see
+`gen_reg_allocation.cpp`). The spilling is implemented in the same file. The
+heuristic we use is the register's end point: it always tries to spill the
+register with the largest liveness end point if possible. Although Gen supports
+spilling 4 SIMD8 registers at once, we only support one currently. We need to optimize
+it later, at least for vector spilling. A new pass in the backend
+to find opportunities to gather more spilled registers into one contiguous area
+may also be worth doing. We can also consider the spilled register's interval to
+do smarter scratch memory allocation and reduce the scratch memory requirement.
 
 Instruction scheduling
 ----------------------
 
-Intra-basic block instruction scheduling is relatively simple. It is not
-implemented yet.
+Intra-basic block instruction scheduling is relatively simple. It is implemented
+but has known bugs; further effort is needed to fix them.
 
 Instruction encoding
 --------------------
@@ -108,3 +111,8 @@ This is mostly done in `src/backend/gen_context.cpp` and
 straightforward. We just forward the selection code using the physically
 allocated registers. There is nothing special here. Just boilerplate.
 
+There are currently plenty of huge macro instructions in `gen_context.cpp`.
+Most of them are for long/double support on Gen platforms which don't support
+long/double at the hardware level. We may need to clean up and move those non-hardware
+related functions into an upper layer. Having too many huge instructions makes
+register spilling and dead code elimination harder and less efficient.
diff --git a/docs/Beignet/Backend/gen_ir.mdwn b/docs/Beignet/Backend/gen_ir.mdwn
index ae24729..424e596 100644
--- a/docs/Beignet/Backend/gen_ir.mdwn
+++ b/docs/Beignet/Backend/gen_ir.mdwn
@@ -63,12 +63,12 @@ Very limited IR
 ---------------
 
 The other major question, in particular when you look similar stacks like NVidia
-PTX, is:
+SPIR, is:
 
 do we need to encode in the IR register modifiers (abs, negate...) and immediate
 registers (like in add.f x y 1.0)?
 
-Contrary to other IRs (PTX and even LLVM that both supports immediates), we also
+Contrary to other IRs (SPIR and even LLVM that both supports immediates), we also
 chose to have a very simply IR, much simpler than the final ISA, and to merge
 back what we need at the instruction selection pass. Since we need instruction
 selection, let us keep the IR simple.
@@ -123,8 +123,8 @@ LLVM backends
 
 Since we will need to do some significant work anyway, this leads us to choose a
 more hard-coded path with a in-house IR. Note that will not prevent us from
-implementing later a LLVM backend "by the book" as Nvidia does today with PTX
-(using a LLVM backend to do the LLVM IR -> PTX conversion)
+implementing later a LLVM backend "by the book" as Nvidia does today with SPIR
+(using a LLVM backend to do the LLVM IR -> SPIR conversion)
 
 
 SSA or no SSA
@@ -158,7 +158,7 @@ The IR is organized as follows:
 
 - Functions (defined in `src/ir/function.*pp`). They are basically the counter
   part of LLVM functions or OpenCL kernels. Note that function arguments are a
-  problem. We actually use the PTX ABI. Everything smaller than the machine word
+  problem. We actually use the SPIR ABI. Everything smaller than the machine word
   size (i.e. 32 bits for Gen) is passed by value with a register. Everything
   else which is bigger than is passed by pointer with a ByVal attribute.
   Note that requires some special treatment in the IR (see below) to make the
@@ -174,7 +174,7 @@ Function arguments and pushed constants
 ---------------------------------------
 
 Gen can push values into the register file i.e. some registers are preset when
-the kernel starts to run. As detailed previously, the PTX ABI is convenient
+the kernel starts to run. As detailed previously, the SPIR ABI is convenient
 since every argument is either one register or one pointer to load from or to
 store to.
 
diff --git a/intel-beignet.icd.in b/intel-beignet.icd.in
new file mode 100644
index 0000000..9b2e349
--- /dev/null
+++ b/intel-beignet.icd.in
@@ -0,0 +1 @@
+@LIB_INSTALL_DIR@/beignet/libcl.so
diff --git a/kernels/builtin_exp.cl b/kernels/builtin_exp.cl
new file mode 100644
index 0000000..ecc1a3e
--- /dev/null
+++ b/kernels/builtin_exp.cl
@@ -0,0 +1,10 @@
+__kernel void builtin_exp(__global float *dst, __global float *src, __global int *max_func) {
+  int i = get_global_id(0);
+  float x = src[i];
+
+  dst[i * (*max_func) + 0] = exp(x);
+  dst[i * (*max_func) + 1] = exp2(x);
+  dst[i * (*max_func) + 2] = exp10(x);
+  dst[i * (*max_func) + 3] = expm1(x);
+  dst[i * (*max_func) + 4] = x;
+};
diff --git a/kernels/builtin_pow.cl b/kernels/builtin_pow.cl
new file mode 100644
index 0000000..17d753e
--- /dev/null
+++ b/kernels/builtin_pow.cl
@@ -0,0 +1,7 @@
+kernel void builtin_pow(global float *dst, global float *src1, global float *src2, global int *max_func) {
+
+  int i = get_global_id(0);
+  dst[i * (*max_func) + 0] = pow(src1[i], src2[i]);
+  dst[i * (*max_func) + 1] = src1[i];
+
+}
diff --git a/kernels/compiler_function_argument3.cl b/kernels/compiler_function_argument3.cl
new file mode 100644
index 0000000..9395cd7
--- /dev/null
+++ b/kernels/compiler_function_argument3.cl
@@ -0,0 +1,71 @@
+struct sfloat8 {
+    float a;
+    float b;
+    float c;
+    float d;
+    float e;
+    float f;
+    float g;
+    float h;
+};
+
+
+__kernel void compiler_function_argument3(
+struct sfloat8 f, __global struct sfloat8 *result)
+{
+  result[0].a = f.a;
+  result[0].b = 12.0f;
+  result[0].c = 12.0f;
+  result[0].d = 12.0f;
+  result[0].e = 12.0f;
+  result[0].f = 12.0f;
+  result[0].g = 12.0f;
+  result[0].h = f.a + f.h;
+
+  result[1].a = f.a;
+  result[1].b = 12.0f;
+  result[1].c = 12.0f;
+  result[1].d = 12.0f;
+  result[1].e = 12.0f;
+  result[1].f = 12.0f;
+  result[1].g = 12.0f;
+  result[1].h = f.a + f.h;
+
+  result[2].a = f.a;
+  result[2].b = 12.0f;
+  result[2].c = 12.0f;
+  result[2].d = 12.0f;
+  result[2].e = 12.0f;
+  result[2].f = 12.0f;
+  result[2].g = 12.0f;
+  result[2].h = f.a + f.h;
+
+  result[3].a = f.a;
+  result[3].b = 12.0f;
+  result[3].c = 12.0f;
+  result[3].d = 12.0f;
+  result[3].e = 12.0f;
+  result[3].f = 12.0f;
+  result[3].g = 12.0f;
+  result[3].h = f.a + f.h;
+
+  result[4].a = f.a;
+  result[4].b = 12.0f;
+  result[4].c = 12.0f;
+  result[4].d = 12.0f;
+  result[4].e = 12.0f;
+  result[4].f = 12.0f;
+  result[4].g = 12.0f;
+  result[4].h = f.a + f.h;
+
+  result[5].a = f.a;
+  result[5].b = 12.0f;
+  result[5].c = 12.0f;
+  result[5].d = 12.0f;
+  result[5].e = 12.0f;
+  result[5].f = 12.0f;
+  result[5].g = 12.0f;
+  result[5].h = f.a + f.h;
+
+  result[6] = result[0];
+}
diff --git a/kernels/compiler_global_constant.cl b/kernels/compiler_global_constant.cl
index 53e24b3..c0e23d1 100644
--- a/kernels/compiler_global_constant.cl
+++ b/kernels/compiler_global_constant.cl
@@ -1,5 +1,5 @@
 constant int m[3] = {71,72,73};
-constant int n = 1;
+const constant int n = 1;
 constant int o[3] = {3, 2, 1};
 
 constant int4 a= {1, 2, 3, 4};
diff --git a/kernels/compiler_long.cl b/kernels/compiler_long.cl
index 3087292..e69c5bf 100644
--- a/kernels/compiler_long.cl
+++ b/kernels/compiler_long.cl
@@ -1,7 +1,8 @@
-kernel void compiler_long(global long *src1, global long *src2, global long *dst) {
+kernel void compiler_long(global long *src1, global long *src2, global long *dst, long zero) {
   int i = get_global_id(0);
+
   if(i < 5)
-    dst[i] = src1[i] + src2[i];
+    dst[i] = src1[i] + src2[i] + src2[i]*zero;
   if(i > 5)
-    dst[i] = src1[i] - src2[i];
+    dst[i] = src1[i] - src2[i] - zero;
 }
diff --git a/kernels/compiler_menger_sponge_no_shadow.cl b/kernels/compiler_menger_sponge_no_shadow.cl
index 4de6c10..27b059a 100644
--- a/kernels/compiler_menger_sponge_no_shadow.cl
+++ b/kernels/compiler_menger_sponge_no_shadow.cl
@@ -15,7 +15,7 @@ typedef float4 vec4;
 
 // fmod is not like glsl mod!
 inline __attribute__((always_inline, overloadable))
-float glsl_mod(float x,float y) { return x-y*floor(x/y); }
+float glsl_mod(float x,float y) { return mad( -y, floor(x/y), x); }
 inline __attribute__((always_inline, overloadable))
 float2 glsl_mod(float2 a,float2 b) { return (float2)(glsl_mod(a.x,b.x), glsl_mod(a.y,b.y)); }
 inline __attribute__((always_inline, overloadable))
diff --git a/kernels/compiler_private_data_overflow.cl b/kernels/compiler_private_data_overflow.cl
new file mode 100644
index 0000000..d0f557d
--- /dev/null
+++ b/kernels/compiler_private_data_overflow.cl
@@ -0,0 +1,10 @@
+kernel void compiler_private_data_overflow( __global int4 *output )
+{
+	int4 data[65];
+	for( int i=0; i<65; ++i )
+	{
+		data[i] = (int4)i;
+	}
+	if( get_global_id(0) == 1 )
+		*output = data[0];
+}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1e28c6c..95ff56f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,9 +1,9 @@
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}
                     ${DRM_INCLUDE_PATH}
+                    ${DRM_INCLUDE_PATH}/../
                     ${CMAKE_CURRENT_SOURCE_DIR}/../backend/src/backend/
                     ${CMAKE_CURRENT_SOURCE_DIR}/../include
                     ${MESA_SOURCE_INCLUDES})
-
 macro (MakeKernelBinStr KERNEL_PATH KERNEL_FILES)
 foreach (KF ${KERNEL_FILES})
   set (input_file ${KERNEL_PATH}/${KF}.cl)
@@ -12,7 +12,7 @@ foreach (KF ${KERNEL_FILES})
   add_custom_command(
     OUTPUT ${output_file}
     COMMAND rm -rf ${output_file}
-    COMMAND ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater -s ${input_file} -o${output_file}
+    COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file}
     DEPENDS ${input_file} ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater)
 endforeach (KF)
 endmacro (MakeKernelBinStr)
@@ -39,6 +39,7 @@ set(OPENCL_SRC
     cl_command_queue.c
     cl_command_queue.h
     cl_command_queue_gen7.c
+    cl_thread.c
     cl_driver.h
     cl_driver.cpp
     cl_driver_defs.c
@@ -77,4 +78,4 @@ target_link_libraries(
                       ${DRM_LIBRARY}
                       ${OPENGL_LIBRARIES}
                       ${OPTIONAL_EGL_LIBRARY})
-install (TARGETS cl LIBRARY DESTINATION lib)
+install (TARGETS cl LIBRARY DESTINATION ${LIB_INSTALL_DIR}/beignet)
diff --git a/src/OCLConfig.h.in b/src/OCLConfig.h.in
index 8662584..71de4b3 100644
--- a/src/OCLConfig.h.in
+++ b/src/OCLConfig.h.in
@@ -1,5 +1,6 @@
 // the configured options and settings for LIBCL
 #define LIBCL_DRIVER_VERSION_MAJOR @LIBCL_DRIVER_VERSION_MAJOR@
 #define LIBCL_DRIVER_VERSION_MINOR @LIBCL_DRIVER_VERSION_MINOR@
+#define LIBCL_DRIVER_VERSION_PATCH @LIBCL_DRIVER_VERSION_PATCH@
 #define LIBCL_C_VERSION_MAJOR @LIBCL_C_VERSION_MAJOR@
 #define LIBCL_C_VERSION_MINOR @LIBCL_C_VERSION_MINOR@
diff --git a/src/cl_api.c b/src/cl_api.c
index 0e562ed..2a6f8ce 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -70,6 +70,13 @@ handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
   cl_event e;
   if(event != NULL || status == CL_ENQUEUE_EXECUTE_DEFER) {
     e = cl_event_new(queue->ctx, queue, type, event!=NULL);
+
+    /* if need profiling, add the submit timestamp here. */
+    if (e->type != CL_COMMAND_USER &&
+	    e->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+	cl_event_get_timestamp(e, CL_PROFILING_COMMAND_QUEUED);
+    }
+
     if(event != NULL)
       *event = e;
     if(status == CL_ENQUEUE_EXECUTE_DEFER) {
@@ -482,13 +489,17 @@ clCreateSubBuffer(cl_mem                buffer,
                   const void *          buffer_create_info,
                   cl_int *              errcode_ret)
 {
-#if 0
+  cl_mem mem = NULL;
   cl_int err = CL_SUCCESS;
-  CHECK_MEM (buffer);
-  NOT_IMPLEMENTED;
+
+  CHECK_MEM(buffer);
+
+  mem = cl_mem_new_sub_buffer(buffer, flags, buffer_create_type,
+                       buffer_create_info, &err);
 error:
-#endif
-  return NULL;
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem;
 }
 
 cl_mem
@@ -735,8 +746,25 @@ clGetSamplerInfo(cl_sampler       sampler,
                  void *           param_value,
                  size_t *         param_value_size_ret)
 {
-  NOT_IMPLEMENTED;
-  return 0;
+  cl_int err = CL_SUCCESS;
+  CHECK_SAMPLER (sampler);
+
+  if (param_name == CL_SAMPLER_REFERENCE_COUNT) {
+    FILL_GETINFO_RET (cl_uint, 1, (cl_uint*)&sampler->ref_n, CL_SUCCESS);
+  } else if (param_name == CL_SAMPLER_CONTEXT) {
+    FILL_GETINFO_RET (cl_context, 1, &sampler->ctx, CL_SUCCESS);
+  } else if (param_name == CL_SAMPLER_NORMALIZED_COORDS) {
+    FILL_GETINFO_RET (cl_bool, 1, &sampler->normalized_coords, CL_SUCCESS);
+  } else if (param_name == CL_SAMPLER_ADDRESSING_MODE) {
+    FILL_GETINFO_RET (cl_addressing_mode, 1, &sampler->address, CL_SUCCESS);
+  } else if (param_name == CL_SAMPLER_FILTER_MODE ) {
+    FILL_GETINFO_RET (cl_filter_mode, 1, &sampler->filter, CL_SUCCESS);
+  } else{
+    return CL_INVALID_VALUE;
+  }
+
+error:
+  return err;
 }
 
 cl_program
@@ -888,19 +916,31 @@ clGetProgramInfo(cl_program       program,
     FILL_GETINFO_RET (char, (strlen(program->source) + 1),
                    program->source, CL_SUCCESS);
   } else if (param_name == CL_PROGRAM_BINARY_SIZES) {
-    FILL_GETINFO_RET (size_t, 1, (&program->bin_sz), CL_SUCCESS);
+    if (program->binary == NULL) {
+      program->binary_sz = gbe_program_serialize_to_binary(program->opaque, &program->binary);
+    }
+
+    if (program->binary == NULL || program->binary_sz == 0) {
+      return CL_OUT_OF_RESOURCES;
+    }
+    FILL_GETINFO_RET (size_t, 1, (&program->binary_sz), CL_SUCCESS);
   } else if (param_name == CL_PROGRAM_BINARIES) {
+    if (param_value_size_ret)
+      *param_value_size_ret = sizeof(void*);
     if (!param_value)
       return CL_SUCCESS;
 
     /* param_value points to an array of n
        pointers allocated by the caller */
-    if (program->bin_sz > 0) {
-      memcpy(*((void **)param_value), program->bin, program->bin_sz);
-    } else {
-      memcpy(*((void **)param_value), ret_str, 1);
+    if (program->binary == NULL) {
+      program->binary_sz = gbe_program_serialize_to_binary(program->opaque, &program->binary);
+    }
+
+    if (program->binary == NULL || program->binary_sz == 0) {
+      return CL_OUT_OF_RESOURCES;
     }
 
+    memcpy(*((void **)param_value), program->binary, program->binary_sz);
     return CL_SUCCESS;
   } else {
     return CL_INVALID_VALUE;
@@ -942,8 +982,9 @@ clGetProgramBuildInfo(cl_program             program,
 
     FILL_GETINFO_RET (char, (strlen(ret_str)+1), ret_str, CL_SUCCESS);
   } else if (param_name == CL_PROGRAM_BUILD_LOG) {
-    // TODO: need to add logs in backend when compiling.
-    FILL_GETINFO_RET (char, (strlen(ret_str)+1), ret_str, CL_SUCCESS);
+    FILL_GETINFO_RET (char, program->build_log_sz + 1, program->build_log, CL_SUCCESS);
+    if (param_value_size_ret)
+      *param_value_size_ret = program->build_log_sz + 1;
   } else {
     return CL_INVALID_VALUE;
   }
@@ -961,7 +1002,7 @@ clCreateKernel(cl_program   program,
   cl_int err = CL_SUCCESS;
 
   CHECK_PROGRAM (program);
-  if (program->is_built == CL_FALSE) {
+  if (program->ker_n <= 0) {
     err = CL_INVALID_PROGRAM_EXECUTABLE;
     goto error;
   }
@@ -983,7 +1024,7 @@ clCreateKernelsInProgram(cl_program      program,
   cl_int err = CL_SUCCESS;
 
   CHECK_PROGRAM (program);
-  if (program->is_built == CL_FALSE) {
+  if (program->ker_n <= 0) {
     err = CL_INVALID_PROGRAM_EXECUTABLE;
     goto error;
   }
@@ -1075,7 +1116,8 @@ clGetKernelWorkGroupInfo(cl_kernel                   kernel,
                          void *                      param_value,
                          size_t *                    param_value_size_ret)
 {
-  return cl_get_kernel_workgroup_info(device,
+  return cl_get_kernel_workgroup_info(kernel,
+                                      device,
                                       param_name,
                                       param_value_size,
                                       param_value,
@@ -1228,23 +1270,32 @@ clGetEventProfilingInfo(cl_event             event,
 
   CHECK_EVENT(event);
 
-  if (!(event->queue->props & CL_QUEUE_PROFILING_ENABLE) ||
-          event->type == CL_COMMAND_USER ||
+  if (event->type == CL_COMMAND_USER ||
+      !(event->queue->props & CL_QUEUE_PROFILING_ENABLE) ||
           event->status != CL_COMPLETE) {
     err = CL_PROFILING_INFO_NOT_AVAILABLE;
     goto error;
   }
 
-  if ((param_name != CL_PROFILING_COMMAND_QUEUED &&
-          param_name != CL_PROFILING_COMMAND_SUBMIT &&
-          param_name != CL_PROFILING_COMMAND_START &&
-          param_name != CL_PROFILING_COMMAND_END) ||
-          (param_value && param_value_size < sizeof(cl_ulong))) {
+  if (param_value && param_value_size < sizeof(cl_ulong)) {
     err = CL_INVALID_VALUE;
     goto error;
   }
 
-  err = cl_event_profiling(event, param_name, &ret_val);
+  if (param_name == CL_PROFILING_COMMAND_QUEUED) {
+    ret_val = event->timestamp[0];
+  } else if (param_name == CL_PROFILING_COMMAND_SUBMIT) {
+    ret_val = event->timestamp[1];
+  } else if (param_name == CL_PROFILING_COMMAND_START) {
+    err = cl_event_get_timestamp(event, CL_PROFILING_COMMAND_START);
+    ret_val = event->timestamp[2];
+  } else if (param_name == CL_PROFILING_COMMAND_END) {
+    err = cl_event_get_timestamp(event, CL_PROFILING_COMMAND_END);
+    ret_val = event->timestamp[3];
+  } else {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
 
   if (err == CL_SUCCESS) {
     if (param_value)
@@ -1319,7 +1370,7 @@ clEnqueueReadBuffer(cl_command_queue command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
@@ -1381,7 +1432,9 @@ clEnqueueReadBufferRect(cl_command_queue command_queue,
     goto error;
   }
 
-  if ((buffer_origin[2]+region[2])*buffer_slice_pitch + (buffer_origin[1]+region[1])*buffer_row_pitch + buffer_origin[0] + region[0] > buffer->size) {
+  if ((buffer_origin[2] + region[2] - 1) * buffer_slice_pitch
+         + (buffer_origin[1] + region[1] - 1) * buffer_row_pitch
+         + buffer_origin[0] + region[0] > buffer->size) {
     err = CL_INVALID_VALUE;
     goto error;
   }
@@ -1402,7 +1455,7 @@ clEnqueueReadBufferRect(cl_command_queue command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_READ_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
@@ -1452,7 +1505,7 @@ clEnqueueWriteBuffer(cl_command_queue    command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_WRITE_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
@@ -1514,7 +1567,9 @@ clEnqueueWriteBufferRect(cl_command_queue     command_queue,
     goto error;
   }
 
-  if ((buffer_origin[2]+region[2])*buffer_slice_pitch + (buffer_origin[1]+region[1])*buffer_row_pitch + buffer_origin[0] + region[0] > buffer->size) {
+  if ((buffer_origin[2] + region[2] - 1) * buffer_slice_pitch
+         + (buffer_origin[1] + region[1] - 1) * buffer_row_pitch
+         + buffer_origin[0] + region[0] > buffer->size) {
     err = CL_INVALID_VALUE;
     goto error;
   }
@@ -1535,7 +1590,7 @@ clEnqueueWriteBufferRect(cl_command_queue     command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_WRITE_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
@@ -1575,7 +1630,7 @@ clEnqueueCopyBuffer(cl_command_queue     command_queue,
     err = CL_INVALID_VALUE;
     goto error;
   }
-  if (dst_offset < 0 || dst_offset + cb > src_buffer->size) {
+  if (dst_offset < 0 || dst_offset + cb > dst_buffer->size) {
     err = CL_INVALID_VALUE;
     goto error;
   }
@@ -1588,7 +1643,22 @@ clEnqueueCopyBuffer(cl_command_queue     command_queue,
     goto error;
   }
 
-  // TODO: Need to check the sub buffer cases.
+  /* Check sub-buffer overlap */
+  if (src_buffer->type == CL_MEM_SUBBUFFER_TYPE && dst_buffer->type == CL_MEM_SUBBUFFER_TYPE ) {
+    struct _cl_mem_buffer* src_b = (struct _cl_mem_buffer*)src_buffer;
+    struct _cl_mem_buffer* dst_b = (struct _cl_mem_buffer*)dst_buffer;
+    size_t src_sub_offset = src_b->sub_offset;
+    size_t dst_sub_offset = dst_b->sub_offset;
+
+    if ((src_offset + src_sub_offset <= dst_offset + dst_sub_offset
+          && dst_offset + dst_sub_offset <= src_offset + src_sub_offset + cb - 1)
+     && (dst_offset + dst_sub_offset <= src_offset + src_sub_offset
+          && src_offset + src_sub_offset <= dst_offset + dst_sub_offset + cb - 1)) {
+      err = CL_MEM_COPY_OVERLAP;
+      goto error;
+    }
+  }
+
   err = cl_mem_copy(command_queue, src_buffer, dst_buffer, src_offset, dst_offset, cb);
 
   TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx);
@@ -1599,6 +1669,11 @@ clEnqueueCopyBuffer(cl_command_queue     command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_COPY_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
     err = cl_command_queue_flush(command_queue);
   }
   return 0;
@@ -1662,8 +1737,12 @@ clEnqueueCopyBufferRect(cl_command_queue     command_queue,
     goto error;
   }
 
-  if ((src_origin[2]+region[2])*src_slice_pitch + (src_origin[1]+region[1])*src_row_pitch + src_origin[0] + region[0] > src_buffer->size ||
-      (dst_origin[2]+region[2])*dst_slice_pitch + (dst_origin[1]+region[1])*dst_row_pitch + dst_origin[0] + region[0] > dst_buffer->size) {
+  if ((src_origin[2] + region[2] - 1) * src_slice_pitch
+        + (src_origin[1] + region[1] - 1) * src_row_pitch
+        + src_origin[0] + region[0] > src_buffer->size
+      ||(dst_origin[2] + region[2] - 1) * dst_slice_pitch
+          + (dst_origin[1] + region[1] - 1) * dst_row_pitch
+          + dst_origin[0] + region[0] > dst_buffer->size) {
     err = CL_INVALID_VALUE;
     goto error;
   }
@@ -1690,6 +1769,11 @@ clEnqueueCopyBufferRect(cl_command_queue     command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_COPY_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
     err = cl_command_queue_flush(command_queue);
   }
 
@@ -1768,7 +1852,7 @@ clEnqueueReadImage(cl_command_queue      command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_READ_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
@@ -1847,7 +1931,7 @@ clEnqueueWriteImage(cl_command_queue     command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_WRITE_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
@@ -1924,6 +2008,11 @@ clEnqueueCopyImage(cl_command_queue      command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_COPY_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
     err = cl_command_queue_flush(command_queue);
   }
 
@@ -1980,6 +2069,11 @@ clEnqueueCopyImageToBuffer(cl_command_queue  command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_COPY_IMAGE_TO_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
     err = cl_command_queue_flush(command_queue);
   }
 
@@ -2036,6 +2130,11 @@ clEnqueueCopyBufferToImage(cl_command_queue  command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_COPY_BUFFER_TO_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
     err = cl_command_queue_flush(command_queue);
   }
 
@@ -2047,15 +2146,22 @@ static cl_int _cl_map_mem(cl_mem mem, void **ptr, void **mem_ptr, size_t offset,
 {
   cl_int slot = -1;
   int err = CL_SUCCESS;
+  size_t sub_offset = 0;
+
+  if(mem->type == CL_MEM_SUBBUFFER_TYPE) {
+    struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+    sub_offset = buffer->sub_offset;
+  }
+
   if (!(*ptr = cl_mem_map_gtt_unsync(mem))) {
     err = CL_MAP_FAILURE;
     goto error;
   }
-  *ptr = (char*)(*ptr) + offset;
+  *ptr = (char*)(*ptr) + offset + sub_offset;
   if(mem->flags & CL_MEM_USE_HOST_PTR) {
     assert(mem->host_ptr);
     //only calc ptr here, will do memcpy in enqueue
-    *mem_ptr = mem->host_ptr + offset;
+    *mem_ptr = mem->host_ptr + offset + sub_offset;
   } else {
     *mem_ptr = *ptr;
   }
@@ -2160,7 +2266,7 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_MAP_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
@@ -2256,7 +2362,7 @@ clEnqueueMapImage(cl_command_queue   command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_MAP_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
@@ -2293,7 +2399,7 @@ clEnqueueUnmapMemObject(cl_command_queue  command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_UNMAP_MEM_OBJECT) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
@@ -2336,7 +2442,7 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
 
   if (global_work_offset != NULL)
     for (i = 0; i < work_dim; ++i) {
-      if (UNLIKELY(~0LL - global_work_offset[i] > global_work_size[i])) {
+      if (UNLIKELY(global_work_offset[i] + global_work_size[i] > (size_t)-1)) {
         err = CL_INVALID_GLOBAL_OFFSET;
         goto error;
       }
@@ -2357,14 +2463,29 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
     goto error;
   }
 
+
   /* XXX No event right now */
   //FATAL_IF(num_events_in_wait_list > 0, "Events are not supported");
   //FATAL_IF(event_wait_list != NULL, "Events are not supported");
   //FATAL_IF(event != NULL, "Events are not supported");
 
-  if (local_work_size != NULL)
+  if (local_work_size != NULL) {
     for (i = 0; i < work_dim; ++i)
       fixed_local_sz[i] = local_work_size[i];
+  } else {
+    uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may be too large
+    for (i = 0; i< work_dim; i++) {
+      for (j = maxDimSize; j > 1; j--) {
+        if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
+          fixed_local_sz[i] = j;
+          maxGroupSize = maxGroupSize /j;
+          maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
+          break;  //choose next work_dim
+        }
+      }
+    }
+  }
+
   if (global_work_size != NULL)
     for (i = 0; i < work_dim; ++i)
       fixed_global_sz[i] = global_work_size[i];
@@ -2372,6 +2493,16 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
     for (i = 0; i < work_dim; ++i)
       fixed_global_off[i] = global_work_offset[i];
 
+  if (kernel->compile_wg_sz[0] || kernel->compile_wg_sz[1] || kernel->compile_wg_sz[2]) {
+    if (fixed_local_sz[0] != kernel->compile_wg_sz[0]
+        || fixed_local_sz[1] != kernel->compile_wg_sz[1]
+        || fixed_local_sz[2] != kernel->compile_wg_sz[2])
+    {
+        err = CL_INVALID_WORK_GROUP_SIZE;
+        goto error;
+    }
+  }
+
   /* Do device specific checks are enqueue the kernel */
   err = cl_command_queue_ND_range(command_queue,
                                   kernel,
@@ -2388,6 +2519,11 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_NDRANGE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
     err = cl_command_queue_flush(command_queue);
   }
 
@@ -2467,7 +2603,7 @@ clEnqueueNativeKernel(cl_command_queue   command_queue,
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_NATIVE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
-    err = cl_enqueue_handle(data);
+    err = cl_enqueue_handle(event ? *event : NULL, data);
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   }
 
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 13789f6..4ac2e11 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -24,6 +24,7 @@
 #include "cl_device_id.h"
 #include "cl_mem.h"
 #include "cl_utils.h"
+#include "cl_thread.h"
 #include "cl_alloc.h"
 #include "cl_driver.h"
 #include "cl_khr_icd.h"
@@ -43,7 +44,9 @@ cl_command_queue_new(cl_context ctx)
   queue->magic = CL_MAGIC_QUEUE_HEADER;
   queue->ref_n = 1;
   queue->ctx = ctx;
-  TRY_ALLOC_NO_ERR (queue->gpgpu, cl_gpgpu_new(ctx->drv));
+  if ((queue->thread_data = cl_thread_data_create()) == NULL) {
+    goto error;
+  }
 
   /* Append the command queue in the list */
   pthread_mutex_lock(&ctx->queue_lock);
@@ -77,16 +80,18 @@ cl_command_queue_delete(cl_command_queue queue)
       queue->prev->next = queue->next;
     if (queue->next)
       queue->next->prev = queue->prev;
-    if (queue->next == NULL && queue->prev == NULL)
-      queue->ctx->queues = NULL;
+    if (queue->ctx->queues == queue)
+      queue->ctx->queues = queue->next;
   pthread_mutex_unlock(&queue->ctx->queue_lock);
   if (queue->fulsim_out != NULL) {
     cl_mem_delete(queue->fulsim_out);
     queue->fulsim_out = NULL;
   }
+
+  cl_thread_data_destroy(queue->thread_data);
+  queue->thread_data = NULL;
   cl_mem_delete(queue->perf);
   cl_context_delete(queue->ctx);
-  cl_gpgpu_delete(queue->gpgpu);
   cl_free(queue->wait_events);
   queue->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
   cl_free(queue);
@@ -119,13 +124,15 @@ LOCAL cl_int
 cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
 {
   uint32_t i;
+  GET_QUEUE_THREAD_GPGPU(queue);
+
   for (i = 0; i < k->image_sz; i++) {
     int id = k->images[i].arg_idx;
     struct _cl_mem_image *image;
     assert(gbe_kernel_get_arg_type(k->opaque, id) == GBE_ARG_IMAGE);
     image = cl_mem_image(k->args[id].mem);
     set_image_info(k->curbe, &k->images[i], image);
-    cl_gpgpu_bind_image(queue->gpgpu, k->images[i].idx, image->base.bo, image->offset,
+    cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset,
                         image->intel_fmt, image->image_type,
                         image->w, image->h, image->depth,
                         image->row_pitch, image->tiling);
@@ -136,6 +143,8 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
 LOCAL cl_int
 cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
 {
+  GET_QUEUE_THREAD_GPGPU(queue);
+
   /* Bind all user buffers (given by clSetKernelArg) */
   uint32_t i;
   enum gbe_arg_type arg_type; /* kind of argument */
@@ -145,7 +154,12 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
     if (arg_type != GBE_ARG_GLOBAL_PTR || !k->args[i].mem)
       continue;
     offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
-    cl_gpgpu_bind_buf(queue->gpgpu, k->args[i].mem->bo, offset, cc_llc_l3);
+    if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) {
+      struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem;
+      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, cc_llc_l3);
+    } else {
+      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, cc_llc_l3);
+    }
   }
 
   return CL_SUCCESS;
@@ -402,14 +416,18 @@ error:
 LOCAL cl_int
 cl_command_queue_flush(cl_command_queue queue)
 {
-  cl_gpgpu_flush(queue->gpgpu);
+  GET_QUEUE_THREAD_GPGPU(queue);
+
+  cl_gpgpu_flush(gpgpu);
+
+  cl_invalid_thread_gpgpu(queue);
   return CL_SUCCESS;
 }
 
 LOCAL cl_int
 cl_command_queue_finish(cl_command_queue queue)
 {
-  cl_gpgpu_sync(queue->gpgpu);
+  cl_gpgpu_sync(cl_get_thread_batch_buf());
   return CL_SUCCESS;
 }
 
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index 9396fd7..40c272c 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -22,6 +22,7 @@
 
 #include "cl_internals.h"
 #include "cl_driver.h"
+#include "cl_thread.h"
 #include "CL/cl.h"
 #include <stdint.h>
 
@@ -40,11 +41,17 @@ struct _cl_command_queue {
   cl_event  last_event;                /* The last event in the queue, for enqueue mark used */
   cl_command_queue_properties  props;  /* Queue properties */
   cl_command_queue prev, next;         /* We chain the command queues together */
-  cl_gpgpu gpgpu;                      /* Setup all GEN commands */
+  void *thread_data;                   /* Used to store thread context data */
   cl_mem perf;                         /* Where to put the perf counters */
   cl_mem fulsim_out;                   /* Fulsim will output this buffer */
 };
 
+/* The macro to get the thread-specific gpgpu struct. */
+#define GET_QUEUE_THREAD_GPGPU(queue) \
+	cl_gpgpu gpgpu = queue ? cl_get_thread_gpgpu(queue) : NULL;  \
+	if (queue) \
+	  assert(gpgpu);
+
 /* Allocate and initialize a new command queue. Also insert it in the list of
  * command queue in the associated context
  */
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 65f8e17..ba69589 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -98,36 +98,47 @@ error:
 static void
 cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
 {
-  /* calculate constant buffer size */
+  /* calculate constant buffer size
+   * we need raw_size & aligned_size
+   */
+  GET_QUEUE_THREAD_GPGPU(queue);
   int32_t arg;
-  size_t offset;
+  size_t offset = 0;
+  uint32_t raw_size = 0, aligned_size =0;
   gbe_program prog = ker->program->opaque;
   const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
   size_t global_const_size = gbe_program_get_global_constant_size(prog);
-  uint32_t constant_buf_size = 0;
+  aligned_size = raw_size = global_const_size;
+  /* Reserve 8 bytes to get rid of 0 address */
+  if(global_const_size == 0) aligned_size = 8;
+
   for (arg = 0; arg < arg_n; ++arg) {
     const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
     if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
+      uint32_t alignment = gbe_kernel_get_arg_align(ker->opaque, arg);
+      assert(alignment != 0);
       cl_mem mem = ker->args[arg].mem;
-      constant_buf_size += ALIGN(mem->size, 4);
+      raw_size += mem->size;
+      aligned_size = ALIGN(aligned_size, alignment);
+      aligned_size += mem->size;
     }
   }
-  if(global_const_size == 0 && constant_buf_size == 0)
+  if(raw_size == 0)
      return;
 
-  cl_buffer bo = cl_gpgpu_alloc_constant_buffer(queue->gpgpu, constant_buf_size + global_const_size + 4);
+  cl_buffer bo = cl_gpgpu_alloc_constant_buffer(gpgpu, aligned_size);
   cl_buffer_map(bo, 1);
   char * cst_addr = cl_buffer_get_virtual(bo);
-  offset = 0;
+
+  /* upload the global constant data */
   if (global_const_size > 0) {
-    /* Write the global constant arrays */
     gbe_program_get_global_constant_data(prog, (char*)(cst_addr+offset));
+    offset += global_const_size;
   }
-  offset += ALIGN(global_const_size, 4);
 
+  /* reserve 8 bytes to get rid of 0 address */
   if(global_const_size == 0) {
-    /* reserve 4 bytes to get rid of 0 address */
-    offset += 4;
+    offset = 8;
   }
 
   /* upload constant buffer argument */
@@ -136,7 +147,8 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
     const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
     if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
       cl_mem mem = ker->args[arg].mem;
-
+      uint32_t alignment = gbe_kernel_get_arg_align(ker->opaque, arg);
+      offset = ALIGN(offset, alignment);
       curbe_offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
       assert(curbe_offset >= 0);
       *(uint32_t *) (ker->curbe + curbe_offset) = offset;
@@ -145,7 +157,7 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
       void * addr = cl_buffer_get_virtual(mem->bo);
       memcpy(cst_addr + offset, addr, mem->size);
       cl_buffer_unmap(mem->bo);
-      offset += ALIGN(mem->size, 4);
+      offset += mem->size;
     }
   }
   cl_buffer_unmap(bo);
@@ -200,19 +212,22 @@ cl_curbe_fill(cl_kernel ker,
   }
   /* Handle the various offsets to SLM */
   const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
-  /* align so that we kernel argument get good alignment */
-  int32_t arg, slm_offset = ALIGN(gbe_kernel_get_slm_size(ker->opaque), 32);
+  int32_t arg, slm_offset = gbe_kernel_get_slm_size(ker->opaque);
+  ker->local_mem_sz = 0;
   for (arg = 0; arg < arg_n; ++arg) {
     const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
     if (type != GBE_ARG_LOCAL_PTR)
       continue;
+    uint32_t align = gbe_kernel_get_arg_align(ker->opaque, arg);
+    assert(align != 0);
+    slm_offset = ALIGN(slm_offset, align);
     offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
     assert(offset >= 0);
     uint32_t *slmptr = (uint32_t *) (ker->curbe + offset);
     *slmptr = slm_offset;
     slm_offset += ker->args[arg].local_sz;
+    ker->local_mem_sz += ker->args[arg].local_sz;
   }
-
   return slm_offset;
 }
 
@@ -221,7 +236,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
 {
   cl_context ctx = ker->program->ctx;
   cl_device_id device = ctx->device;
-  const int32_t per_lane_stack_sz = gbe_kernel_get_stack_size(ker->opaque);
+  const int32_t per_lane_stack_sz = ker->stack_size;
   const int32_t value = GBE_CURBE_EXTRA_ARGUMENT;
   const int32_t sub_value = GBE_STACK_BUFFER;
   const int32_t offset = gbe_kernel_get_curbe_offset(ker->opaque, value, sub_value);
@@ -256,8 +271,8 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
                                const size_t *global_wk_sz,
                                const size_t *local_wk_sz)
 {
+  GET_QUEUE_THREAD_GPGPU(queue);
   cl_context ctx = queue->ctx;
-  cl_gpgpu gpgpu = queue->gpgpu;
   char *final_curbe = NULL;  /* Includes them and one sub-buffer per group */
   cl_gpgpu_kernel kernel;
   const uint32_t simd_sz = cl_kernel_get_simd_width(ker);
@@ -297,7 +312,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Bind user images */
   cl_command_queue_bind_image(queue, ker);
   /* Bind all samplers */
-  cl_gpgpu_bind_sampler(queue->gpgpu, ker->samplers, ker->sampler_sz);
+  cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
 
   cl_setup_scratch(gpgpu, ker);
   /* Bind a stack if needed */
@@ -321,6 +336,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Start a new batch buffer */
   batch_sz = cl_kernel_compute_batch_sz(ker);
   cl_gpgpu_batch_reset(gpgpu, batch_sz);
+  cl_set_thread_batch_buf(cl_gpgpu_ref_batch_buf(gpgpu));
   cl_gpgpu_batch_start(gpgpu);
 
   /* Issue the GPGPU_WALKER command */
diff --git a/src/cl_context.c b/src/cl_context.c
index b62e946..8190e6a 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -104,6 +104,7 @@ cl_context_properties_process(const cl_context_properties *prop,
     prop += 2;
     *prop_len += 2;
   }
+  (*prop_len)++;
 exit:
 error:
   return err;
@@ -202,6 +203,7 @@ cl_context_delete(cl_context ctx)
   assert(ctx->buffers == NULL);
   assert(ctx->drv);
   cl_free(ctx->prop_user);
+  cl_set_thread_batch_buf(NULL);
   cl_driver_delete(ctx->drv);
   ctx->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
   cl_free(ctx);
diff --git a/src/cl_device_data.h b/src/cl_device_data.h
index e794739..9c18406 100644
--- a/src/cl_device_data.h
+++ b/src/cl_device_data.h
@@ -80,15 +80,67 @@
 #define IS_IVYBRIDGE(devid) (IS_IVB_GT1(devid) || IS_IVB_GT2(devid))
 #define IS_GEN7(devid)      IS_IVYBRIDGE(devid)
 
-#define PCI_CHIP_HASWELL_M0          0x0094
-#define PCI_CHIP_HASWELL_D0          0x0090
-#define PCI_CHIP_HASWELL_M           0x0091
-#define PCI_CHIP_HASWELL_L           0x0092
-
-#define IS_HASWELL(devid) ((devid) == PCI_CHIP_HASWELL_M0 || \
-                           (devid) == PCI_CHIP_HASWELL_D0 || \
-                           (devid) == PCI_CHIP_HASWELL_M  || \
-                           (devid) == PCI_CHIP_HASWELL_L)
+
+#define PCI_CHIP_HASWELL_D1          0x0402 /* GT1 desktop */
+#define PCI_CHIP_HASWELL_D2          0x0412 /* GT2 desktop */
+#define PCI_CHIP_HASWELL_D3          0x0422 /* GT3 desktop */
+#define PCI_CHIP_HASWELL_S1          0x040a /* GT1 server */
+#define PCI_CHIP_HASWELL_S2          0x041a /* GT2 server */
+#define PCI_CHIP_HASWELL_S3          0x042a /* GT3 server */
+#define PCI_CHIP_HASWELL_M1          0x0406 /* GT1 mobile */
+#define PCI_CHIP_HASWELL_M2          0x0416 /* GT2 mobile */
+#define PCI_CHIP_HASWELL_M3          0x0426 /* GT3 mobile */
+/* Software Development Vehicle devices. */
+#define PCI_CHIP_HASWELL_SDV_D1      0x0C02 /* SDV GT1 desktop */
+#define PCI_CHIP_HASWELL_SDV_D2      0x0C12 /* SDV GT2 desktop */
+#define PCI_CHIP_HASWELL_SDV_D3      0x0C22 /* SDV GT3 desktop */
+#define PCI_CHIP_HASWELL_SDV_S1      0x0C0A /* SDV GT1 server */
+#define PCI_CHIP_HASWELL_SDV_S2      0x0C1A /* SDV GT2 server */
+#define PCI_CHIP_HASWELL_SDV_S3      0x0C2A /* SDV GT3 server */
+#define PCI_CHIP_HASWELL_SDV_M1      0x0C06 /* SDV GT1 mobile */
+#define PCI_CHIP_HASWELL_SDV_M2      0x0C16 /* SDV GT2 mobile */
+#define PCI_CHIP_HASWELL_SDV_M3      0x0C26 /* SDV GT3 mobile */
+/* Ultrabooks */
+#define PCI_CHIP_HASWELL_ULT_D1      0x0A02 /* ULT GT1 desktop */
+#define PCI_CHIP_HASWELL_ULT_D2      0x0A12 /* ULT GT2 desktop */
+#define PCI_CHIP_HASWELL_ULT_D3      0x0A22 /* ULT GT3 desktop */
+#define PCI_CHIP_HASWELL_ULT_S1      0x0A0A /* ULT GT1 server */
+#define PCI_CHIP_HASWELL_ULT_S2      0x0A1A /* ULT GT2 server */
+#define PCI_CHIP_HASWELL_ULT_S3      0x0A2A /* ULT GT3 server */
+#define PCI_CHIP_HASWELL_ULT_M1      0x0A06 /* ULT GT1 mobile */
+#define PCI_CHIP_HASWELL_ULT_M2      0x0A16 /* ULT GT2 mobile */
+#define PCI_CHIP_HASWELL_ULT_M3      0x0A26 /* ULT GT3 mobile */
+/* CRW */
+#define PCI_CHIP_HASWELL_CRW_D1      0x0D02 /* CRW GT1 desktop */
+#define PCI_CHIP_HASWELL_CRW_D2      0x0D12 /* CRW GT2 desktop */
+#define PCI_CHIP_HASWELL_CRW_D3      0x0D22 /* CRW GT3 desktop */
+#define PCI_CHIP_HASWELL_CRW_S1      0x0D0A /* CRW GT1 server */
+#define PCI_CHIP_HASWELL_CRW_S2      0x0D1A /* CRW GT2 server */
+#define PCI_CHIP_HASWELL_CRW_S3      0x0D2A /* CRW GT3 server */
+#define PCI_CHIP_HASWELL_CRW_M1      0x0D06 /* CRW GT1 mobile */
+#define PCI_CHIP_HASWELL_CRW_M2      0x0D16 /* CRW GT2 mobile */
+#define PCI_CHIP_HASWELL_CRW_M3      0x0D26 /* CRW GT3 mobile */
+
+#define IS_HASWELL(devid) (  \
+	(devid) == PCI_CHIP_HASWELL_D1 || (devid) == PCI_CHIP_HASWELL_D2 || \
+	(devid) == PCI_CHIP_HASWELL_D3 || (devid) == PCI_CHIP_HASWELL_S1 || \
+	(devid) == PCI_CHIP_HASWELL_S2 || (devid) == PCI_CHIP_HASWELL_S3 || \
+	(devid) == PCI_CHIP_HASWELL_M1 || (devid) == PCI_CHIP_HASWELL_M2 || \
+	(devid) == PCI_CHIP_HASWELL_M3 || (devid) == PCI_CHIP_HASWELL_SDV_D1 || \
+	(devid) == PCI_CHIP_HASWELL_SDV_D2 || (devid) == PCI_CHIP_HASWELL_SDV_D3 || \
+	(devid) == PCI_CHIP_HASWELL_SDV_S1 || (devid) == PCI_CHIP_HASWELL_SDV_S2 || \
+	(devid) == PCI_CHIP_HASWELL_SDV_S3 || (devid) == PCI_CHIP_HASWELL_SDV_M1 || \
+	(devid) == PCI_CHIP_HASWELL_SDV_M2 || (devid) == PCI_CHIP_HASWELL_SDV_M3 || \
+	(devid) == PCI_CHIP_HASWELL_ULT_D1 || (devid) == PCI_CHIP_HASWELL_ULT_D2 || \
+	(devid) == PCI_CHIP_HASWELL_ULT_D3 || (devid) == PCI_CHIP_HASWELL_ULT_S1 || \
+	(devid) == PCI_CHIP_HASWELL_ULT_S2 || (devid) == PCI_CHIP_HASWELL_ULT_S3 || \
+	(devid) == PCI_CHIP_HASWELL_ULT_M1 || (devid) == PCI_CHIP_HASWELL_ULT_M2 || \
+	(devid) == PCI_CHIP_HASWELL_ULT_M3 || (devid) == PCI_CHIP_HASWELL_CRW_D1 || \
+	(devid) == PCI_CHIP_HASWELL_CRW_D2 || (devid) == PCI_CHIP_HASWELL_CRW_D3 || \
+	(devid) == PCI_CHIP_HASWELL_CRW_S1 || (devid) == PCI_CHIP_HASWELL_CRW_S2 || \
+	(devid) == PCI_CHIP_HASWELL_CRW_S3 || (devid) == PCI_CHIP_HASWELL_CRW_M1 || \
+	(devid) == PCI_CHIP_HASWELL_CRW_M2 || (devid) == PCI_CHIP_HASWELL_CRW_M3)
+
 #define IS_GEN75(devid)  IS_HASWELL(devid)
 
 #endif /* __CL_DEVICE_DATA_H__ */
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 16b343d..0426738 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -24,6 +24,7 @@
 #include "cl_driver.h"
 #include "cl_device_data.h"
 #include "cl_khr_icd.h"
+#include "cl_thread.h"
 #include "CL/cl.h"
 
 #include <assert.h>
@@ -42,7 +43,6 @@ static struct _cl_device_id intel_ivb_gt2_device = {
   .max_work_group_size = 1024,
   .max_clock_frequency = 1000,
   .wg_sz = 1024,
-  .compile_wg_sz = {0},	
 #include "cl_gen7_device.h"
 };
 
@@ -54,7 +54,6 @@ static struct _cl_device_id intel_ivb_gt1_device = {
   .max_work_group_size = 512,
   .max_clock_frequency = 1000,
   .wg_sz = 512,
-  .compile_wg_sz = {0},	
 #include "cl_gen7_device.h"
 };
 
@@ -67,7 +66,6 @@ static struct _cl_device_id intel_hsw_device = {
   .max_work_group_size = 512,
   .max_clock_frequency = 1000,
   .wg_sz = 512,
-  .compile_wg_sz = {0},	
 #include "cl_gen75_device.h"
 };
 
@@ -75,30 +73,120 @@ LOCAL cl_device_id
 cl_get_gt_device(void)
 {
   cl_device_id ret = NULL;
+  cl_set_thread_batch_buf(NULL);
   const int device_id = cl_driver_get_device_id();
 
-  /* XXX we pick IVB for HSW now */
-  if (device_id == PCI_CHIP_HASWELL_M   ||
-      device_id == PCI_CHIP_HASWELL_L   ||
-      device_id == PCI_CHIP_HASWELL_M0  ||
-      device_id == PCI_CHIP_HASWELL_D0) {
-    intel_hsw_device.vendor_id = device_id;
-    intel_hsw_device.platform = intel_platform;
-    ret = &intel_hsw_device;
-  }
-  else if (device_id == PCI_CHIP_IVYBRIDGE_GT1   ||
-           device_id == PCI_CHIP_IVYBRIDGE_M_GT1 ||
-           device_id == PCI_CHIP_IVYBRIDGE_S_GT1) {
-    intel_ivb_gt1_device.vendor_id = device_id;
-    intel_ivb_gt1_device.platform = intel_platform;
-    ret = &intel_ivb_gt1_device;
-  }
-  else if (device_id == PCI_CHIP_IVYBRIDGE_GT2   ||
-           device_id == PCI_CHIP_IVYBRIDGE_M_GT2) {
-    intel_ivb_gt2_device.vendor_id = device_id;
-    intel_ivb_gt2_device.platform = intel_platform;
-    ret = &intel_ivb_gt2_device;
+#define DECL_INFO_STRING(BREAK, STRUCT, FIELD, STRING) \
+    STRUCT.FIELD = STRING; \
+    STRUCT.JOIN(FIELD,_sz) = sizeof(STRING); \
+    goto BREAK;
+
+  switch (device_id) {
+    case PCI_CHIP_HASWELL_D1:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_D2:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_D3:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_S1:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_S2:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_S3:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_M1:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_M2:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_M3:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_SDV_D1:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_SDV_D2:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_SDV_D3:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_SDV_S1:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_SDV_S2:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_SDV_S3:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_SDV_M1:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_SDV_M2:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_SDV_M3:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_ULT_D1:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_ULT_D2:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_ULT_D3:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_ULT_S1:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_ULT_S2:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_ULT_S3:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_ULT_M1:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_ULT_M2:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_ULT_M3:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+	/* CRW */
+    case PCI_CHIP_HASWELL_CRW_D1:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_CRW_D2:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_CRW_D3:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_CRW_S1:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_CRW_S2:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_CRW_S3:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_CRW_M1:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_CRW_M2:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+    case PCI_CHIP_HASWELL_CRW_M3:
+      DECL_INFO_STRING(has_break, intel_hsw_device, name, "Intel(R) HD Graphics Haswell M");
+has_break:
+      intel_hsw_device.vendor_id = device_id;
+      intel_hsw_device.platform = intel_platform;
+      ret = &intel_hsw_device;
+      break;
+
+    case PCI_CHIP_IVYBRIDGE_GT1:
+      DECL_INFO_STRING(ivb_gt1_break, intel_ivb_gt1_device, name, "Intel(R) HD Graphics IvyBridge GT1");
+    case PCI_CHIP_IVYBRIDGE_M_GT1:
+      DECL_INFO_STRING(ivb_gt1_break, intel_ivb_gt1_device, name, "Intel(R) HD Graphics IvyBridge M GT1");
+    case PCI_CHIP_IVYBRIDGE_S_GT1:
+      DECL_INFO_STRING(ivb_gt1_break, intel_ivb_gt1_device, name, "Intel(R) HD Graphics IvyBridge S GT1");
+ivb_gt1_break:
+      intel_ivb_gt1_device.vendor_id = device_id;
+      intel_ivb_gt1_device.platform = intel_platform;
+      ret = &intel_ivb_gt1_device;
+      break;
+
+    case PCI_CHIP_IVYBRIDGE_GT2:
+      DECL_INFO_STRING(ivb_gt2_break, intel_ivb_gt2_device, name, "Intel(R) HD Graphics IvyBridge GT2");
+    case PCI_CHIP_IVYBRIDGE_M_GT2:
+      DECL_INFO_STRING(ivb_gt2_break, intel_ivb_gt2_device, name, "Intel(R) HD Graphics IvyBridge M GT2");
+ivb_gt2_break:
+      intel_ivb_gt2_device.vendor_id = device_id;
+      intel_ivb_gt2_device.platform = intel_platform;
+      ret = &intel_ivb_gt2_device;
+      break;
+    default: /* Unknown PCI id: report it and fall through to "return ret" with ret still NULL. */
+      fprintf(stderr, "cl_get_gt_device(): error, unknown device: 0x%x\n", (unsigned)device_id);
+      break;
   }
+
   return ret;
 }
 
@@ -262,34 +350,49 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
 }
 #undef DECL_FIELD
 
-#define DECL_FIELD(CASE,FIELD)                                      \
-  case JOIN(CL_KERNEL_,CASE):                                       \
-      if (param_value_size < sizeof(((cl_device_id)NULL)->FIELD))   \
-        return CL_INVALID_VALUE;                                    \
-      if (param_value_size_ret != NULL)                             \
-        *param_value_size_ret = sizeof(((cl_device_id)NULL)->FIELD);\
-      memcpy(param_value,                                           \
-             &device->FIELD,                                        \
-             sizeof(((cl_device_id)NULL)->FIELD));                  \
+#define _DECL_FIELD(FIELD)                                 \
+      if (param_value && param_value_size < sizeof(FIELD)) \
+        return CL_INVALID_VALUE;                           \
+      if (param_value_size_ret != NULL)                    \
+        *param_value_size_ret = sizeof(FIELD);             \
+      if (param_value)                                     \
+        memcpy(param_value, &FIELD, sizeof(FIELD));        \
         return CL_SUCCESS;
 
+#define DECL_FIELD(CASE,FIELD)                             \
+  case JOIN(CL_KERNEL_,CASE):                              \
+  _DECL_FIELD(FIELD)
+
+#include "cl_kernel.h"
 LOCAL cl_int
-cl_get_kernel_workgroup_info(cl_device_id device,
+cl_get_kernel_workgroup_info(cl_kernel kernel,
+                             cl_device_id device,
                              cl_kernel_work_group_info param_name,
                              size_t param_value_size,
                              void* param_value,
                              size_t* param_value_size_ret)
 {
+  int err = CL_SUCCESS;
   if (UNLIKELY(device != &intel_ivb_gt1_device &&
                device != &intel_ivb_gt2_device))
     return CL_INVALID_DEVICE;
-  if (UNLIKELY(param_value == NULL))
-    return CL_INVALID_VALUE;
 
+  CHECK_KERNEL(kernel);
   switch (param_name) {
-    DECL_FIELD(WORK_GROUP_SIZE, wg_sz)
-      DECL_FIELD(COMPILE_WORK_GROUP_SIZE, compile_wg_sz)
-    default: return CL_INVALID_VALUE;
+    DECL_FIELD(WORK_GROUP_SIZE, device->wg_sz)
+    DECL_FIELD(PREFERRED_WORK_GROUP_SIZE_MULTIPLE, device->preferred_wg_sz_mul)
+    case CL_KERNEL_LOCAL_MEM_SIZE:
+      {
+        size_t local_mem_sz =  gbe_kernel_get_slm_size(kernel->opaque) + kernel->local_mem_sz;
+        _DECL_FIELD(local_mem_sz)
+      }
+    DECL_FIELD(COMPILE_WORK_GROUP_SIZE, kernel->compile_wg_sz)
+    DECL_FIELD(PRIVATE_MEM_SIZE, kernel->stack_size)
+    default:
+      return CL_INVALID_VALUE;
   };
+
+error:
+  return err;
 }
 
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index 1beff92..4ece26c 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -95,7 +95,7 @@ struct _cl_device_id {
   size_t built_in_kernels_sz;
   /* Kernel specific info that we're assigning statically */
   size_t wg_sz;
-  size_t compile_wg_sz[3];
+  size_t preferred_wg_sz_mul;
 };
 
 /* Get a device from the given platform */
@@ -115,11 +115,12 @@ extern cl_int cl_get_device_info(cl_device_id     device,
                                  void *           param_value,
                                  size_t *         param_value_size_ret);
 
-extern cl_int cl_get_kernel_workgroup_info(cl_device_id     device,
-                                 cl_kernel_work_group_info   param_name,
-                                 size_t           param_value_size,
-                                 void *           param_value,
-                                 size_t *         param_value_size_ret);
+extern cl_int cl_get_kernel_workgroup_info(cl_kernel kernel,
+                                           cl_device_id     device,
+                                           cl_kernel_work_group_info   param_name,
+                                           size_t           param_value_size,
+                                           void *           param_value,
+                                           size_t *         param_value_size_ret);
 /* Returns the Gen device ID */
 extern cl_int cl_device_get_version(cl_device_id device, cl_int *ver);
 
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 0e9b487..96fc377 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -24,7 +24,7 @@
 #include <stdlib.h>
 #include "cl_driver_type.h"
 /* Various limitations we should remove actually */
-#define GEN_MAX_SURFACES 128
+#define GEN_MAX_SURFACES 256
 #define GEN_MAX_SAMPLERS 16
 
 /**************************************************************************
@@ -95,11 +95,11 @@ typedef void (cl_gpgpu_delete_cb)(cl_gpgpu);
 extern cl_gpgpu_delete_cb *cl_gpgpu_delete;
 
 /* Synchonize GPU with CPU */
-typedef cl_gpgpu (cl_gpgpu_sync_cb)(cl_gpgpu);
+typedef void (cl_gpgpu_sync_cb)(void*);
 extern cl_gpgpu_sync_cb *cl_gpgpu_sync;
 
 /* Bind a regular unformatted buffer */
-typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t cchint);
+typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t internal_offset, uint32_t cchint);
 extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
 
 /* bind samplers defined in both kernel and kernel args. */
@@ -193,8 +193,20 @@ typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event);
 extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete;
 
 /* Get a event time stamp */
-typedef void (cl_gpgpu_event_get_timestamp_cb)(cl_gpgpu_event, int, uint64_t*);
-extern cl_gpgpu_event_get_timestamp_cb *cl_gpgpu_event_get_timestamp;
+typedef void (cl_gpgpu_event_get_exec_timestamp_cb)(cl_gpgpu_event, int, uint64_t*);
+extern cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp;
+
+/* Get current GPU time stamp */
+typedef void (cl_gpgpu_event_get_gpu_cur_timestamp_cb)(cl_gpgpu, uint64_t*);
+extern cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp;
+
+/* Get current batch buffer handle */
+typedef void* (cl_gpgpu_ref_batch_buf_cb)(cl_gpgpu);
+extern cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf;
+
+/* Release (unreference) a batch buffer handle */
+typedef void (cl_gpgpu_unref_batch_buf_cb)(void*);
+extern cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf;
 
 /* Will spawn all threads */
 typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 54fa62e..0a9012c 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -80,5 +80,8 @@ LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL;
 LOCAL cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending = NULL;
 LOCAL cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume = NULL;
 LOCAL cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete = NULL;
-LOCAL cl_gpgpu_event_get_timestamp_cb *cl_gpgpu_event_get_timestamp = NULL;
+LOCAL cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp = NULL;
+LOCAL cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp = NULL;
+LOCAL cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf = NULL;
+LOCAL cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf = NULL;
 
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 0330691..330d230 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -16,28 +16,34 @@
  *
  * Author: Rong Yang <rong.r.yang at intel.com>
  */
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <pthread.h>
 
 #include "cl_enqueue.h"
 #include "cl_image.h"
 #include "cl_driver.h"
+#include "cl_event.h"
+#include "cl_command_queue.h"
 #include "cl_utils.h"
 
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <pthread.h>
 
 cl_int cl_enqueue_read_buffer(enqueue_data* data)
 {
   cl_int err = CL_SUCCESS;
+  cl_mem mem = data->mem_obj;
+  assert(mem->type == CL_MEM_BUFFER_TYPE ||
+         mem->type == CL_MEM_SUBBUFFER_TYPE);
   void* src_ptr;
+  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
 
   if (!(src_ptr = cl_mem_map_auto(data->mem_obj))) {
     err = CL_MAP_FAILURE;
     goto error;
   }
 
-  memcpy(data->ptr, (char*)src_ptr + data->offset, data->size);
+  memcpy(data->ptr, (char*)src_ptr + data->offset + buffer->sub_offset, data->size);
 
   err = cl_mem_unmap_auto(data->mem_obj);
 
@@ -95,6 +101,10 @@ error:
 cl_int cl_enqueue_write_buffer(enqueue_data *data)
 {
   cl_int err = CL_SUCCESS;
+  cl_mem mem = data->mem_obj;
+  assert(mem->type == CL_MEM_BUFFER_TYPE ||
+         mem->type == CL_MEM_SUBBUFFER_TYPE);
+  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
   void* dst_ptr;
 
   if (!(dst_ptr = cl_mem_map_auto(data->mem_obj))) {
@@ -102,7 +112,7 @@ cl_int cl_enqueue_write_buffer(enqueue_data *data)
     goto error;
   }
 
-  memcpy((char*)dst_ptr + data->offset, data->const_ptr, data->size);
+  memcpy((char*)dst_ptr + data->offset + buffer->sub_offset, data->const_ptr, data->size);
 
   err = cl_mem_unmap_auto(data->mem_obj);
 
@@ -231,19 +241,23 @@ cl_int cl_enqueue_map_buffer(enqueue_data *data)
 {
   void *ptr = NULL;
   cl_int err = CL_SUCCESS;
-  cl_mem buffer = data->mem_obj;
+  cl_mem mem = data->mem_obj;
+  assert(mem->type == CL_MEM_BUFFER_TYPE ||
+         mem->type == CL_MEM_SUBBUFFER_TYPE);
+  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+
   //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here
-  if (!(ptr = cl_mem_map_gtt(buffer))) {
+  if (!(ptr = cl_mem_map_gtt(mem))) {
     err = CL_MAP_FAILURE;
     goto error;
   }
 
-  ptr = (char*)ptr + data->offset;
+  ptr = (char*)ptr + data->offset + buffer->sub_offset;
   assert(data->ptr == ptr);
 
-  if(buffer->flags & CL_MEM_USE_HOST_PTR) {
-    assert(buffer->host_ptr);
-    memcpy(buffer->host_ptr + data->offset, ptr, data->size);
+  if(mem->flags & CL_MEM_USE_HOST_PTR) {
+    assert(mem->host_ptr);
+    memcpy(mem->host_ptr + data->offset, ptr, data->size);
   }
 
 error:
@@ -364,8 +378,15 @@ cl_int cl_enqueue_native_kernel(enqueue_data *data)
 error:
   return err;
 }
-cl_int cl_enqueue_handle(enqueue_data* data)
+
+cl_int cl_enqueue_handle(cl_event event, enqueue_data* data)
 {
+  /* For non-user events whose queue has profiling enabled, record the submit timestamp here. */
+  if (event && event->type != CL_COMMAND_USER
+           && event->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+    cl_event_get_timestamp(event, CL_PROFILING_COMMAND_SUBMIT);
+  }
+
   switch(data->type) {
     case EnqueueReadBuffer:
       return cl_enqueue_read_buffer(data);
@@ -386,6 +407,7 @@ cl_int cl_enqueue_handle(enqueue_data* data)
     case EnqueueUnmapMemObject:
       return cl_enqueue_unmap_mem_object(data);
     case EnqueueCopyBufferRect:
+    case EnqueueCopyBuffer:
     case EnqueueCopyImage:
     case EnqueueCopyBufferToImage:
     case EnqueueCopyImageToBuffer:
diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h
index b412d58..1d3ae5f 100644
--- a/src/cl_enqueue.h
+++ b/src/cl_enqueue.h
@@ -64,5 +64,5 @@ typedef struct _enqueue_data {
 } enqueue_data;
 
 /* Do real enqueue commands */
-cl_int cl_enqueue_handle(enqueue_data* data);
+cl_int cl_enqueue_handle(cl_event event, enqueue_data* data);
 #endif /* __CL_ENQUEUE_H__ */
diff --git a/src/cl_event.c b/src/cl_event.c
index 212f1ee..f838a3a 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -48,6 +48,7 @@ cl_event_is_gpu_command_type(cl_command_type type)
 cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type, cl_bool emplict)
 {
   cl_event event = NULL;
+  GET_QUEUE_THREAD_GPGPU(queue);
 
   /* Allocate and inialize the structure itself */
   TRY_ALLOC_NO_ERR (event, CALLOC(struct _cl_event));
@@ -75,7 +76,7 @@ cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type ty
   else {
     event->status = CL_QUEUED;
     if(cl_event_is_gpu_command_type(event->type))
-      event->gpgpu_event = cl_gpgpu_event_new(queue->gpgpu);
+      event->gpgpu_event = cl_gpgpu_event_new(gpgpu);
   }
   cl_event_add_ref(event);       //dec when complete
   event->user_cb = NULL;
@@ -124,12 +125,15 @@ void cl_event_delete(cl_event event)
   /* Remove it from the list */
   assert(event->ctx);
   pthread_mutex_lock(&event->ctx->event_lock);
-    if (event->prev)
-      event->prev->next = event->next;
-    if (event->next)
-      event->next->prev = event->prev;
-    if (event->prev == NULL && event->next == NULL)
-      event->ctx->events = NULL;
+
+  if (event->prev)
+    event->prev->next = event->next;
+  if (event->next)
+    event->next->prev = event->prev;
+  /* if this is the head, update head pointer ctx->events */
+  if (event->ctx->events == event)
+    event->ctx->events = event->next;
+
   pthread_mutex_unlock(&event->ctx->event_lock);
   cl_context_delete(event->ctx);
 
@@ -254,6 +258,7 @@ void cl_event_new_enqueue_callback(cl_event event,
   user_event *user_events, *u_ev;
   cl_command_queue queue = event->queue;
   cl_int i;
+  GET_QUEUE_THREAD_GPGPU(data->queue);
 
   /* Allocate and inialize the structure itself */
   TRY_ALLOC_NO_ERR (cb, CALLOC(enqueue_callback));
@@ -333,7 +338,7 @@ void cl_event_new_enqueue_callback(cl_event event,
     }
   }
   if(data->queue != NULL && event->gpgpu_event != NULL) {
-    cl_gpgpu_event_pending(data->queue->gpgpu, event->gpgpu_event);
+    cl_gpgpu_event_pending(gpgpu, event->gpgpu_event);
     data->ptr = (void *)event->gpgpu_event;
   }
   cb->data = *data;
@@ -375,7 +380,7 @@ void cl_event_set_status(cl_event event, cl_int status)
 
   if(status <= CL_COMPLETE) {
     if(event->enqueue_cb) {
-      cl_enqueue_handle(&event->enqueue_cb->data);
+      cl_enqueue_handle(event, &event->enqueue_cb->data);
       if(event->gpgpu_event)
         cl_gpgpu_event_update_status(event->gpgpu_event, 1);  //now set complet, need refine
       event->status = status;  //Change the event status after enqueue and befor unlock
@@ -491,22 +496,29 @@ cl_int cl_event_marker(cl_command_queue queue, cl_event* event)
   return CL_SUCCESS;
 }
 
-cl_int cl_event_profiling(cl_event event, cl_profiling_info param_name, cl_ulong *ret_val)
+cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name)
 {
+  cl_ulong ret_val = 0;
+  GET_QUEUE_THREAD_GPGPU(event->queue);
+
   if (!event->gpgpu_event) {
-    /* Some event like read buffer do not need GPU involved, so
-       we just return all the profiling to 0 now. */
-    *ret_val = 0;
+    cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val);
+    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
     return CL_SUCCESS;
   }
 
-  if(param_name == CL_PROFILING_COMMAND_START ||
-     param_name == CL_PROFILING_COMMAND_QUEUED ||
-     param_name == CL_PROFILING_COMMAND_SUBMIT) {
-    cl_gpgpu_event_get_timestamp(event->gpgpu_event, 0, ret_val);
+  if(param_name == CL_PROFILING_COMMAND_SUBMIT ||
+         param_name == CL_PROFILING_COMMAND_QUEUED) {
+    cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val);
+    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
+    return CL_SUCCESS;
+  } else if(param_name == CL_PROFILING_COMMAND_START) {
+    cl_gpgpu_event_get_exec_timestamp(event->gpgpu_event, 0, &ret_val);
+    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
     return CL_SUCCESS;
   } else if (param_name == CL_PROFILING_COMMAND_END) {
-    cl_gpgpu_event_get_timestamp(event->gpgpu_event, 1, ret_val);
+    cl_gpgpu_event_get_exec_timestamp(event->gpgpu_event, 1, &ret_val);
+    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
     return CL_SUCCESS;
   } else {
     return CL_INVALID_VALUE;
diff --git a/src/cl_event.h b/src/cl_event.h
index 722486a..3c61110 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -68,6 +68,7 @@ struct _cl_event {
   enqueue_callback*  enqueue_cb;  /* This event's enqueue */
   enqueue_callback*  waits_head;  /* The head of enqueues list wait on this event */
   cl_bool            emplict;     /* Identify this event whether created by api emplict*/
+  cl_ulong           timestamp[4];/* The time stamps for profiling. */
 };
 
 /* Create a new event object */
@@ -91,6 +92,6 @@ void cl_event_update_status(cl_event);
 /* Create the marker event */
 cl_int cl_event_marker(cl_command_queue, cl_event*);
 /* Do the event profiling */
-cl_int cl_event_profiling(cl_event event, cl_profiling_info param_name, cl_ulong *ret_val);
+cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name);
 #endif /* __CL_EVENT_H__ */
 
diff --git a/src/cl_gen75_device.h b/src/cl_gen75_device.h
index e72ab0b..7bf662e 100644
--- a/src/cl_gen75_device.h
+++ b/src/cl_gen75_device.h
@@ -19,7 +19,7 @@
 
 /* Common fields for both SNB devices (either GT1 or GT2)
  */
-.max_parameter_size = 256, 
+.max_parameter_size = 1024, 
 .global_mem_cache_line_size = 128, /* XXX */
 .global_mem_cache_size = 8 << 10, /* XXX */
 .local_mem_type = CL_GLOBAL,
diff --git a/src/cl_gen7_device.h b/src/cl_gen7_device.h
index 5886103..e198d6f 100644
--- a/src/cl_gen7_device.h
+++ b/src/cl_gen7_device.h
@@ -18,7 +18,7 @@
  */
 
 /* Common fields for both IVB devices (either GT1 or GT2) */
-.max_parameter_size = 256, 
+.max_parameter_size = 1024, 
 .global_mem_cache_line_size = 128, /* XXX */
 .global_mem_cache_size = 8 << 10, /* XXX */
 .local_mem_type = CL_GLOBAL,
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index 87c4a24..110988a 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -35,19 +35,20 @@
 .native_vector_width_float = 16,
 .native_vector_width_double = 16,
 .native_vector_width_half = 16,
+.preferred_wg_sz_mul = 16,
 .address_bits = 32,
 .max_mem_alloc_size = 128 * 1024 * 1024,
 .image_support = CL_TRUE,
-.max_read_image_args = 0,
-.max_write_image_args = 0,
+.max_read_image_args = 128,
+.max_write_image_args = 8,
 .image2d_max_width = 8192,
 .image2d_max_height = 8192,
 .image3d_max_width = 8192,
 .image3d_max_height = 8192,
 .image3d_max_depth = 2048,
-.max_samplers = 8,
-.mem_base_addr_align = sizeof(cl_uint) * 8,
-.min_data_type_align_size = sizeof(cl_uint),
+.max_samplers = 16,
+.mem_base_addr_align = sizeof(cl_long) * 16 * 8,
+.min_data_type_align_size = sizeof(cl_long) * 16,
 .single_fp_config = 0, /* XXX */
 .global_mem_cache_type = CL_READ_WRITE_CACHE,
 .global_mem_size = 128 * 1024 * 1024,
@@ -58,16 +59,16 @@
 .profiling_timer_resolution = 80, /* ns */
 .endian_little = CL_TRUE,
 .available = CL_TRUE,
-.compiler_available = CL_FALSE, /* XXX */
+.compiler_available = CL_TRUE,
 .execution_capabilities = CL_EXEC_KERNEL | CL_EXEC_NATIVE_KERNEL,
 .queue_properties = CL_QUEUE_PROFILING_ENABLE,
 .platform = NULL, /* == intel_platform (set when requested) */
 /* IEEE 754, XXX does IVB support CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT? */
-.single_fp_config = CL_FP_DENORM | CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST , /* IEEE 754. */
+.single_fp_config = CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST , /* IEEE 754. */
 
 #define DECL_INFO_STRING(FIELD, STRING) \
     .FIELD = STRING,                    \
-    .JOIN(FIELD,_sz) = sizeof(STRING) + 1,
+    .JOIN(FIELD,_sz) = sizeof(STRING),
 DECL_INFO_STRING(name, "Intel HD Graphics Family")
 DECL_INFO_STRING(vendor, "Intel")
 DECL_INFO_STRING(version, LIBCL_VERSION_STRING)
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 4ba1c11..6a0c8e6 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -44,7 +44,6 @@ cl_kernel_delete(cl_kernel k)
   if (atomic_dec(&k->ref_n) > 1) return;
   /* Release one reference on all bos we own */
   if (k->bo)       cl_buffer_unreference(k->bo);
-  if (k->const_bo) cl_buffer_unreference(k->const_bo);
   /* This will be true for kernels created by clCreateKernel */
   if (k->ref_its_program) cl_program_delete(k->program);
   /* Release the curbe if allocated */
@@ -106,8 +105,14 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
   arg_type = gbe_kernel_get_arg_type(k->opaque, index);
   arg_sz = gbe_kernel_get_arg_size(k->opaque, index);
 
-  if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz))
-    return CL_INVALID_ARG_SIZE;
+  if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz)) {
+    if (arg_sz == 2 && arg_type == GBE_ARG_VALUE && sz == sizeof(cl_sampler)) {
+      /* FIXME: workaround for a kernel argument that is declared as
+         sampler_t but is never used by the kernel. */
+      arg_type = GBE_ARG_SAMPLER;
+    } else
+      return CL_INVALID_ARG_SIZE;
+  }
 
   if(UNLIKELY(arg_type == GBE_ARG_LOCAL_PTR && sz == 0))
     return CL_INVALID_ARG_SIZE;
@@ -230,6 +235,8 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
   assert(k->sampler_sz <= GEN_MAX_SAMPLERS);
   if (k->sampler_sz > 0)
     gbe_kernel_get_sampler_data(k->opaque, k->samplers);
+  gbe_kernel_get_compile_wg_size(k->opaque, k->compile_wg_sz);
+  k->stack_size = gbe_kernel_get_stack_size(k->opaque);
   /* Get image data & size */
   k->image_sz = gbe_kernel_get_image_size(k->opaque);
   assert(k->sampler_sz <= GEN_MAX_SURFACES);
@@ -254,7 +261,6 @@ cl_kernel_dup(cl_kernel from)
   TRY_ALLOC_NO_ERR (to, CALLOC(struct _cl_kernel));
   SET_ICD(to->dispatch)
   to->bo = from->bo;
-  to->const_bo = from->const_bo;
   to->opaque = from->opaque;
   to->ref_n = 1;
   to->magic = CL_MAGIC_KERNEL_HEADER;
@@ -263,6 +269,8 @@ cl_kernel_dup(cl_kernel from)
   to->curbe_sz = from->curbe_sz;
   to->sampler_sz = from->sampler_sz;
   to->image_sz = from->image_sz;
+  memcpy(to->compile_wg_sz, from->compile_wg_sz, sizeof(from->compile_wg_sz));
+  to->stack_size = from->stack_size;
   if (to->sampler_sz)
     memcpy(to->samplers, from->samplers, to->sampler_sz * sizeof(uint32_t));
   if (to->image_sz) {
@@ -275,7 +283,6 @@ cl_kernel_dup(cl_kernel from)
 
   /* Retain the bos */
   if (from->bo)       cl_buffer_reference(from->bo);
-  if (from->const_bo) cl_buffer_reference(from->const_bo);
 
   /* We retain the program destruction since this kernel (user allocated)
    * depends on the program for some of its pointers
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index e191058..fb509a2 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -47,7 +47,6 @@ struct _cl_kernel {
   uint64_t magic;             /* To identify it as a kernel */
   volatile int ref_n;         /* We reference count this object */
   cl_buffer bo;               /* The code itself */
-  cl_buffer const_bo;         /* Buffer for all __constants values in the OCL program */
   cl_program program;         /* Owns this structure (and pointers) */
   gbe_kernel opaque;          /* (Opaque) compiler structure for the OCL kernel */
   char *curbe;                /* One curbe per kernel */
@@ -56,6 +55,10 @@ struct _cl_kernel {
   size_t sampler_sz;          /* sampler size defined in kernel & kernel args. */
   struct ImageInfo *images;   /* images defined in kernel args */
   size_t image_sz;            /* image count in kernel args */
+  cl_ulong local_mem_sz;      /* Local memory size specified in kernel args. */
+  size_t compile_wg_sz[3];    /* Work-group size required by the kernel's
+                                 __attribute__((reqd_work_group_size(X, Y, Z))) qualifier. */
+  size_t stack_size;          /* Stack size per work item. */
   cl_argument *args;          /* To track argument setting */
   uint32_t arg_n:31;          /* Number of arguments */
   uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
diff --git a/src/cl_khr_icd.c b/src/cl_khr_icd.c
index d601134..cb5f5cd 100644
--- a/src/cl_khr_icd.c
+++ b/src/cl_khr_icd.c
@@ -14,7 +14,14 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
  */
-
+#include <CL/cl.h>
+#ifndef CL_VERSION_1_2
+#include <cl_mem.h>
+typedef cl_uint             cl_kernel_arg_info;
+typedef cl_bitfield         cl_mem_migration_flags;
+#define cl_device_partition_property cl_device_partition_property_ext
+#define CL_API_SUFFIX__VERSION_1_2
+#endif
 #include <ocl_icd.h>
 
 #include "cl_platform_id.h"
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 5e70ef1..40e0a99 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -52,17 +52,14 @@ static cl_mem_object_type
 cl_get_mem_object_type(cl_mem mem)
 {
   switch (mem->type) {
-    case CL_MEM_BUFFER_TYPE: return CL_MEM_OBJECT_BUFFER;
+    case CL_MEM_BUFFER_TYPE:
+    case CL_MEM_SUBBUFFER_TYPE:
+      return CL_MEM_OBJECT_BUFFER;
     case CL_MEM_IMAGE_TYPE:
     case CL_MEM_GL_IMAGE_TYPE:
     {
       struct _cl_mem_image *image = cl_mem_image(mem);
-      if (image->depth == 1)
-        return CL_MEM_OBJECT_IMAGE1D;
-      else if (image->depth == 2)
-        return CL_MEM_OBJECT_IMAGE2D;
-      else if (image->depth == 3)
-        return CL_MEM_OBJECT_IMAGE3D;
+      return image->image_type;
     }
     default:
       return CL_MEM_OBJECT_BUFFER;
@@ -114,12 +111,21 @@ cl_get_mem_object_info(cl_mem mem,
   case CL_MEM_CONTEXT:
     *((cl_context *)param_value) = mem->ctx;
     break;
-  // TODO: Need to implement sub buffer first.
   case CL_MEM_ASSOCIATED_MEMOBJECT:
-    NOT_IMPLEMENTED;
+    if(mem->type != CL_MEM_SUBBUFFER_TYPE) {
+      *((cl_mem *)param_value) = NULL;
+    } else {
+      struct _cl_mem_buffer* buf = (struct _cl_mem_buffer*)mem;
+      *((cl_mem *)param_value) = (cl_mem)(buf->parent);
+    }
     break;
   case CL_MEM_OFFSET:
-    NOT_IMPLEMENTED;
+    if(mem->type != CL_MEM_SUBBUFFER_TYPE) {
+      *((size_t *)param_value) = 0;
+    } else {
+      struct _cl_mem_buffer* buf = (struct _cl_mem_buffer*)mem;
+      *((size_t *)param_value) = buf->sub_offset;
+    }
     break;
   }
 
@@ -194,22 +200,9 @@ cl_mem_allocate(enum cl_mem_type type,
   cl_mem mem = NULL;
   cl_int err = CL_SUCCESS;
   size_t alignment = 64;
-  cl_ulong max_mem_size;
 
   assert(ctx);
 
-  if ((err = cl_get_device_info(ctx->device,
-                                CL_DEVICE_MAX_MEM_ALLOC_SIZE,
-                                sizeof(max_mem_size),
-                                &max_mem_size,
-                                NULL)) != CL_SUCCESS) {
-    goto error;
-  }
-  if (UNLIKELY(sz > max_mem_size)) {
-    err = CL_INVALID_BUFFER_SIZE;
-    goto error;
-  }
-
   /* Allocate and inialize the structure itself */
   if (type == CL_MEM_IMAGE_TYPE) {
     struct _cl_mem_image *image = NULL;
@@ -282,11 +275,32 @@ cl_mem_new_buffer(cl_context ctx,
 
   cl_int err = CL_SUCCESS;
   cl_mem mem = NULL;
+  cl_ulong max_mem_size;
+
+  if (UNLIKELY(sz == 0)) {
+    err = CL_INVALID_BUFFER_SIZE;
+    goto error;
+  }
+
+  if (UNLIKELY(((flags & CL_MEM_READ_WRITE)
+                  && (flags & (CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY)))
+		      || ((flags & CL_MEM_READ_ONLY) && (flags & (CL_MEM_WRITE_ONLY)))
+              || ((flags & CL_MEM_ALLOC_HOST_PTR) && (flags & CL_MEM_USE_HOST_PTR))
+              || ((flags & CL_MEM_COPY_HOST_PTR) && (flags & CL_MEM_USE_HOST_PTR))
+              || ((flags & (~(CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY
+                        | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR
+                        | CL_MEM_USE_HOST_PTR))) != 0))) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
 
   /* This flag is valid only if host_ptr is not NULL */
-  if (UNLIKELY((flags & CL_MEM_COPY_HOST_PTR ||
-                flags & CL_MEM_USE_HOST_PTR) &&
-                data == NULL)) {
+  if (UNLIKELY((((flags & CL_MEM_COPY_HOST_PTR) ||
+                (flags & CL_MEM_USE_HOST_PTR)) &&
+                data == NULL))
+               || (!(flags & (CL_MEM_COPY_HOST_PTR
+                            |CL_MEM_USE_HOST_PTR))
+                    && (data != NULL))) {
     err = CL_INVALID_HOST_PTR;
     goto error;
   }
@@ -307,6 +321,19 @@ cl_mem_new_buffer(cl_context ctx,
     goto error;
   }
 
+  if ((err = cl_get_device_info(ctx->device,
+                                CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                                sizeof(max_mem_size),
+                                &max_mem_size,
+                                NULL)) != CL_SUCCESS) {
+    goto error;
+  }
+
+  if (UNLIKELY(sz > max_mem_size)) {
+    err = CL_INVALID_BUFFER_SIZE;
+    goto error;
+  }
+
   /* Create the buffer in video memory */
   mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, &err);
   if (mem == NULL || err != CL_SUCCESS)
@@ -329,6 +356,102 @@ error:
   goto exit;
 }
 
+LOCAL cl_mem
+cl_mem_new_sub_buffer(cl_mem buffer,
+                      cl_mem_flags flags,
+                      cl_buffer_create_type create_type,
+                      const void *create_info,
+                      cl_int *errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = NULL;
+  struct _cl_mem_buffer *sub_buf = NULL;
+  /* Create a sub-buffer of 'buffer' for the region in create_info; the parent must be a plain buffer. */
+  if (buffer->type != CL_MEM_BUFFER_TYPE) {
+    err = CL_INVALID_MEM_OBJECT;
+    goto error;
+  }
+  /* Non-zero flags may not conflict with the parent's access mode nor carry any host-pointer flag. */
+  if (flags && (((buffer->flags & CL_MEM_WRITE_ONLY) && (flags & (CL_MEM_READ_WRITE|CL_MEM_READ_ONLY)))
+          || ((buffer->flags & CL_MEM_READ_ONLY) && (flags & (CL_MEM_READ_WRITE|CL_MEM_WRITE_ONLY)))
+          || (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR)))) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (create_type != CL_BUFFER_CREATE_TYPE_REGION) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (!create_info) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  const cl_buffer_region *info = (const cl_buffer_region *)create_info;
+
+  if (!info->size) {
+    err = CL_INVALID_BUFFER_SIZE;
+    goto error;
+  }
+  /* Bounds check written as a subtraction so that origin + size cannot wrap around size_t. */
+  if (info->origin > buffer->size || info->size > buffer->size - info->origin) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  /* NOTE(review): OpenCL reports CL_DEVICE_MEM_BASE_ADDR_ALIGN in bits; confirm the unit of mem_base_addr_align. */
+  if (info->origin & (buffer->ctx->device->mem_base_addr_align - 1)) {
+    err = CL_MISALIGNED_SUB_BUFFER_OFFSET;
+    goto error;
+  }
+
+  /* All checks passed: create the sub buffer and link it to the buffer. */
+  TRY_ALLOC (sub_buf, CALLOC(struct _cl_mem_buffer));
+  mem = &sub_buf->base;
+  mem->type = CL_MEM_SUBBUFFER_TYPE;
+  SET_ICD(mem->dispatch)
+  mem->ref_n = 1;
+  mem->magic = CL_MAGIC_MEM_HEADER;
+  mem->flags = flags;
+  sub_buf->parent = (struct _cl_mem_buffer*)buffer;
+
+  cl_mem_add_ref(buffer);   /* the sub-buffer holds a reference on its parent */
+  /* Append the buffer in the parent buffer list */
+  pthread_mutex_lock(&((struct _cl_mem_buffer*)buffer)->sub_lock);
+  sub_buf->sub_next = ((struct _cl_mem_buffer*)buffer)->subs;
+  if (((struct _cl_mem_buffer*)buffer)->subs != NULL)
+    ((struct _cl_mem_buffer*)buffer)->subs->sub_prev = sub_buf;
+  ((struct _cl_mem_buffer*)buffer)->subs = sub_buf;
+  pthread_mutex_unlock(&((struct _cl_mem_buffer*)buffer)->sub_lock);
+
+  mem->bo = buffer->bo;     /* shares the parent's bo; no separate allocation is made */
+  mem->size = info->size;
+  sub_buf->sub_offset = info->origin;
+  if (buffer->flags & CL_MEM_USE_HOST_PTR || buffer->flags & CL_MEM_COPY_HOST_PTR) {
+    mem->host_ptr = buffer->host_ptr;
+  }
+
+  cl_context_add_ref(buffer->ctx);
+  mem->ctx = buffer->ctx;
+  /* Append the buffer in the context buffer list */
+  pthread_mutex_lock(&buffer->ctx->buffer_lock);
+  mem->next = buffer->ctx->buffers;
+  if (buffer->ctx->buffers != NULL)
+    buffer->ctx->buffers->prev = mem;
+  buffer->ctx->buffers = mem;
+  pthread_mutex_unlock(&buffer->ctx->buffer_lock);
+
+exit:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem;
+error:
+  cl_mem_delete(mem);  /* NOTE(review): mem is still NULL on every goto above; presumably cl_mem_delete accepts NULL */
+  mem = NULL;
+  goto exit;
+}
+
 void
 cl_mem_copy_image_region(const size_t *origin, const size_t *region,
                          void *dst, size_t dst_row_pitch, size_t dst_slice_pitch,
@@ -473,6 +596,7 @@ _cl_mem_new_image(cl_context ctx,
   }
 
   sz = aligned_pitch * aligned_h * depth;
+
   mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, &err);
   if (mem == NULL || err != CL_SUCCESS)
     goto error;
@@ -546,8 +670,6 @@ cl_mem_delete(cl_mem mem)
      cl_mem_gl_delete(cl_mem_gl_image(mem));
   }
 #endif
-  if (LIKELY(mem->bo != NULL))
-    cl_buffer_unreference(mem->bo);
 
   /* Remove it from the list */
   assert(mem->ctx);
@@ -556,8 +678,8 @@ cl_mem_delete(cl_mem mem)
       mem->prev->next = mem->next;
     if (mem->next)
       mem->next->prev = mem->prev;
-    if (mem->prev == NULL && mem->next == NULL)
-      mem->ctx->buffers = NULL;
+    if (mem->ctx->buffers == mem)
+      mem->ctx->buffers = mem->next;
   pthread_mutex_unlock(&mem->ctx->buffer_lock);
   cl_context_delete(mem->ctx);
 
@@ -586,6 +708,24 @@ cl_mem_delete(cl_mem mem)
     }
   }
 
+  /* Iff we are sub, do nothing for bo release. */
+  if (mem->type == CL_MEM_SUBBUFFER_TYPE) {
+    struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+    /* Remove it from the parent's list */
+    assert(buffer->parent);
+    pthread_mutex_lock(&buffer->parent->sub_lock);
+    if (buffer->sub_prev)
+      buffer->sub_prev->sub_next = buffer->sub_next;
+    if (buffer->sub_next)
+      buffer->sub_next->sub_prev = buffer->sub_prev;
+    if (buffer->parent->subs == buffer)
+      buffer->parent->subs = buffer->sub_next;
+    pthread_mutex_unlock(&buffer->parent->sub_lock);
+    cl_mem_delete((cl_mem )(buffer->parent));
+  } else if (LIKELY(mem->bo != NULL)) {
+    cl_buffer_unreference(mem->bo);
+  }
+
   cl_free(mem);
 }
 
diff --git a/src/cl_mem.h b/src/cl_mem.h
index 75d5cf4..e325fa1 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -66,6 +66,7 @@ typedef struct _cl_mem_dstr_cb {
 /* Used for buffers and images */
 enum cl_mem_type {
   CL_MEM_BUFFER_TYPE,
+  CL_MEM_SUBBUFFER_TYPE,
   CL_MEM_IMAGE_TYPE,
   CL_MEM_GL_IMAGE_TYPE,
 };
@@ -137,7 +138,11 @@ cl_mem_image_init(struct _cl_mem_image *image, size_t w, size_t h,
 
 struct _cl_mem_buffer {
   _cl_mem base;
-  size_t offset;
+  struct _cl_mem_buffer* subs;         /* Head of the list of sub-buffers created from this buffer. */
+  size_t sub_offset;                   /* Start offset of a sub-buffer within its parent. */
+  struct _cl_mem_buffer* sub_prev, *sub_next;/* Links chaining sibling sub-buffers together. */
+  pthread_mutex_t sub_lock;            /* Protects the list of sub-buffers. */
+  struct _cl_mem_buffer* parent;       /* Points to the parent buffer when this is a sub-buffer. */
 };
 
 inline static struct _cl_mem_image *
@@ -170,6 +175,9 @@ extern cl_int cl_get_image_info(cl_mem, cl_image_info, size_t, void *, size_t *)
 /* Create a new memory object and initialize it with possible user data */
 extern cl_mem cl_mem_new_buffer(cl_context, cl_mem_flags, size_t, void*, cl_int*);
 
+/* Create a new sub memory object */
+extern cl_mem cl_mem_new_sub_buffer(cl_mem, cl_mem_flags, cl_buffer_create_type, const void *, cl_int *);
+
 /* Idem but this is an image */
 extern cl_mem
 cl_mem_new_image(cl_context context,
diff --git a/src/cl_platform_id.h b/src/cl_platform_id.h
index 6b70aee..c7c716e 100644
--- a/src/cl_platform_id.h
+++ b/src/cl_platform_id.h
@@ -61,9 +61,10 @@ extern cl_int cl_get_platform_info(cl_platform_id    platform,
 
 #define _STR(x) #x
 #define _JOINT(x, y) _STR(x) "." _STR(y)
+#define _JOINT3(x, y, z) _STR(x) "." _STR(y) "." _STR(z)
 
 
-#define LIBCL_DRIVER_VERSION_STRING _JOINT(LIBCL_DRIVER_VERSION_MAJOR, LIBCL_DRIVER_VERSION_MINOR)
+#define LIBCL_DRIVER_VERSION_STRING _JOINT3(LIBCL_DRIVER_VERSION_MAJOR, LIBCL_DRIVER_VERSION_MINOR, LIBCL_DRIVER_VERSION_PATCH)
 #define LIBCL_VERSION_STRING "OpenCL " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR) " beignet " LIBCL_DRIVER_VERSION_STRING
 #define LIBCL_C_VERSION_STRING "OpenCL C " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR) " beignet " LIBCL_DRIVER_VERSION_STRING
 
diff --git a/src/cl_program.c b/src/cl_program.c
index 7ae8e8a..10eecee 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -79,8 +79,8 @@ cl_program_delete(cl_program p)
       p->prev->next = p->next;
     if (p->next)
       p->next->prev = p->prev;
-    if (p->prev == NULL && p->next == NULL)
-      p->ctx->programs = NULL;
+    if (p->ctx->programs == p)
+      p->ctx->programs = p->next;
   pthread_mutex_unlock(&p->ctx->program_lock);
 
   cl_free(p->bin);               /* Free the blob */
@@ -109,7 +109,9 @@ cl_program_new(cl_context ctx)
   p->ref_n = 1;
   p->magic = CL_MAGIC_PROGRAM_HEADER;
   p->ctx = ctx;
-
+  p->build_log = calloc(200, sizeof(char));
+  if (p->build_log)
+    p->build_log_max_sz = 200;
   /* The queue also belongs to its context */
   cl_context_add_ref(ctx);
 
@@ -223,7 +225,7 @@ cl_program_create_from_llvm(cl_context ctx,
   INVALID_VALUE_IF (file_name == NULL);
 
   program = cl_program_new(ctx);
-  program->opaque = gbe_program_new_from_llvm(file_name, 0, NULL, NULL);
+  program->opaque = gbe_program_new_from_llvm(file_name, program->build_log_max_sz, program->build_log, &program->build_log_sz, 1);
   if (UNLIKELY(program->opaque == NULL)) {
     err = CL_INVALID_PROGRAM;
     goto error;
@@ -300,20 +302,36 @@ cl_program_build(cl_program p, const char *options)
   int i = 0;
   int copyed = 0;
 
+  if (p->ref_n > 1)
+    return CL_INVALID_OPERATION;
+
   if (options) {
-    if(p->build_opts) {
-      cl_free(p->build_opts);
-      p->build_opts = NULL;
+    if(p->build_opts == NULL || strcmp(options, p->build_opts) != 0) {
+      if(p->build_opts) {
+        cl_free(p->build_opts);
+        p->build_opts = NULL;
+      }
+      TRY_ALLOC (p->build_opts, cl_calloc(strlen(options) + 1, sizeof(char)));
+      memcpy(p->build_opts, options, strlen(options));
+
+      p->source_type = p->source ? FROM_SOURCE : p->binary ? FROM_BINARY : FROM_LLVM;
     }
+  }
 
-    TRY_ALLOC (p->build_opts, cl_calloc(strlen(options) + 1, sizeof(char)));
-    memcpy(p->build_opts, options, strlen(options));
+  if (options == NULL && p->build_opts) {
+    p->source_type = p->source ? FROM_SOURCE : p->binary ? FROM_BINARY : FROM_LLVM;
+
+    cl_free(p->build_opts);
+    p->build_opts = NULL;
   }
 
   if (p->source_type == FROM_SOURCE) {
-    p->opaque = gbe_program_new_from_source(p->source, 0, options, NULL, NULL);
+    p->opaque = gbe_program_new_from_source(p->source, p->build_log_max_sz, options, p->build_log, &p->build_log_sz);
     if (UNLIKELY(p->opaque == NULL)) {
-      err = CL_INVALID_PROGRAM;
+      if (p->build_log_sz > 0 && strstr(p->build_log, "error: error reading 'options'"))
+        err = CL_INVALID_BUILD_OPTIONS;
+      else
+        err = CL_BUILD_PROGRAM_FAILURE;
       goto error;
     }
 
@@ -323,7 +341,7 @@ cl_program_build(cl_program p, const char *options)
   } else if (p->source_type == FROM_BINARY) {
     p->opaque = gbe_program_new_from_binary(p->binary, p->binary_sz);
     if (UNLIKELY(p->opaque == NULL)) {
-      err = CL_INVALID_PROGRAM;
+      err = CL_BUILD_PROGRAM_FAILURE;
       goto error;
     }
 
@@ -346,8 +364,8 @@ cl_program_build(cl_program p, const char *options)
     copyed += sz;
   }
 
-  p->is_built = 1;
 error:
+  p->is_built = 1;
   return err;
 }
 
@@ -397,7 +415,7 @@ cl_program_create_kernels_in_program(cl_program p, cl_kernel* ker)
   for (i = 0; i < p->ker_n; ++i) {
     TRY_ALLOC_NO_ERR(ker[i], cl_kernel_dup(p->ker[i]));
   }
-  
+
   return CL_SUCCESS;
 
 error:
diff --git a/src/cl_program.h b/src/cl_program.h
index 2cb547a..a6d75da 100644
--- a/src/cl_program.h
+++ b/src/cl_program.h
@@ -54,6 +54,9 @@ struct _cl_program {
   uint32_t source_type:2; /* Built from binary, source or LLVM */
   uint32_t is_built:1;    /* Did we call clBuildProgram on it? */
   char *build_opts;       /* The build options for this program */
+  size_t build_log_max_sz; /*build log maximum size in byte.*/
+  char *build_log;         /* The build log for this program. */
+  size_t build_log_sz;    /* The actual build log size.*/
 };
 
 /* Create a empty program */
diff --git a/src/cl_sampler.c b/src/cl_sampler.c
index b3f7045..d718256 100644
--- a/src/cl_sampler.c
+++ b/src/cl_sampler.c
@@ -30,8 +30,8 @@ uint32_t cl_to_clk(cl_bool normalized_coords,
                    cl_addressing_mode address,
                    cl_filter_mode filter)
 {
-  int clk_address;
-  int clk_filter;
+  int clk_address = CLK_ADDRESS_NONE;
+  int clk_filter = CLK_FILTER_NEAREST;
   switch (address) {
   case CL_ADDRESS_NONE: clk_address = CLK_ADDRESS_NONE; break;
   case CL_ADDRESS_CLAMP: clk_address = CLK_ADDRESS_CLAMP; break;
@@ -67,7 +67,7 @@ int cl_set_sampler_arg_slot(cl_kernel k, int index, cl_sampler sampler)
      }
     }
   }
-  assert(0);
+  return -1;
 }
 
 LOCAL cl_sampler
@@ -125,8 +125,8 @@ cl_sampler_delete(cl_sampler sampler)
       sampler->prev->next = sampler->next;
     if (sampler->next)
       sampler->next->prev = sampler->prev;
-    if (sampler->prev == NULL && sampler->next == NULL)
-      sampler->ctx->samplers = NULL;
+    if (sampler->ctx->samplers == sampler)
+      sampler->ctx->samplers = sampler->next;
   pthread_mutex_unlock(&sampler->ctx->sampler_lock);
   cl_context_delete(sampler->ctx);
 
diff --git a/src/cl_thread.c b/src/cl_thread.c
new file mode 100644
index 0000000..cadc3cd
--- /dev/null
+++ b/src/cl_thread.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "cl_thread.h"
+#include "cl_alloc.h"
+#include "cl_utils.h"
+
+static __thread void* thread_batch_buf = NULL;
+
+typedef struct _cl_thread_spec_data {
+  cl_gpgpu gpgpu ;  /* gpgpu object created on demand by cl_get_thread_gpgpu */
+  int valid;        /* set to 1 once gpgpu is created, back to 0 after it is deleted */
+}cl_thread_spec_data;
+
+void cl_set_thread_batch_buf(void* buf) {
+  if (thread_batch_buf) {  /* unreference the previously stored buffer before replacing it */
+    cl_gpgpu_unref_batch_buf(thread_batch_buf);
+  }
+  thread_batch_buf = buf;  /* stored as-is: no reference is taken on buf here */
+}
+
+void* cl_get_thread_batch_buf(void) {
+  return thread_batch_buf;  /* the calling thread's stored batch buffer; NULL if none was set */
+}
+
+cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue)
+{
+  pthread_key_t* key = queue->thread_data;
+  cl_thread_spec_data* thread_spec_data = pthread_getspecific(*key);
+
+  if (!thread_spec_data) {  /* nothing stored yet for this thread under the queue's key */
+    TRY_ALLOC_NO_ERR(thread_spec_data, CALLOC(struct _cl_thread_spec_data));
+    if (pthread_setspecific(*key, thread_spec_data)) {
+      cl_free(thread_spec_data);
+      return NULL;
+    }
+  }
+
+  if (!thread_spec_data->valid) {  /* no live gpgpu yet, or it was invalidated */
+    TRY_ALLOC_NO_ERR(thread_spec_data->gpgpu, cl_gpgpu_new(queue->ctx->drv));
+    thread_spec_data->valid = 1;
+  }
+  /* TRY_ALLOC_NO_ERR presumably jumps to 'error' on failure, so thread_spec_data may still be NULL below. */
+error:
+  return thread_spec_data ? thread_spec_data->gpgpu : NULL;
+}
+
+void cl_invalid_thread_gpgpu(cl_command_queue queue)
+{
+  pthread_key_t* key = queue->thread_data;
+  cl_thread_spec_data* thread_spec_data = pthread_getspecific(*key);
+
+  if (!thread_spec_data) {  /* nothing stored for this thread under the queue's key */
+    return;
+  }
+
+  if (!thread_spec_data->valid) {  /* no live gpgpu to release */
+    return;
+  }
+
+  assert(thread_spec_data->gpgpu);
+  cl_gpgpu_delete(thread_spec_data->gpgpu);
+  thread_spec_data->valid = 0;  /* the record is kept; only the gpgpu is marked as gone */
+}
+
+static void thread_data_destructor(void *data) {  /* registered through pthread_key_create in cl_thread_data_create */
+  cl_thread_spec_data* thread_spec_data = (cl_thread_spec_data *)data;
+
+  if (thread_batch_buf) {  /* also drop this thread's stored batch buffer */
+    cl_gpgpu_unref_batch_buf(thread_batch_buf);
+    thread_batch_buf = NULL;
+  }
+
+  if (thread_spec_data->valid)  /* gpgpu is only live while valid is set */
+    cl_gpgpu_delete(thread_spec_data->gpgpu);
+  cl_free(thread_spec_data);
+}
+
+/* Allocate and create the pthread key used for thread specific data; returns NULL on failure. */
+void* cl_thread_data_create(void)
+{
+  int rc = 0;
+
+  pthread_key_t *thread_specific_key = CALLOC(pthread_key_t);
+  if (thread_specific_key == NULL)
+    return NULL;
+
+  rc = pthread_key_create(thread_specific_key, thread_data_destructor);
+  if (rc != 0) {  /* do not leak the key storage when key creation fails */
+    cl_free(thread_specific_key);
+    return NULL;
+  }
+  return thread_specific_key;
+}
+
+/* Release the calling thread's specific data, then delete and free the key itself. */
+void cl_thread_data_destroy(void * data)
+{
+  pthread_key_t *thread_specific_key = (pthread_key_t *)data;
+
+  /* First release self spec data. */
+  cl_thread_spec_data* thread_spec_data =
+         pthread_getspecific(*thread_specific_key);
+  if (thread_spec_data) {
+    if (thread_spec_data->valid)  /* gpgpu is only live while valid is set */
+      cl_gpgpu_delete(thread_spec_data->gpgpu);
+    cl_free(thread_spec_data);    /* free the record even when its gpgpu was already invalidated */
+  }
+
+  pthread_key_delete(*thread_specific_key);
+  cl_free(thread_specific_key);
+}
diff --git a/src/cl_thread.h b/src/cl_thread.h
new file mode 100644
index 0000000..c8ab63c
--- /dev/null
+++ b/src/cl_thread.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef __CL_THREAD_H__
+#define __CL_THREAD_H__
+
+#include <pthread.h>
+#include "cl_internals.h"
+#include "cl_command_queue.h"
+
+/* Create the pthread key used for thread specific data; returns NULL on failure. */
+void* cl_thread_data_create(void);
+
+/* Destroy the thread specific data; data is the key pointer from cl_thread_data_create. */
+void cl_thread_data_destroy(void * data);
+
+/* Get the calling thread's gpgpu for this queue, creating it on first use; may return NULL. */
+cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue);
+
+/* Delete the calling thread's gpgpu for this queue, if it has a live one. */
+void cl_invalid_thread_gpgpu(cl_command_queue queue);
+
+/* Store buf as the calling thread's batch buffer, unreferencing the one stored before. */
+void cl_set_thread_batch_buf(void* buf);
+
+/* Return the calling thread's stored batch buffer, or NULL if none was set. */
+void* cl_get_thread_batch_buf(void);
+
+#endif /* __CL_THREAD_H__ */
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index cfbb302..f88a105 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -204,7 +204,7 @@ intel_driver_open(intel_driver_t *intel, cl_context_prop props)
   }
 
   if(!intel_driver_is_active(intel)) {
-    printf("Trying to open directly...");
+    printf("Trying to open directly...\n");
     char card_name[20];
     for(cardi = 0; cardi < 16; cardi++) {
       sprintf(card_name, "/dev/dri/card%d", cardi);
@@ -276,7 +276,10 @@ intel_driver_init_master(intel_driver_t *driver, const char* dev_name)
 
   // usually dev_name = "/dev/dri/card%d"
   dev_fd = open(dev_name, O_RDWR);
-  if (dev_fd == -1) return 0;
+  if (dev_fd == -1) {
+    printf("open(\"%s\", O_RDWR) failed: %s\n", dev_name, strerror(errno));
+    return 0;
+  }
 
   // Check that we're authenticated and the only opener
   memset(&client, 0, sizeof(drm_client_t));
@@ -284,6 +287,7 @@ intel_driver_init_master(intel_driver_t *driver, const char* dev_name)
   assert (ret == 0);
 
   if (!client.auth) {
+    printf("%s not authenticated\n", dev_name);
     close(dev_fd);
     return 0;
   }
@@ -291,6 +295,7 @@ intel_driver_init_master(intel_driver_t *driver, const char* dev_name)
   client.idx = 1;
   ret = ioctl(dev_fd, DRM_IOCTL_GET_CLIENT, &client);
   if (ret != -1 || errno != EINVAL) {
+    printf("%s is already in use\n", dev_name);
     close(dev_fd);
     return 0;
   }
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index b9bf2f9..b2d8bb0 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -51,6 +51,8 @@
 #define MO_RETAIN_BIT         (1 << 28)
 #define SAMPLER_STATE_SIZE    (16)
 
+#define TIMESTAMP_ADDR        0x2358
+
 /* Stores both binding tables and surface states */
 typedef struct surface_heap {
   uint32_t binding_table[256];
@@ -69,7 +71,7 @@ typedef struct intel_event {
 /* We can bind only a limited number of buffers */
 enum { max_buf_n = 128 };
 
-enum { max_img_n = 32 };
+enum { max_img_n = 128};
 
 enum {max_sampler_n = 16 };
 
@@ -80,6 +82,7 @@ struct intel_gpgpu
   intel_batchbuffer_t *batch;
   cl_gpgpu_kernel *ker;
   drm_intel_bo *binded_buf[max_buf_n];  /* all buffers binded for the call */
+  uint32_t target_buf_offset[max_buf_n];/* internal offset for buffers binded for the call */
   uint32_t binded_offset[max_buf_n];    /* their offsets in the curbe buffer */
   uint32_t binded_n;                    /* number of buffers binded */
 
@@ -114,10 +117,24 @@ typedef struct intel_gpgpu intel_gpgpu_t;
 
 
 static void
-intel_gpgpu_sync(intel_gpgpu_t *gpgpu)
+intel_gpgpu_sync(void *buf)
+{
+  if (buf)
+    drm_intel_bo_wait_rendering((drm_intel_bo *)buf);
+}
+
+static void *intel_gpgpu_ref_batch_buf(intel_gpgpu_t *gpgpu)
 {
   if (gpgpu->batch->last_bo)
-    drm_intel_bo_wait_rendering(gpgpu->batch->last_bo);
+    drm_intel_bo_reference(gpgpu->batch->last_bo);
+
+  return gpgpu->batch->last_bo;
+}
+
+static void intel_gpgpu_unref_batch_buf(void *buf)
+{
+  if (buf)
+    drm_intel_bo_unreference((drm_intel_bo *)buf);
 }
 
 static void
@@ -644,10 +661,12 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
 }
 
 static void
-intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset, uint32_t cchint)
+intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset,
+                     uint32_t internal_offset, uint32_t cchint)
 {
   assert(gpgpu->binded_n < max_buf_n);
   gpgpu->binded_buf[gpgpu->binded_n] = buf;
+  gpgpu->target_buf_offset[gpgpu->binded_n] = internal_offset;
   gpgpu->binded_offset[gpgpu->binded_n] = offset;
   gpgpu->binded_n++;
 }
@@ -674,7 +693,7 @@ intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint
 {
   drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
   gpgpu->stack_b.bo = drm_intel_bo_alloc(bufmgr, "STACK", size, 64);
-  intel_gpgpu_bind_buf(gpgpu, gpgpu->stack_b.bo, offset, cchint);
+  intel_gpgpu_bind_buf(gpgpu, gpgpu->stack_b.bo, offset, 0, cchint);
 }
 
 static void
@@ -768,11 +787,11 @@ intel_gpgpu_upload_curbes(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
   /* Now put all the relocations for our flat address space */
   for (i = 0; i < k->thread_n; ++i)
     for (j = 0; j < gpgpu->binded_n; ++j) {
-      *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset;
+      *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset + gpgpu->target_buf_offset[j];
       drm_intel_bo_emit_reloc(gpgpu->curbe_b.bo,
                               gpgpu->binded_offset[j]+i*k->curbe_sz,
                               gpgpu->binded_buf[j],
-                              0,
+                              gpgpu->target_buf_offset[j],
                               I915_GEM_DOMAIN_RENDER,
                               I915_GEM_DOMAIN_RENDER);
     }
@@ -872,7 +891,7 @@ intel_gpgpu_bind_sampler(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sample
 {
   int index;
 #ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
-  assert(sampler_sz <= GEN_MAX_SAMPLERS/2);
+  //assert(sampler_sz <= GEN_MAX_SAMPLERS/2);
 #else
   assert(sampler_sz <= GEN_MAX_SAMPLERS);
 #endif
@@ -1038,15 +1057,42 @@ intel_gpgpu_event_delete(intel_event_t *event)
   cl_free(event);
 }
 
+/* Read the GPU's current timestamp register and store the converted value in *ret_ts. */
 static void
-intel_gpgpu_event_get_timestamp(intel_event_t *event, int index, uint64_t* ret_ts)
+intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* ret_ts)
 {
+  uint64_t result = 0;
+  drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+
+  drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, &result);  /* NOTE(review): the return value is not checked */
+  result = result & 0xFFFFFFFFF0000000;  /* keep only bits 63..28 of the raw value */
+  result = result >> 28;
+  result *= 80;  /* presumably 80ns per tick, as in the exec timestamp conversion; confirm */
+
+  *ret_ts = result;
+  return;
+}
+
+/* Get the GPU execute time. */
+static void
+intel_gpgpu_event_get_exec_timestamp(intel_event_t *event,
+                                int index, uint64_t* ret_ts)
+{
+  uint64_t result = 0;
+
   assert(event->ts_buf != NULL);
   assert(index == 0 || index == 1);
   drm_intel_gem_bo_map_gtt(event->ts_buf);
   uint64_t* ptr = event->ts_buf->virtual;
+  result = ptr[index];
+
+  /* According to BSpec, the timestamp counter should be 36 bits,
+     but comparing to the timestamp counter from IO control reading,
+     we find the first 4 bits seems to be fake. In order to keep the
+     timestamp counter conformable, we just skip the first 4 bits. */
+  result = ((result & 0x0FFFFFFFF) << 4) * 80; //convert to nanoseconds
+  *ret_ts = result;
 
-  *ret_ts = ptr[index] * 80; //convert to nanoseconds
   drm_intel_gem_bo_unmap_gtt(event->ts_buf);
 }
 
@@ -1077,6 +1123,9 @@ intel_set_gpgpu_callbacks(void)
   cl_gpgpu_event_pending = (cl_gpgpu_event_pending_cb *)intel_gpgpu_event_pending;
   cl_gpgpu_event_resume = (cl_gpgpu_event_resume_cb *)intel_gpgpu_event_resume;
   cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb *)intel_gpgpu_event_delete;
-  cl_gpgpu_event_get_timestamp = (cl_gpgpu_event_get_timestamp_cb *)intel_gpgpu_event_get_timestamp;
+  cl_gpgpu_event_get_exec_timestamp = (cl_gpgpu_event_get_exec_timestamp_cb *)intel_gpgpu_event_get_exec_timestamp;
+  cl_gpgpu_event_get_gpu_cur_timestamp = (cl_gpgpu_event_get_gpu_cur_timestamp_cb *)intel_gpgpu_event_get_gpu_cur_timestamp;
+  cl_gpgpu_ref_batch_buf = (cl_gpgpu_ref_batch_buf_cb *)intel_gpgpu_ref_batch_buf;
+  cl_gpgpu_unref_batch_buf = (cl_gpgpu_unref_batch_buf_cb *)intel_gpgpu_unref_batch_buf;
 }
 
diff --git a/utests/.gitignore b/utests/.gitignore
index 9a374dc..90f80fc 100644
--- a/utests/.gitignore
+++ b/utests/.gitignore
@@ -11,3 +11,5 @@ compiler_ribbon.bmp
 flat_address_space
 libutests.so
 utest_run
+generated
+utest_generator.pyc
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 37240fe..0614ee6 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -1,9 +1,25 @@
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}
                     ${CMAKE_CURRENT_SOURCE_DIR}/../include)
 
+##### Math Function Part:
+EXEC_PROGRAM(mkdir ${CMAKE_CURRENT_SOURCE_DIR} ARGS generated -p)
+EXEC_PROGRAM(python ${CMAKE_CURRENT_SOURCE_DIR} ARGS utest_math_gen.py OUTPUT_VARIABLE GEN_MATH_STRING)
+string(REGEX REPLACE " " ";" ADDMATHFUNC ${GEN_MATH_STRING})
+string(REGEX REPLACE " " "\n" NAMEMATHLIST ${GEN_MATH_STRING})
+MESSAGE(STATUS "Generated Builtin Math Functions:\n" ${NAMEMATHLIST})
+
+string(REGEX REPLACE "generated/([^\ ]*)\\.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../kernels/\\1.cl" KERNEL_MATH_LIST ${GEN_MATH_STRING})
+string(REGEX REPLACE " " ";" KERNEL_MATH_LIST ${KERNEL_MATH_LIST})
+string(REGEX REPLACE "generated/([^\ ]*)\\.cpp" "\\1.cl" KERNEL_GITIGNORE_LIST ${GEN_MATH_STRING})
+set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "generated;${KERNEL_MATH_LIST}")
+
+configure_file (
+  "setenv.sh.in"
+  "setenv.sh"
+  )
+
 link_directories (${LLVM_LIBRARY_DIR})
 set (utests_sources
-  cl_create_kernel.cpp
   utest_error.c
   compiler_basic_arithmetic.cpp
   compiler_displacement_map_element.cpp
@@ -53,7 +69,7 @@ set (utests_sources
   compiler_if_else.cpp
   compiler_integer_division.cpp
   compiler_integer_remainder.cpp
-	compiler_insert_vector.cpp
+  compiler_insert_vector.cpp
   compiler_lower_return0.cpp
   compiler_lower_return1.cpp
   compiler_lower_return2.cpp
@@ -129,7 +145,10 @@ set (utests_sources
   builtin_num_groups.cpp
   builtin_local_id.cpp
   builtin_acos_asin.cpp
+  builtin_pow.cpp
+  builtin_exp.cpp
   builtin_convert_sat.cpp
+  sub_buffer.cpp
   runtime_createcontext.cpp
   runtime_null_kernel_arg.cpp
   runtime_event.cpp
@@ -145,7 +164,9 @@ set (utests_sources
   compiler_long_asr.cpp
   compiler_long_mult.cpp
   compiler_long_cmp.cpp
+  compiler_function_argument3.cpp
   compiler_bool_cross_basic_block.cpp
+  compiler_private_data_overflow.cpp
   load_program_from_bin.cpp
   enqueue_copy_buf.cpp
   utest_assert.cpp
@@ -156,26 +177,38 @@ set (utests_sources
 SET (kernel_bin ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/compiler_ceil)
 ADD_CUSTOM_COMMAND(
     OUTPUT ${kernel_bin}.bin
-    COMMAND ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl -o${kernel_bin}.bin
+    COMMAND ${GBE_BIN_GENERATER} ${kernel_bin}.cl -o${kernel_bin}.bin
     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl
     )
 
 ADD_CUSTOM_TARGET(kernel_bin.bin
     DEPENDS ${kernel_bin}.bin)
 
+add_custom_command(OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/generated
+    COMMAND mkdir -p ${CMAKE_CURRENT_SOURCE_DIR}/generated
+    COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/utest_math_gen.py > /dev/null 2>&1
+    COMMAND echo ${KERNEL_GITIGNORE_LIST} |sed 's/ /\\n/g' > ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/.gitignore
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    ) # regenerates the math utest sources and rewrites kernels/.gitignore with the generated kernel names
+add_custom_target(utest_generator
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/generated
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    ) # DEPENDS uses the same absolute path as the custom command's OUTPUT so the two always match
+
 if (EGL_FOUND AND MESA_SOURCE_FOUND)
 SET(utests_sources ${utests_sources} compiler_fill_gl_image.cpp)
-SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS}")
-SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS}")
+SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS} ${DEF_OCL_PCH_PCM_PATH}")
+SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS} ${DEF_OCL_PCH_PCM_PATH}")
 endif (EGL_FOUND AND MESA_SOURCE_FOUND)
 
-ADD_LIBRARY(utests SHARED ${utests_sources})
+ADD_LIBRARY(utests SHARED ${ADDMATHFUNC} ${utests_sources})
 
 TARGET_LINK_LIBRARIES(utests cl m ${OPENGL_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 
 ADD_EXECUTABLE(utest_run utest_run.cpp)
 TARGET_LINK_LIBRARIES(utest_run utests)
 ADD_DEPENDENCIES (utest_run kernel_bin.bin)
+ADD_DEPENDENCIES (utests utest_generator)
 
 ADD_EXECUTABLE(flat_address_space runtime_flat_address_space.cpp)
 TARGET_LINK_LIBRARIES(flat_address_space utests)
diff --git a/utests/builtin_exp.cpp b/utests/builtin_exp.cpp
new file mode 100644
index 0000000..d5288c8
--- /dev/null
+++ b/utests/builtin_exp.cpp
@@ -0,0 +1,102 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+#define udebug 0
+
+#define FLT_MAX 0x1.fffffep127f
+#define FLT_MIN 0x1.0p-126f
+#define FLT_ULP  (1.0e-6f)
+
+#define printf_c(...) \
+{\
+  printf("\033[1m\033[40;31m");\
+  printf( __VA_ARGS__ );\
+  printf("\033[0m");\
+}
+
+const float input_data[] = {FLT_MAX, -FLT_MAX, FLT_MIN, -FLT_MIN, 80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0 };
+const int count_input = sizeof(input_data) / sizeof(input_data[0]);
+const int max_function = 5;
+
+static void cpu_compiler_math(float *dst, const float *src)   // CPU reference: writes five results for the single input *src
+{
+  const float x = *src;
+
+  dst[0] = exp(x);
+  dst[1] = exp2(x);
+  dst[2] = exp10(x);   // NOTE(review): exp10 is a GNU libm extension - confirm it exists on every build host
+  dst[3] = expm1(x);
+  dst[4] = x;          // the input itself, copied through unchanged
+}
+
+static void builtin_exp(void)
+{
+  // Run the builtin_exp kernel over input_data and check every result against cpu_compiler_math()
+  int k, i, index_cur;
+  float gpu_data[max_function * count_input] = {0}, cpu_data[max_function * count_input] = {0};
+  float diff;
+  char log[256] = {0};
+
+  OCL_CREATE_KERNEL("builtin_exp");
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, count_input * max_function * sizeof(float), NULL);  // results, read back into gpu_data
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, count_input * sizeof(float), NULL);                 // receives input_data
+  OCL_CREATE_BUFFER(buf[2], CL_MEM_READ_WRITE, sizeof(int), NULL);                                 // receives max_function
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+
+  globals[0] = count_input;
+  locals[0] = 1;
+
+  clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * sizeof(float), input_data, 0, NULL, NULL);
+  clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), &max_function , 0, NULL, NULL);
+
+   // Run the kernel
+  OCL_NDRANGE( 1 );
+
+  clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(float) * max_function * count_input, gpu_data, 0, NULL, NULL);
+
+  for (k = 0; (uint)k < count_input; k++)
+  {
+    cpu_compiler_math( cpu_data + k * max_function, input_data + k);
+
+    for (i = 0; i < max_function; i++)
+    {
+      index_cur = k * max_function + i;
+      diff = fabs(gpu_data[index_cur]-cpu_data[index_cur]);
+      snprintf(log, sizeof(log), "%d/%d: %f -> gpu:%f  cpu:%f diff:%f expect:%f\n", \
+         k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur], \
+         diff/gpu_data[index_cur], 3 * FLT_ULP);
+
+#if udebug
+      if (isinf(cpu_data[index_cur]) && isinf(gpu_data[index_cur])){
+        printf("%s", log);  // log holds data, never use it as the format string
+      }
+      else if (isnan(cpu_data[index_cur]) && isnan(gpu_data[index_cur])){
+        printf("%s", log);
+      }
+      else if( diff / cpu_data[index_cur] < 3 * FLT_ULP \
+        && ( gpu_data[index_cur] > FLT_ULP  || cpu_data[index_cur] > FLT_ULP )){
+        printf("%s", log);
+      }
+      else if ( gpu_data[index_cur] < FLT_ULP && cpu_data[index_cur] < FLT_ULP)  // both results tiny (was testing gpu_data twice)
+        printf("%s", log);
+      else
+        printf_c("%s", log);  // mismatch: printed with the highlighting escape codes from printf_c
+#else
+      if (isinf(cpu_data[index_cur]))
+        OCL_ASSERTM(isinf(gpu_data[index_cur]), log);
+      else if (isnan(cpu_data[index_cur]))
+        OCL_ASSERTM(isnan(gpu_data[index_cur]), log);
+      else if ( gpu_data[index_cur] > FLT_ULP || cpu_data[index_cur] > FLT_ULP)
+        OCL_ASSERTM(fabs( diff / cpu_data[index_cur]) < 3 * FLT_ULP, log);  // relative error check
+      else
+        OCL_ASSERTM(fabs(diff) < 3 * FLT_ULP, log);                         // absolute error check otherwise
+#endif
+    }
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_exp)
diff --git a/utests/builtin_pow.cpp b/utests/builtin_pow.cpp
new file mode 100644
index 0000000..8ed17ed
--- /dev/null
+++ b/utests/builtin_pow.cpp
@@ -0,0 +1,92 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+#define udebug 0
+#define printf_c(...) \
+{\
+  printf("\033[1m\033[40;31m");\
+  printf( __VA_ARGS__ );\
+  printf("\033[0m");\
+}
+const float ori_data[] = {-20.5, -1, -0.9, -0.01, 0, 0.01, 0.9, 1.0, 20.5};
+const int count_input_ori = sizeof(ori_data) / sizeof(ori_data[0]);
+const int count_input = count_input_ori * count_input_ori;
+
+float input_data1[count_input];
+float input_data2[count_input];
+const int max_function = 1;
+
+static void cpu_compiler_math(const float *src1, const float *src2, float *dst)   // CPU reference for one (x, y) pair
+{
+  dst[0] = powf(src1[0], src2[0]);   // x = src1[0], y = src2[0]
+//  dst[1] = src1[0];
+}
+
+static void builtin_pow(void)
+{
+  // Run the builtin_pow kernel on every (x, y) pair from ori_data and compare with powf() on the CPU
+  int k, i, index_cur;
+  float gpu_data[max_function * count_input] = {0}, cpu_data[max_function * count_input] = {0};
+
+  for(i=0; i<count_input_ori;i++)   // input_data1 holds x, input_data2 holds y, for all count_input_ori^2 pairs
+    for(k=0; k<count_input_ori;k++)
+    {
+      input_data1[i*count_input_ori+k] = ori_data[i];
+      input_data2[i*count_input_ori+k] = ori_data[k];
+    }
+
+  OCL_CREATE_KERNEL("builtin_pow");
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, count_input * max_function * sizeof(float), NULL);   // results, read back into gpu_data
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, count_input * max_function * sizeof(float), NULL);   // receives input_data1
+  OCL_CREATE_BUFFER(buf[2], CL_MEM_READ_WRITE, count_input * max_function * sizeof(float), NULL);   // receives input_data2
+  OCL_CREATE_BUFFER(buf[3], CL_MEM_READ_WRITE, sizeof(int), NULL);                                  // receives max_function
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+
+  globals[0] = count_input;
+  locals[0] = 1;
+
+  clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * sizeof(float), input_data1, 0, NULL, NULL);
+  clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, count_input * sizeof(float), input_data2, 0, NULL, NULL);
+  clEnqueueWriteBuffer( queue, buf[3], CL_TRUE, 0, sizeof(int), &max_function, 0, NULL, NULL);
+
+   // Run the kernel
+  OCL_NDRANGE( 1 );
+
+  clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(float) * max_function * count_input, gpu_data, 0, NULL, NULL);
+
+  for (k = 0; (uint)k < count_input; k++)
+  {
+    cpu_compiler_math( input_data1 + k, input_data2 + k, cpu_data + k * max_function);
+
+    for (i = 0; i < max_function; i++)
+    {
+      index_cur = k * max_function + i;
+#if udebug
+      if ( (isinf(cpu_data[index_cur]) && !isinf(gpu_data[index_cur])) ||
+           (isnan(cpu_data[index_cur]) && !isnan(gpu_data[index_cur])) ||
+           (fabs(gpu_data[index_cur] - cpu_data[index_cur]) > 1e-5f)   )
+      {
+        printf_c("%d/%d: x:%f, y:%f -> gpu:%f  cpu:%f\n", k, i, input_data1[k], input_data2[k], gpu_data[index_cur], cpu_data[index_cur]);
+      }
+      else
+        printf("%d/%d: x:%f, y:%f -> gpu:%f  cpu:%f\n", k, i, input_data1[k], input_data2[k], gpu_data[index_cur], cpu_data[index_cur]);
+#else
+     if (isinf(cpu_data[index_cur]))
+       OCL_ASSERT(isinf(gpu_data[index_cur]));
+     else if (isnan(cpu_data[index_cur]))
+       OCL_ASSERT(isnan(gpu_data[index_cur]));
+     else
+     {
+       OCL_ASSERT(fabs(gpu_data[index_cur] - cpu_data[index_cur]) < 1e-3f);   // absolute tolerance; NOTE(review): udebug path uses 1e-5f - confirm which is intended
+     }
+#endif
+    }
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(builtin_pow)
diff --git a/utests/compiler_abs.cpp b/utests/compiler_abs.cpp
index 9457b9b..3f477a8 100644
--- a/utests/compiler_abs.cpp
+++ b/utests/compiler_abs.cpp
@@ -166,13 +166,16 @@ template <typename T, typename U> static void compiler_abs_with_type(void)
     }
 }
 
-#define ABS_TEST_TYPE(TYPE, UTYPE) \
+#define ABS_TEST_TYPE_1(TYPE, UTYPE, KEEP_PROGRAM) \
 	static void compiler_abs_##TYPE (void) \
         { \
            OCL_CALL (cl_kernel_init, "compiler_abs.cl", "compiler_abs_"#TYPE, SOURCE, NULL);  \
            compiler_abs_with_type<TYPE, UTYPE>(); \
         } \
-	MAKE_UTEST_FROM_FUNCTION(compiler_abs_##TYPE);
+	MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_abs_##TYPE, KEEP_PROGRAM);
+
+#define ABS_TEST_TYPE(TYPE, UTYPE) ABS_TEST_TYPE_1(TYPE, UTYPE, true)
+#define ABS_TEST_TYPE_END(TYPE, UTYPE) ABS_TEST_TYPE_1(TYPE, UTYPE, false)
 
 typedef unsigned char uchar;
 typedef unsigned short ushort;
@@ -248,4 +251,4 @@ ABS_TEST_TYPE(ushort2, ushort2)
 ABS_TEST_TYPE(ushort3, ushort3)
 ABS_TEST_TYPE(ushort4, ushort4)
 ABS_TEST_TYPE(ushort8, ushort8)
-ABS_TEST_TYPE(ushort16, ushort16)
+ABS_TEST_TYPE_END(ushort16, ushort16)
diff --git a/utests/compiler_abs_diff.cpp b/utests/compiler_abs_diff.cpp
index 71881b1..15a1f90 100644
--- a/utests/compiler_abs_diff.cpp
+++ b/utests/compiler_abs_diff.cpp
@@ -183,26 +183,29 @@ template <typename T, typename U> static void compiler_abs_diff_with_type(void)
 }
 
 
-#define ABS_TEST_DIFF_TYPE_2(TYPE, CLTYPE, UTYPE) \
+#define ABS_TEST_DIFF_TYPE_2(TYPE, CLTYPE, UTYPE, KEEP_PROGRAM) \
 	static void compiler_abs_diff_##CLTYPE (void) \
         { \
            OCL_CALL (cl_kernel_init, "compiler_abs_diff.cl", "compiler_abs_diff_"#CLTYPE, SOURCE, NULL);  \
            compiler_abs_diff_with_type<TYPE, UTYPE>(); \
         } \
-	MAKE_UTEST_FROM_FUNCTION(compiler_abs_diff_##CLTYPE);
+	MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_abs_diff_##CLTYPE, KEEP_PROGRAM);
+
+#define ABS_TEST_DIFF_TYPE(TYPE, UTYPE) ABS_TEST_DIFF_TYPE_2(TYPE, TYPE, UTYPE, true)
+
+#define ABS_TEST_DIFF_TYPE_END(TYPE, UTYPE) ABS_TEST_DIFF_TYPE_2(TYPE, TYPE, UTYPE, false)
 
-#define ABS_TEST_DIFF_TYPE(TYPE, UTYPE) ABS_TEST_DIFF_TYPE_2(TYPE, TYPE, UTYPE)
 
 typedef unsigned char uchar;
 typedef unsigned short ushort;
 typedef unsigned int uint;
 typedef uint64_t ulong64;
 ABS_TEST_DIFF_TYPE(int, uint)
-ABS_TEST_DIFF_TYPE_2(int64_t, long, ulong64)
+ABS_TEST_DIFF_TYPE_2(int64_t, long, ulong64, true)
 ABS_TEST_DIFF_TYPE(short, ushort)
 ABS_TEST_DIFF_TYPE(char, uchar)
 ABS_TEST_DIFF_TYPE(uint, uint)
-ABS_TEST_DIFF_TYPE_2(ulong64, ulong, ulong64)
+ABS_TEST_DIFF_TYPE_2(ulong64, ulong, ulong64, true)
 ABS_TEST_DIFF_TYPE(ushort, ushort)
 ABS_TEST_DIFF_TYPE(uchar, uchar)
 
@@ -289,4 +292,4 @@ ABS_TEST_DIFF_TYPE(ushort2, ushort2)
 ABS_TEST_DIFF_TYPE(ushort3, ushort3)
 ABS_TEST_DIFF_TYPE(ushort4, ushort4)
 ABS_TEST_DIFF_TYPE(ushort8, ushort8)
-ABS_TEST_DIFF_TYPE(ushort16, ushort16)
+ABS_TEST_DIFF_TYPE_END(ushort16, ushort16)
diff --git a/utests/compiler_basic_arithmetic.cpp b/utests/compiler_basic_arithmetic.cpp
index dcdd084..0e5ec41 100644
--- a/utests/compiler_basic_arithmetic.cpp
+++ b/utests/compiler_basic_arithmetic.cpp
@@ -61,52 +61,56 @@ std::cout <<"kernel name: " << kernel_name << std::endl;
   buf_data[0] = buf_data[1] = NULL;
 }
 
-#define DECL_TEST_SUB(type, alias) \
+#define DECL_TEST_SUB(type, alias, keep_program) \
 static void compiler_sub_ ##alias(void)\
 {\
   test_exec<type, TEST_OP_SUB>("compiler_sub_" # alias);\
 }\
-MAKE_UTEST_FROM_FUNCTION(compiler_sub_ ## alias)
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_sub_ ## alias, keep_program)
 
-#define DECL_TEST_ADD(type, alias) \
+#define DECL_TEST_ADD(type, alias, keep_program) \
 static void compiler_add_ ##alias(void)\
 {\
   test_exec<type, TEST_OP_ADD>("compiler_add_" # alias);\
 }\
-MAKE_UTEST_FROM_FUNCTION(compiler_add_ ## alias)
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_add_ ## alias, keep_program)
 
-#define DECL_TEST_MUL(type, alias) \
+#define DECL_TEST_MUL(type, alias, keep_program) \
 static void compiler_mul_ ##alias(void)\
 {\
   test_exec<type, TEST_OP_MUL>("compiler_mul_" # alias);\
 }\
-MAKE_UTEST_FROM_FUNCTION(compiler_mul_ ## alias)
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_mul_ ## alias, keep_program)
 
-#define DECL_TEST_DIV(type, alias) \
+#define DECL_TEST_DIV(type, alias, keep_program) \
 static void compiler_div_ ##alias(void)\
 {\
   test_exec<type, TEST_OP_DIV>("compiler_div_" # alias);\
 }\
-MAKE_UTEST_FROM_FUNCTION(compiler_div_ ## alias)
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_div_ ## alias, keep_program)
 
-#define DECL_TEST_REM(type, alias) \
+#define DECL_TEST_REM(type, alias, keep_program) \
 static void compiler_rem_ ##alias(void)\
 {\
   test_exec<type, TEST_OP_REM>("compiler_rem_" # alias);\
 }\
-MAKE_UTEST_FROM_FUNCTION(compiler_rem_ ## alias)
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_rem_ ## alias, keep_program)
 
-#define DECL_TEST_FOR_ALL_TYPE(op)\
-DECL_TEST_##op(int8_t, char) \
-DECL_TEST_##op(uint8_t, uchar) \
-DECL_TEST_##op(int16_t, short) \
-DECL_TEST_##op(uint16_t, ushort) \
-DECL_TEST_##op(int32_t, int) \
-DECL_TEST_##op(uint32_t, uint)
+#define _DECL_TEST_FOR_ALL_TYPE(op, keep_program) \
+DECL_TEST_##op(int8_t, char, true) \
+DECL_TEST_##op(uint8_t, uchar, true) \
+DECL_TEST_##op(int16_t, short, true) \
+DECL_TEST_##op(uint16_t, ushort, true) \
+DECL_TEST_##op(int32_t, int, true) \
+DECL_TEST_##op(uint32_t, uint, keep_program)
+
+#define DECL_TEST_FOR_ALL_TYPE(op) _DECL_TEST_FOR_ALL_TYPE(op, true)
+
+#define DECL_TEST_FOR_ALL_TYPE_END(op) _DECL_TEST_FOR_ALL_TYPE(op, false)
 
 DECL_TEST_FOR_ALL_TYPE(SUB)
 DECL_TEST_FOR_ALL_TYPE(ADD)
 DECL_TEST_FOR_ALL_TYPE(MUL)
 DECL_TEST_FOR_ALL_TYPE(DIV)
-DECL_TEST_FOR_ALL_TYPE(REM)
+DECL_TEST_FOR_ALL_TYPE_END(REM)
 #undef DECL_TEST_FOR_ALL_TYPE
diff --git a/utests/compiler_bool_cross_basic_block.cpp b/utests/compiler_bool_cross_basic_block.cpp
index 4dd5bc7..908edc0 100644
--- a/utests/compiler_bool_cross_basic_block.cpp
+++ b/utests/compiler_bool_cross_basic_block.cpp
@@ -52,4 +52,4 @@ void compiler_bool_cross_basic_block(void){
 
 }
 
-MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_bool_cross_basic_block)
+MAKE_UTEST_FROM_FUNCTION(compiler_bool_cross_basic_block)
diff --git a/utests/compiler_function_argument3.cpp b/utests/compiler_function_argument3.cpp
new file mode 100644
index 0000000..e9f5e80
--- /dev/null
+++ b/utests/compiler_function_argument3.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+struct sfloat8 {   // eight floats; passed by value as kernel argument 0 and used to read back buf[0]
+    float a;
+    float b;
+    float c;
+    float d;
+    float e;
+    float f;
+    float g;
+    float h;
+};
+
+void compiler_function_argument3(void)
+{
+  sfloat8 arg6 = {};  // zero every field: the whole struct is copied into the kernel argument below
+
+  arg6.a = 3.0f;
+  arg6.h = 4.0f;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_function_argument3");
+  OCL_CREATE_BUFFER(buf[0], 0, sizeof(struct sfloat8) * 8, NULL);
+
+  OCL_SET_ARG(0, sizeof(arg6), &arg6);      // struct passed by value
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[0]);  // output buffer
+
+  // Run the kernel
+  globals[0] = 1;
+  locals[0] = 1;
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(0);
+
+  /* Check results */
+  sfloat8 *dst = (sfloat8*)buf_data[0];
+
+  OCL_ASSERT(dst[0].a == 3.0f);
+  OCL_ASSERT(dst[0].b == 12.0f);
+  OCL_ASSERT(dst[0].h == 7.0f);
+
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_argument3);
diff --git a/utests/compiler_global_constant.cpp b/utests/compiler_global_constant.cpp
index a2d0172..88f9852 100644
--- a/utests/compiler_global_constant.cpp
+++ b/utests/compiler_global_constant.cpp
@@ -98,7 +98,7 @@ void compiler_global_constant3(void)
   OCL_UNMAP_BUFFER(0);
 }
 
-MAKE_UTEST_FROM_FUNCTION(compiler_global_constant);
-MAKE_UTEST_FROM_FUNCTION(compiler_global_constant1);
-MAKE_UTEST_FROM_FUNCTION(compiler_global_constant2);
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_global_constant, true);
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_global_constant1, true);
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_global_constant2, true);
 MAKE_UTEST_FROM_FUNCTION(compiler_global_constant3);
diff --git a/utests/compiler_group_size.cpp b/utests/compiler_group_size.cpp
index 0c8881c..8ad83f0 100644
--- a/utests/compiler_group_size.cpp
+++ b/utests/compiler_group_size.cpp
@@ -134,8 +134,8 @@ void compiler_group_size4(void)
     OCL_UNMAP_BUFFER(1);
   }
 }
-MAKE_UTEST_FROM_FUNCTION(compiler_group_size1);
-MAKE_UTEST_FROM_FUNCTION(compiler_group_size2);
-MAKE_UTEST_FROM_FUNCTION(compiler_group_size3);
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_group_size1, true);
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_group_size2, true);
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_group_size3, true);
 MAKE_UTEST_FROM_FUNCTION(compiler_group_size4);
 
diff --git a/utests/compiler_long.cpp b/utests/compiler_long.cpp
index d7e1517..b525694 100644
--- a/utests/compiler_long.cpp
+++ b/utests/compiler_long.cpp
@@ -8,6 +8,7 @@ void compiler_long(void)
   const size_t n = 16;
   int64_t src1[n], src2[n];
 
+  int64_t zero = 0;
   // Setup kernel and buffers
   OCL_CREATE_KERNEL("compiler_long");
   OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
@@ -16,6 +17,7 @@ void compiler_long(void)
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
   OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
   OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_long), &zero);
   globals[0] = n;
   locals[0] = 16;
 
diff --git a/utests/compiler_long_cmp.cpp b/utests/compiler_long_cmp.cpp
index 3775556..35d4c4f 100644
--- a/utests/compiler_long_cmp.cpp
+++ b/utests/compiler_long_cmp.cpp
@@ -47,6 +47,7 @@ void compiler_long_cmp(void)
     OCL_ASSERT(x == dest[i]);
   }
   OCL_UNMAP_BUFFER(2);
+  OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
 
   OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_le");
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -60,6 +61,7 @@ void compiler_long_cmp(void)
     OCL_ASSERT(x == dest[i]);
   }
   OCL_UNMAP_BUFFER(2);
+  OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
 
   OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_g");
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -73,6 +75,7 @@ void compiler_long_cmp(void)
     OCL_ASSERT(x == dest[i]);
   }
   OCL_UNMAP_BUFFER(2);
+  OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
 
   OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_ge");
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -86,6 +89,7 @@ void compiler_long_cmp(void)
     OCL_ASSERT(x == dest[i]);
   }
   OCL_UNMAP_BUFFER(2);
+  OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
 
   OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_eq");
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
@@ -99,6 +103,7 @@ void compiler_long_cmp(void)
     OCL_ASSERT(x == dest[i]);
   }
   OCL_UNMAP_BUFFER(2);
+  OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
 
   OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_neq");
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
diff --git a/utests/compiler_long_convert.cpp b/utests/compiler_long_convert.cpp
index 827a45b..ada6926 100644
--- a/utests/compiler_long_convert.cpp
+++ b/utests/compiler_long_convert.cpp
@@ -65,7 +65,7 @@ void compiler_long_convert(void)
   OCL_UNMAP_BUFFER(5);
 }
 
-MAKE_UTEST_FROM_FUNCTION(compiler_long_convert);
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_long_convert, true);
 
 // convert 64-bit integer to shorter integer
 void compiler_long_convert_2(void)
@@ -115,7 +115,7 @@ void compiler_long_convert_2(void)
   OCL_UNMAP_BUFFER(2);
 }
 
-MAKE_UTEST_FROM_FUNCTION(compiler_long_convert_2);
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_long_convert_2, true);
 
 // convert 64-bit integer to 32-bit float
 void compiler_long_convert_to_float(void)
diff --git a/utests/compiler_private_data_overflow.cpp b/utests/compiler_private_data_overflow.cpp
new file mode 100644
index 0000000..0fa30a0
--- /dev/null
+++ b/utests/compiler_private_data_overflow.cpp
@@ -0,0 +1,15 @@
+#include "utest_helper.hpp"
+
+void compiler_private_data_overflow(void)
+{
+	OCL_CREATE_KERNEL( "compiler_private_data_overflow" );
+	OCL_CREATE_BUFFER( buf[0], 0, sizeof(cl_int4), NULL );	// single cl_int4-sized output buffer
+	OCL_SET_ARG( 0, sizeof(cl_mem), &buf[0] );
+	globals[0] = 64;	// 64 work-items in total
+	locals[0] = 32;	// 32 work-items per group
+	OCL_NDRANGE(1);
+	OCL_MAP_BUFFER(0);
+	OCL_ASSERT( ((uint32_t *)buf_data[0])[0] == 0 );	// first 32-bit word of the output must be 0 after the run
+	OCL_UNMAP_BUFFER(0);
+}
+MAKE_UTEST_FROM_FUNCTION( compiler_private_data_overflow );
diff --git a/utests/compiler_step.cpp b/utests/compiler_step.cpp
index 3285dda..b022826 100644
--- a/utests/compiler_step.cpp
+++ b/utests/compiler_step.cpp
@@ -322,17 +322,21 @@ template <typename T> static void compiler_stepf_with_type(void)
     }
 }
 
-#define STEPF_TEST_TYPE(TYPE) \
+#define _STEPF_TEST_TYPE(TYPE, keep_program) \
 	static void compiler_stepf_##TYPE (void) \
         { \
            OCL_CALL (cl_kernel_init, "compiler_step.cl", "compiler_stepf_"#TYPE, SOURCE, NULL);  \
            compiler_stepf_with_type<TYPE>(); \
         } \
-	MAKE_UTEST_FROM_FUNCTION(compiler_stepf_##TYPE);
+	MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_stepf_##TYPE, keep_program);
+
+#define STEPF_TEST_TYPE(TYPE) _STEPF_TEST_TYPE(TYPE, true)
+#define STEPF_TEST_TYPE_END(TYPE) _STEPF_TEST_TYPE(TYPE, false)
+
 
 STEPF_TEST_TYPE(float)
 STEPF_TEST_TYPE(float2)
 STEPF_TEST_TYPE(float3)
 STEPF_TEST_TYPE(float4)
 STEPF_TEST_TYPE(float8)
-STEPF_TEST_TYPE(float16)
+STEPF_TEST_TYPE_END(float16)
diff --git a/utests/compiler_vector_load_store.cpp b/utests/compiler_vector_load_store.cpp
index f8a3dcb..b44abc7 100644
--- a/utests/compiler_vector_load_store.cpp
+++ b/utests/compiler_vector_load_store.cpp
@@ -37,27 +37,27 @@ static void compiler_vector_load_store(int elemNum, const char *kernelName)
   OCL_UNMAP_BUFFER(1);
 }
 
-#define compiler_vector_load_store(type, n, kernel_type) \
+#define compiler_vector_load_store(type, n, kernel_type, keep_program) \
 static void compiler_vector_ ##kernel_type ##n ##_load_store(void)\
 {\
   compiler_vector_load_store<type>(n, "test_" #kernel_type #n);\
 }\
-MAKE_UTEST_FROM_FUNCTION(compiler_vector_ ## kernel_type ##n ##_load_store);
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_vector_ ## kernel_type ##n ##_load_store, keep_program);
 
-#define test_all_vector(type, kernel_type) \
-  compiler_vector_load_store(type, 2, kernel_type) \
-  compiler_vector_load_store(type, 3, kernel_type) \
-  compiler_vector_load_store(type, 4, kernel_type) \
-  compiler_vector_load_store(type, 8, kernel_type) \
-  compiler_vector_load_store(type, 16, kernel_type)
+#define test_all_vector(type, kernel_type, keep_program) \
+  compiler_vector_load_store(type, 2, kernel_type, true) \
+  compiler_vector_load_store(type, 3, kernel_type, true) \
+  compiler_vector_load_store(type, 4, kernel_type, true) \
+  compiler_vector_load_store(type, 8, kernel_type, true) \
+  compiler_vector_load_store(type, 16, kernel_type, keep_program)
 
-test_all_vector(int8_t, char)
-test_all_vector(uint8_t, uchar)
-test_all_vector(int16_t, short)
-test_all_vector(uint16_t, ushort)
-test_all_vector(int32_t, int)
-test_all_vector(uint32_t, uint)
-test_all_vector(float, float)
-test_all_vector(double, double)
-test_all_vector(int64_t, long)
-test_all_vector(uint64_t, ulong)
+test_all_vector(int8_t, char, true)
+test_all_vector(uint8_t, uchar, true)
+test_all_vector(int16_t, short, true)
+test_all_vector(uint16_t, ushort, true)
+test_all_vector(int32_t, int, true)
+test_all_vector(uint32_t, uint, true)
+test_all_vector(float, float, true)
+test_all_vector(double, double, true)
+test_all_vector(int64_t, long, true)
+test_all_vector(uint64_t, ulong, false)
diff --git a/utests/get_cl_info.cpp b/utests/get_cl_info.cpp
index ec02ce9..4148ce9 100644
--- a/utests/get_cl_info.cpp
+++ b/utests/get_cl_info.cpp
@@ -548,8 +548,18 @@ void get_mem_info(void)
 {
     map<cl_mem_info, void *> maps;
     int expect_ref;
+    cl_mem sub_buf;
+    cl_int error;
+
+    OCL_CREATE_BUFFER(buf[1], 0, 4096, NULL);
+
+    cl_buffer_region region;
+    region.origin = 1024;
+    region.size = 2048;
+    sub_buf = clCreateSubBuffer(buf[1], 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &error );
+    buf[0] = sub_buf;
+    OCL_ASSERT(error == CL_SUCCESS);
 
-    OCL_CREATE_BUFFER(buf[0], 0, 64, NULL);
     void * map_ptr = clEnqueueMapBuffer(queue, buf[0], 1, CL_MAP_READ, 0, 64, 0, NULL, NULL, NULL);
 
     expect_ref = CL_MEM_OBJECT_BUFFER;
@@ -558,7 +568,7 @@ void get_mem_info(void)
     expect_ref = 0;
     maps.insert(make_pair(CL_MEM_FLAGS,
                           (void *)(new Info_Result<cl_mem_flags>(expect_ref))));
-    expect_ref = 64;
+    expect_ref = 2048;
     maps.insert(make_pair(CL_MEM_SIZE,
                           (void *)(new Info_Result<size_t>(((size_t)expect_ref)))));
     expect_ref = 0;
@@ -572,6 +582,11 @@ void get_mem_info(void)
                           (void *)(new Info_Result<cl_uint>(((cl_uint)expect_ref)))));
     maps.insert(make_pair(CL_MEM_CONTEXT,
                           (void *)(new Info_Result<cl_context>(((cl_context)ctx)))));
+    maps.insert(make_pair(CL_MEM_ASSOCIATED_MEMOBJECT,
+                          (void *)(new Info_Result<cl_mem>(((cl_mem)buf[1])))));
+    expect_ref = 1024;
+    maps.insert(make_pair(CL_MEM_OFFSET,
+                          (void *)(new Info_Result<size_t>(((size_t)expect_ref)))));
 
     std::for_each(maps.begin(), maps.end(), [](pair<cl_mem_info, void *> x) {
         switch (x.first) {
@@ -596,6 +611,12 @@ void get_mem_info(void)
         case CL_MEM_CONTEXT:
             CALL_GETMEMINFO_AND_RET(cl_context);
             break;
+        case CL_MEM_ASSOCIATED_MEMOBJECT:
+            CALL_GETMEMINFO_AND_RET(cl_mem);
+            break;
+        case CL_MEM_OFFSET:
+            CALL_GETMEMINFO_AND_RET(size_t);
+            break;
 
         default:
             break;
diff --git a/utests/runtime_createcontext.cpp b/utests/runtime_createcontext.cpp
index b90e915..f08a189 100644
--- a/utests/runtime_createcontext.cpp
+++ b/utests/runtime_createcontext.cpp
@@ -3,9 +3,12 @@
 void runtime_createcontextfromtype(void) {
   cl_int status;
 
-  if (clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU, NULL, NULL, &status) == NULL) {
+  cl_context ctx;
+  ctx = clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
+  if (ctx == NULL) {
     OCL_THROW_ERROR("runtime_createcontextfromtype", status);
   }
+  clReleaseContext(ctx);
 }
 
 MAKE_UTEST_FROM_FUNCTION(runtime_createcontextfromtype);
diff --git a/utests/setenv.sh.in b/utests/setenv.sh.in
new file mode 100644
index 0000000..ad77369
--- /dev/null
+++ b/utests/setenv.sh.in
@@ -0,0 +1,5 @@
+#!/bin/sh
+#
+export OCL_PCM_PATH=@LOCAL_PCM_OBJECT_DIR@
+export OCL_PCH_PATH=@LOCAL_PCH_OBJECT_DIR@
+export OCL_KERNEL_PATH=@CMAKE_CURRENT_SOURCE_DIR@/../kernels
diff --git a/utests/sub_buffer.cpp b/utests/sub_buffer.cpp
new file mode 100644
index 0000000..f65e8ff
--- /dev/null
+++ b/utests/sub_buffer.cpp
@@ -0,0 +1,135 @@
+#include "utest_helper.hpp"
+
+void sub_bufffer_check(void)
+{   /* Creates sub-buffers of one large parent buffer at many sizes/offsets and checks read and map access. */
+    cl_int error;
+    cl_ulong max_alloc_size;
+    cl_uint address_align;
+    cl_mem main_buf;
+    cl_mem sub_buf;
+    char *main_buf_content;
+    char sub_buf_content[32];
+
+    error = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_alloc_size), &max_alloc_size, NULL);
+    OCL_ASSERT(error == CL_SUCCESS);
+    error = clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(address_align ), &address_align, NULL );
+    OCL_ASSERT(error == CL_SUCCESS);
+
+    main_buf_content = (char *)malloc(sizeof(char) * max_alloc_size);
+    OCL_ASSERT(main_buf_content != NULL);
+    for (cl_ulong i = 0; i < max_alloc_size; i++) {
+        main_buf_content[i] = rand() & 63;
+    }
+
+    main_buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, max_alloc_size, main_buf_content, &error);
+    OCL_ASSERT(error == CL_SUCCESS);
+
+    /* Test reading through the sub buffer: the first 32 bytes must match the parent content at the same offset. */
+    for (cl_ulong sz = 64; sz < max_alloc_size; sz*=4) {
+        for (cl_ulong off = 0; off < max_alloc_size; off += 1234) {
+            cl_buffer_region region;
+            region.origin = off;
+            region.size = sz;
+
+            sub_buf = clCreateSubBuffer(main_buf, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &error );
+
+            /* Region runs past the end of the parent: creation must fail. */
+            if(off + sz > max_alloc_size) {
+                OCL_ASSERT(error != CL_SUCCESS);
+                continue;
+            }
+            /* Origin is not aligned to address_align: creation must fail. */
+            if(off & (address_align-1)) {
+                OCL_ASSERT(error != CL_SUCCESS);
+                continue;
+            }
+
+            OCL_ASSERT(error == CL_SUCCESS);
+
+            error = clEnqueueReadBuffer(queue, sub_buf, CL_TRUE, 0, 32, (void *)sub_buf_content, 0, NULL, NULL);
+            OCL_ASSERT(error == CL_SUCCESS);
+
+#if 0
+            printf("\nRead ########### Src buffer: \n");
+            for (int i = 0; i < 32; ++i)
+                printf(" %2.2u", main_buf_content[off + i]);
+
+            printf("\nRead ########### dst buffer: \n");
+            for (int i = 0; i < 32; ++i)
+                printf(" %2.2u", sub_buf_content[i]);
+            printf("\n");
+#endif
+            for (int i = 0; i < 32; ++i) {
+
+                if (main_buf_content[off + i] != sub_buf_content[i]) {
+                    printf ("different index is %d\n", i);
+                    OCL_ASSERT(0);
+                }
+            }
+            clReleaseMemObject(sub_buf);
+        }
+    }
+
+    /* Test mapping the sub buffer: bytes written through the parent at `off` must be visible in the mapping. */
+    for (cl_ulong sz = 64; sz < max_alloc_size; sz*=4) {
+        for (cl_ulong off = 0; off < max_alloc_size; off += 1234) {
+            cl_buffer_region region;
+            region.origin = off;
+            region.size = sz;
+
+            sub_buf = clCreateSubBuffer(main_buf, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &error );
+
+            /* Region runs past the end of the parent: creation must fail. */
+            if(off + sz > max_alloc_size) {
+                OCL_ASSERT(error != CL_SUCCESS);
+                continue;
+            }
+            /* Origin is not aligned to address_align: creation must fail. */
+            if(off & (address_align-1)) {
+                OCL_ASSERT(error != CL_SUCCESS);
+                continue;
+            }
+
+            OCL_ASSERT(error == CL_SUCCESS);
+
+            for (int i = 0; i < 32; i++) {
+                sub_buf_content[i] = rand() & 63;
+            }
+
+            error = clEnqueueWriteBuffer(queue, main_buf, CL_TRUE, off, 32, sub_buf_content, 0, NULL, NULL);
+            OCL_ASSERT(error == CL_SUCCESS);
+
+            void * mapped_ptr = clEnqueueMapBuffer(queue, sub_buf, CL_TRUE, (cl_map_flags)( CL_MAP_READ | CL_MAP_WRITE ),
+                    0, 32, 0, NULL, NULL, &error );
+            OCL_ASSERT(error == CL_SUCCESS);
+
+#if 0
+            printf("\nMap ########### Src buffer: \n");
+            for (int i = 0; i < 32; ++i)
+                printf(" %2.2u", sub_buf_content[i]);
+
+            printf("\nMap ########### dst buffer: \n");
+            for (int i = 0; i < 32; ++i)
+                printf(" %2.2u", ((char *)mapped_ptr)[i]);
+            printf("\n");
+#endif
+            for (int i = 0; i < 32; i++) {
+
+                if (((char *)mapped_ptr)[i] != sub_buf_content[i]) {
+                    printf ("different index is %d\n", i);
+                    OCL_ASSERT(0);
+                }
+            }
+
+            error = clEnqueueUnmapMemObject(queue, sub_buf, mapped_ptr, 0, NULL, NULL );
+            OCL_ASSERT(error == CL_SUCCESS);
+
+            clReleaseMemObject(sub_buf);
+        }
+    }
+
+    clReleaseMemObject(main_buf);
+    free(main_buf_content);
+}
+
+MAKE_UTEST_FROM_FUNCTION(sub_bufffer_check);
diff --git a/utests/utest.cpp b/utests/utest.cpp
index 18d10e8..718916f 100644
--- a/utests/utest.cpp
+++ b/utests/utest.cpp
@@ -32,7 +32,8 @@ using namespace std;
 vector<UTest> *UTest::utestList = NULL;
 void releaseUTestList(void) { delete UTest::utestList; }
 
-UTest::UTest(Function fn, const char *name, bool haveIssue) : fn(fn), name(name), haveIssue(haveIssue) {
+UTest::UTest(Function fn, const char *name, bool haveIssue, bool needDestroyProgram)
+       : fn(fn), name(name), haveIssue(haveIssue), needDestroyProgram(needDestroyProgram) {
   if (utestList == NULL) {
     utestList = new vector<UTest>;
     atexit(releaseUTestList);
@@ -40,8 +41,6 @@ UTest::UTest(Function fn, const char *name, bool haveIssue) : fn(fn), name(name)
   utestList->push_back(*this);
 }
 
-UTest::UTest(void) : fn(NULL), name(NULL), haveIssue(false) {}
-
 static bool strequal(const char *s1, const char *s2) {
   if (strcmp(s1, s2) == 0) return true;
   return false;
@@ -57,7 +56,7 @@ void UTest::run(const char *name) {
       std::cout << utest.name << ":" << std::endl;
       (utest.fn)();
       std::cout << std::endl;
-      cl_kernel_destroy();
+      cl_kernel_destroy(true);
       cl_buffer_destroy();
     }
   }
@@ -71,7 +70,7 @@ void UTest::runAll(void) {
     std::cout << utest.name << ":" << std::endl;
     (utest.fn)();
     std::cout << std::endl;
-    cl_kernel_destroy();
+    cl_kernel_destroy(utest.needDestroyProgram);
     cl_buffer_destroy();
   }
 }
@@ -84,7 +83,7 @@ void UTest::runAllNoIssue(void) {
     std::cout << utest.name << ":" << std::endl;
     (utest.fn)();
     std::cout << std::endl;
-    cl_kernel_destroy();
+    cl_kernel_destroy(utest.needDestroyProgram);
     cl_buffer_destroy();
   }
 }
diff --git a/utests/utest.hpp b/utests/utest.hpp
index d3a6a6f..01d4a8c 100644
--- a/utests/utest.hpp
+++ b/utests/utest.hpp
@@ -39,13 +39,15 @@ struct UTest
   /*! Empty test */
   UTest(void);
   /*! Build a new unit test and append it to the unit test list */
-  UTest(Function fn, const char *name, bool haveIssue = false);
+  UTest(Function fn, const char *name, bool haveIssue = false, bool needDestroyProgram = true);
   /*! Function to execute */
   Function fn;
   /*! Name of the test */
   const char *name;
   /*! Indicate whether current test cases has issue to be fixes */
   bool haveIssue;
+  /*! Indicate whether to destroy the program as well as the kernel after the test. */
+  bool needDestroyProgram;
   /*! The tests that are registered */
   static std::vector<UTest> *utestList;
   /*! Run the test with the given name */
@@ -61,6 +63,11 @@ struct UTest
 /*! Register a new unit test */
 #define UTEST_REGISTER(FN) static const UTest __##FN##__(FN, #FN);
 
+#define MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(FN, KEEP_PROGRAM) \
+  static void __ANON__##FN##__(void) { UTEST_EXPECT_SUCCESS(FN()); } \
+  static const UTest __##FN##__(__ANON__##FN##__, #FN, false, !(KEEP_PROGRAM));
+
+
 /*! Turn a function into a unit test */
 #define MAKE_UTEST_FROM_FUNCTION(FN) \
   static void __ANON__##FN##__(void) { UTEST_EXPECT_SUCCESS(FN()); } \
diff --git a/utests/utest_file_map.cpp b/utests/utest_file_map.cpp
index da3361c..55b7771 100644
--- a/utests/utest_file_map.cpp
+++ b/utests/utest_file_map.cpp
@@ -88,7 +88,7 @@ cl_file_map_open(cl_file_map_t *fm, const char *name)
 
   /* Open the file */
   fm->fd = open(name, O_RDONLY);
-  if(fm->fd <= 0) {
+  if(fm->fd < 0) {
     err = CL_FILE_MAP_FILE_NOT_FOUND;
     goto error;
   }
diff --git a/utests/utest_generator.py b/utests/utest_generator.py
new file mode 100644
index 0000000..626ac96
--- /dev/null
+++ b/utests/utest_generator.py
@@ -0,0 +1,374 @@
+#!/usr/bin/python
+import os,sys,re
+
+FLT_MAX_POSI='0x1.fffffep127f'
+FLT_MIN_NEGA='-0x1.fffffep127f'
+FLT_MIN_POSI='0x1.0p-126f'
+FLT_MAX_NEGA='-0x1.0p-126f'
+
+paraTypeList={'float':'%.20f','int':'%d','double':'%lf','uint':'%d','string':'%s'}
+
+
+def ulpUnit(ulpSize):
+  return re.findall(r"([a-zA-Z_]+)",ulpSize)[0]
+
+def ulpNum(ulpSize):
+  return re.findall(r"([0-9]+)",ulpSize)[0]
+
+def udebug(ulpSize,returnType):
+  #ulpUnit=re.findall(r"([a-zA-Z_]+)",ulpSize)[0]
+  #ulpNum=re.findall(r"([0-9]+)",ulpSize)[0]
+  text='''
+    static const char* INFORNAN;
+    static %s ULPSIZE;
+    
+    if (isinf(cpu_data[index])){
+      INFORNAN="INF";
+    }
+    else if (isnan(cpu_data[index])){
+      INFORNAN="NAN";
+    }
+    else{
+      ULPSIZE=cl_%s(cpu_data[index]) * %s;
+    }
+
+#if udebug 
+    if (isinf(cpu_data[index])){ 
+      if (isinf(gpu_data[index]))
+        printf("%s expect:%s\\n", log, INFORNAN);
+      else
+        printf_c("%s expect:%s\\n", log, INFORNAN);
+      }
+    else if (isnan(cpu_data[index])){
+      if (isnan(gpu_data[index]))
+        printf("%s expect:%s\\n", log, INFORNAN);
+      else
+        printf_c("%s expect:%s\\n", log, INFORNAN);
+      }
+    else if (diff <= ULPSIZE){
+      printf("%s expect:%s\\n", log, ULPSIZE);
+      }
+    else
+      printf_c("%s expect:%s\\n", log, ULPSIZE);
+#else
+    if (isinf(cpu_data[index])){
+      sprintf(log, "%s expect:%s\\n", log, INFORNAN);
+      OCL_ASSERTM(isinf(gpu_data[index]),log);
+      }
+    else if (isnan(cpu_data[index])){
+      sprintf(log, "%s expect:%s\\n", log, INFORNAN);
+      OCL_ASSERTM(isnan(gpu_data[index]),log);
+      }
+    else{
+      sprintf(log, "%s expect:%s\\n", log, ULPSIZE);
+      OCL_ASSERTM(fabs(gpu_data[index]-cpu_data[index]) <= ULPSIZE, log);
+      }
+#endif
+  }
+}\n'''%(returnType,\
+        ulpUnit(ulpSize),ulpNum(ulpSize),\
+        paraTypeList['string'],paraTypeList['string'],\
+        paraTypeList['string'],paraTypeList['string'],\
+        paraTypeList['string'],paraTypeList['string'],\
+        paraTypeList['string'],paraTypeList['string'],\
+        paraTypeList['string'],paraTypeList['%s'%(returnType)],\
+        paraTypeList['string'],paraTypeList['%s'%(returnType)],\
+        paraTypeList['string'],paraTypeList['string'],\
+        paraTypeList['string'],paraTypeList['string'],\
+        paraTypeList['string'],paraTypeList['%s'%(returnType)])
+
+  return text
+
+def gene2ValuesLoop(values1,values2,inputValues):
+  values2=values2+inputValues*len(inputValues)
+
+  for i in inputValues:
+    for j in range(0,len(inputValues)):
+      values1 += [i]
+
+  return values1,values2
+
+def gene3ValuesLoop(values1,values2,values3,inputValues):
+  for i in inputValues:
+    for j in range(0,len(inputValues)):
+      for k in range(0,len(inputValues)):
+        values1 += [i]
+
+  for i in inputValues:
+    for j in inputValues:
+      for k in range(0,len(inputValues)):
+        values2 += [j]
+
+  values3=inputValues*(len(inputValues)**2)
+  return values1,values2,values3
+
+class func:
+  """ Generates, for one builtin function, a C++ unit-test file and an OpenCL kernel for every type variant listed in inputType[0]. """
+
+  def __init__(self,name,cpuFuncName,inputType,outputType,values,ulp, cpu_func=''):
+    self.funcName = name  # function called inside the generated kernel
+    self.cpuFuncName = cpuFuncName  # function called by the generated cpu_compiler_math()
+    self.fileName = 'builtin_'+name  # prefix of generated file, test and kernel names
+    self.inputtype = inputType  # one list of type names per parameter
+    self.outputtype = outputType  # return type name per variant
+    self.values = values  # one list of input values per parameter
+    self.ulp = ulp  # tolerance spec such as '4 * FLT_ULP'
+    self.cpufunc=cpu_func  # optional C source inserted before cpu_compiler_math()
+    self.cpplines = []
+    
+#####Header text placed at the top of every generated .cpp file:
+    self.Head='''/*
+This file is generated by utest_generator.py.
+Usually you need NOT modify this file manually.
+But when any bug occured, you can change the value of udebug from 0 to 1,
+which can print more values and information to assist debuging the issue.
+*/
+
+#include "utest_helper.hpp"
+#include <stdio.h>
+#include <math.h>
+#include <algorithm>
+
+#define udebug 0
+#define FLT_MAX 0x1.fffffep127f
+#define FLT_MIN 0x1.0p-126f
+#define INT_ULP 0
+
+#define printf_c(...) \\
+{\\
+  printf("\\033[1m\\033[40;31m");\\
+  printf( __VA_ARGS__ );\\
+  printf("\\033[0m");\\
+}
+'''
+    #########Constructing an instance immediately generates all of its files
+    self.geneProcess()
+
+#####Computer vector && argument type:
+  def argtype(self,paraN,index):
+    return re.findall(r"[a-zA-Z_]+",self.inputtype[paraN][index])[0]
+
+  def argvector(self,paraN,index):
+    vector=re.findall(r"[0-9]+",self.inputtype[paraN][index])
+    if vector:
+      vector=vector[0]
+    else:
+      vector=1
+    return vector
+
+  def returnVector(self,index):
+    returnVector=re.findall(r"[0-9]+",self.outputtype[index])
+    if returnVector:
+      returnVector=returnVector[0]
+    else:
+      returnVector=1
+    return returnVector
+
+  def retType(self,index):
+    return re.findall("[a-zA-Z_]+",self.outputtype[index])[0]
+
+  def inputNumFormat(self,paraN,index):
+    return paraTypeList['%s'%(self.argtype(paraN,index))]
+
+  def outputNumFormat(self,index):
+    return paraTypeList['%s'%(self.retType(index))]
+
+#####Cpu values analyse
+  def GenInputValues(self,index):
+    #namesuffix=self.inputtype[0][index]
+    for i in range(0,self.values.__len__()):
+      self.cpplines += [ "const %s input_data%d[] = {%s};" %(self.argtype(i,index),i+1,str(self.values[i]).strip('[]').replace('\'','')) ]
+    self.cpplines += [ "const int count_input = sizeof(input_data1) / sizeof(input_data1[0]);" ]
+    self.cpplines += [ "const int vector = %s;\n"%(self.argvector(self.inputtype.__len__()-1,index)) ]
+
+#####Cpu Function
+  def GenCpuCompilerMath(self,index):
+    # Appends to self.cpplines a C wrapper cpu_compiler_math(dst, src1..srcN) that calls cpuFuncName.
+    defline='static void cpu_compiler_math(%s *dst, '%(self.retType(index))
+    cpufunargs='('
+    funcline = ['{']
+    vectorargs=[]
+
+    if (self.returnVector(index) == 1 and self.argvector(0,index) != 1):  # scalar return type with a vector first parameter
+      for i in range(0,self.values.__len__()):
+        defline += 'const %s *src%d'%(self.argtype(i,index),i+1)
+        defline += ( i == self.values.__len__()-1 ) and ')' or ','
+        vectorargs.append('(')
+      for i in range(0,self.values.__len__()):
+        for j in range(0,self.vector):  # NOTE(review): self.vector is not assigned anywhere in the visible class - confirm
+          vectorargs += "x%d%d"%(i+1,j+1)
+          vectorargs += ( j == self.vector-1 ) and ');' or ','
+          funcline += ["  const %s x%d%d = *(src%d+%d);"%(self.argtype(i,index),i+1,j+1,i+1,j)]
+
+      return 0  # leaves without appending anything to self.cpplines
+
+    for i in range(0,self.values.__len__()):
+      defline += 'const %s *src%d'%(self.argtype(i,index),i+1)
+      defline += ( i == self.values.__len__()-1 ) and ')' or ','
+      cpufunargs += "x%d"%(i+1)
+      cpufunargs += ( i == self.values.__len__()-1 ) and ');' or ','
+      funcline += ["  const %s x%d = *src%d;"%(self.argtype(i,index),i+1,i+1)]
+
+    funcline += [ "  dst[0] = %s%s"%(self.cpuFuncName, cpufunargs) ]
+    funcline += [ '}'] 
+
+    funcline = [defline] + funcline  # signature line goes first, before the '{'
+
+    self.cpplines += funcline
+#    self.writeCPP( '\n'.join(funcline), 'a', namesuffix)
+
+  def writeCPP(self,content,authority,namesuffix):
+    file_object = open("generated/%s_%s.cpp"%(self.fileName,namesuffix),authority)
+    file_object.writelines(content)
+    file_object.close()
+
+  def writeCL(self,content,authority,namesuffix):
+    file_object = open(os.getcwd()+"/../kernels/%s_%s.cl"%(self.fileName,namesuffix),authority)
+    file_object.writelines(content)
+    file_object.close()
+
+  def nameForCmake(self,content,namesuffix):  # prints the generated .cpp path to stdout; content is not used
+    print("generated/%s_%s.cpp"%(self.fileName,namesuffix)),  # trailing comma: as a Python 2 print statement this suppresses the newline
+
+  def utestFunc(self,index):
+    funcLines=[]
+    namesuffix=self.inputtype[0][index]
+    funcline=[]
+    funchead='''
+static void %s_%s(void)
+{
+  int index;
+  %s gpu_data[count_input] = {0}, cpu_data[count_input] = {0}, diff=0.0;
+  char log[1024] = {0};
+
+  OCL_CREATE_KERNEL(\"%s_%s\");
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, count_input * sizeof(%s), NULL); 
+
+  globals[0] = count_input;
+  locals[0] = 1;
+ '''%(self.fileName,namesuffix,\
+     self.retType(index),\
+     self.fileName, namesuffix,\
+     self.retType(index))
+
+    funcline += [funchead]
+    for i in range(1,self.values.__len__()+1): 
+      funcline += ["  OCL_CREATE_BUFFER(buf[%d], CL_MEM_READ_WRITE, count_input * sizeof(%s), NULL);"%(i,self.argtype(i-1,index))]
+      funcline += ["  clEnqueueWriteBuffer( queue, buf[%d], CL_TRUE, 0, count_input * sizeof(%s), input_data%d, 0, NULL, NULL);"%(i,self.argtype(i-1,index),i)]
+
+    funcline += ["  OCL_CREATE_BUFFER(buf[%d], CL_MEM_READ_WRITE, sizeof(int), NULL);"%(self.inputtype.__len__()+1)]
+    funcline += ["  clEnqueueWriteBuffer( queue, buf[%d], CL_TRUE, 0, sizeof(int), &vector, 0, NULL, NULL);"%(self.inputtype.__len__()+1)]
+
+	#0=output 1=input1 2=input2 ... len+2=output
+    for i in range(0,self.values.__len__()+2): 
+      funcline += ["  OCL_SET_ARG(%d, sizeof(cl_mem), &buf[%d]);"%(i,i)]
+
+    funcrun='''
+  // Run the kernel:
+  OCL_NDRANGE( 1 );
+  clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(%s) * count_input, gpu_data, 0, NULL, NULL);
+'''%(self.inputtype.__len__()+1)
+    funcline += [ funcrun ]
+
+    funcsprintfa='    sprintf(log, \"'
+    funcsprintfb=''
+    if (self.returnVector(index) == 1 and self.argvector(0,index) != 1):
+      funccompare='''
+  for (index = 0; index < count_input/vector; index++)
+  {
+    cpu_compiler_math( cpu_data + index, '''
+    else:
+      funccompare='''
+  for (index = 0; index < count_input; index++)
+  {
+    cpu_compiler_math( cpu_data + index,'''
+
+    for i in range(0,self.values.__len__()):
+      funccompare += " input_data%d + index"%(i+1)
+      funccompare += (self.values.__len__() - 1 == i) and ');' or ','
+
+      funcsprintfa += "input_data%d:"%(i+1)
+      funcsprintfa += "%s "%(self.inputNumFormat(i,index))
+      funcsprintfb += " input_data%d[index],"%(i+1)
+
+    funcline += [ funccompare ]
+
+    funcsprintfa += " -> gpu:%s  cpu:%s diff:%s\","%(self.outputNumFormat(index),self.outputNumFormat(index),self.outputNumFormat(index))#,self.outputNumFormat(index))
+    funcsprintfb += " gpu_data[index], cpu_data[index], diff);"#%(ulpUnit(self.ulp),ulpNum(self.ulp))
+
+    #funcdiff = "    diff = fabs((gpu_data[index]-cpu_data[index])"
+    #funcdiff += (self.retType(index) == "int") and ');' or '/(cpu_data[index]>1?cpu_data[index]:1));'
+    funcdiff = "    diff = fabs((gpu_data[index]-cpu_data[index]));"
+    funcline += [ funcdiff ]
+    funcline += [ funcsprintfa + funcsprintfb ]
+
+    self.cpplines += funcline
+
+    self.cpplines += [ udebug(self.ulp,self.retType(index)) ]
+    self.cpplines += [ "MAKE_UTEST_FROM_FUNCTION(%s_%s)"%(self.fileName,namesuffix) ]
+
+  def genCL(self,index):
+    namesuffix=self.inputtype[0][index]
+    clLine = []
+    clhead = '__kernel void %s_%s(__global %s *dst, '%(self.fileName,namesuffix,self.retType(index))
+    clvalueDef=''
+    clcomputer=''
+    tmp=''
+
+    for i in range(0,self.values.__len__()):
+      clhead += ' __global %s *src%d,'%(self.argtype(i,index),i+1)
+      clvalueDef +=   '  %s x%d = (%s) ('%(self.inputtype[i][index],i+1,self.inputtype[i][index])
+      tmp = 'src%d[i * (*vector) + '%(i+1)
+      for j in range(0,int(self.argvector(i,index))):
+        clvalueDef += tmp + ((int(self.argvector(i-1,index)) == j+1 ) and '%d]);\n'%(j) or '%d],'%(j))
+      clcomputer += (self.values.__len__() == i+1) and 'x%d);'%(i+1) or 'x%d,'%(i+1)
+      
+    clhead += ' __global int *vector) {\n'
+    clhead += '  int i = get_global_id(0);'
+    clLine += [ clhead ]
+    clLine += [ clvalueDef ]
+    clLine += [ '  %s ret;'%(self.outputtype[index]) ]
+    clLine += [ '  ret = %s('%(self.funcName) + clcomputer ] 
+
+    if (int(self.returnVector(index)) == 1):
+      clLine += [ '  dst[i] = ret;' ]
+    else:
+      for i in range(0,int(self.returnVector(index))):
+        clLine += [ '  dst[i * (*vector) + %d] = ret[%d];'%(i,i) ]
+    clLine += [ '};' ]
+
+    self.writeCL('\n'.join(clLine),'w',namesuffix)
+  
+  def geneProcess(self):
+    for i in range(0,self.inputtype[0].__len__()):
+##########Write Cpp file          
+      namesuffix=self.inputtype[0][i]
+      self.cpplines = []
+      #The head:
+      self.cpplines += [self.Head]
+
+      #Parameters:
+      self.GenInputValues(i)
+
+      #cpu function generator:
+      self.cpplines += [self.cpufunc]
+
+      #Cpu function:
+      self.GenCpuCompilerMath(i)
+
+      #utest function
+      self.utestFunc(i)
+
+      #kernel cl
+      self.genCL(i)
+
+      #CMakelists.txt
+      self.nameForCmake(self.fileName,namesuffix)
+
+      self.writeCPP( '\n'.join(self.cpplines) ,'w',namesuffix)
+#########End
+
+#def main():
+#
+#if __name__ == "__main__":
+#  main()
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index 8089799..91633f0 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -237,32 +237,38 @@ cl_kernel_init(const char *file_name, const char *kernel_name, int format, const
   cl_file_map_t *fm = NULL;
   char *ker_path = NULL;
   cl_int status = CL_SUCCESS;
+  static const char *prevFileName = NULL;
 
   /* Load the program and build it */
-  ker_path = cl_do_kiss_path(file_name, device);
-  if (format == LLVM)
-    program = clCreateProgramWithLLVMIntel(ctx, 1, &device, ker_path, &status);
-  else if (format == SOURCE) {
-    cl_file_map_t *fm = cl_file_map_new();
-    FATAL_IF (cl_file_map_open(fm, ker_path) != CL_FILE_MAP_SUCCESS,
-              "Failed to open file \"%s\" with kernel \"%s\". Did you properly set OCL_KERNEL_PATH variable?",
-              file_name, kernel_name);
-    const char *src = cl_file_map_begin(fm);
-    const size_t sz = cl_file_map_size(fm);
-    program = clCreateProgramWithSource(ctx, 1, &src, &sz, &status);
-    cl_file_map_delete(fm);
-  } else
-    FATAL("Not able to create program from binary");
-
-  if (status != CL_SUCCESS) {
-    fprintf(stderr, "error calling clCreateProgramWithBinary\n");
-    goto error;
+  if (!program || (program && (!prevFileName || strcmp(prevFileName, file_name)))) {
+    if (program) clReleaseProgram(program);
+    ker_path = cl_do_kiss_path(file_name, device);
+    if (format == LLVM)
+      program = clCreateProgramWithLLVMIntel(ctx, 1, &device, ker_path, &status);
+    else if (format == SOURCE) {
+      cl_file_map_t *fm = cl_file_map_new();
+      FATAL_IF (cl_file_map_open(fm, ker_path) != CL_FILE_MAP_SUCCESS,
+                "Failed to open file \"%s\" with kernel \"%s\". Did you properly set OCL_KERNEL_PATH variable?",
+                file_name, kernel_name);
+      const char *src = cl_file_map_begin(fm);
+      const size_t sz = cl_file_map_size(fm);
+      program = clCreateProgramWithSource(ctx, 1, &src, &sz, &status);
+      cl_file_map_delete(fm);
+    } else
+      FATAL("Not able to create program from binary");
+
+    if (status != CL_SUCCESS) {
+      fprintf(stderr, "error calling clCreateProgramWithBinary\n");
+      goto error;
+    }
+    prevFileName = file_name;
   }
-
   /* OCL requires to build the program even if it is created from a binary */
   OCL_CALL (clBuildProgram, program, 1, &device, build_opt, NULL, NULL);
 
   /* Create a kernel from the program */
+  if (kernel)
+    clReleaseKernel(kernel);
   kernel = clCreateKernel(program, kernel_name, &status);
   if (status != CL_SUCCESS) {
     fprintf(stderr, "error calling clCreateKernel\n");
@@ -274,6 +280,7 @@ exit:
   cl_file_map_delete(fm);
   return status;
 error:
+  prevFileName = NULL;
   goto exit;
 }
 
@@ -409,12 +416,16 @@ error:
 }
 
 void
-cl_kernel_destroy(void)
+cl_kernel_destroy(bool needDestroyProgram)
 {
-  if (kernel) clReleaseKernel(kernel);
-  if (program) clReleaseProgram(program);
-  kernel = NULL;
-  program = NULL;
+  if (kernel) {
+    clReleaseKernel(kernel);
+    kernel = NULL;
+  }
+  if (needDestroyProgram && program) {
+    clReleaseProgram(program);
+    program = NULL;
+  }
 }
 
 void
@@ -637,3 +648,34 @@ int cl_check_image(const int *img, int w, int h, const char *bmp)
   return (float(discrepancy) / float(n) > max_error_ratio) ? 0 : 1;
 }
 
+/* Bit-fields meant to overlay a 32-bit float; their ordering is ABI-dependent. */
+typedef struct {
+  unsigned int mantissa:23;
+  unsigned int exponent:8;
+  unsigned int sign:1;
+} FLOAT;
+
+/* One float viewed as a value (f), a raw word (i) or bit-fields (spliter). */
+typedef union {
+  float f;
+  unsigned int i;
+  FLOAT spliter;
+} SF;
+
+/* Gap between the lowest float sharing float_number's exponent and the next float up. */
+const float cl_FLT_ULP(float float_number)
+{
+  SF floatBin, ulpBin, ulpBinBase;
+  floatBin.f = float_number;
+
+  /* Mask the raw word rather than use bit-fields, whose allocation order
+   * is implementation-defined; 0x7f800000 keeps only the exponent bits. */
+  ulpBinBase.i = floatBin.i & 0x7f800000u;
+  ulpBin.i = ulpBinBase.i | 0x1u;
+  return ulpBin.f - ulpBinBase.f;
+}
+
+const int cl_INT_ULP(int int_number)
+{
+  return 0; /* int_number is ignored: the result is always zero */
+}
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index 29a21d5..0937bf2 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -65,6 +65,11 @@ extern EGLSurface  eglSurface;
     OCL_CALL (cl_kernel_init, NAME".cl", NAME, SOURCE, NULL); \
   } while (0)
 
+#define OCL_DESTROY_KERNEL_KEEP_PROGRAM(KEEP_PROGRAM) \
+  do { \
+    cl_kernel_destroy(!(KEEP_PROGRAM)); \
+  } while(0)
+
 #define OCL_CREATE_KERNEL_FROM_FILE(FILE_NAME, KERNEL_NAME) \
   do { \
     OCL_CALL(cl_kernel_init, FILE_NAME".cl", KERNEL_NAME, SOURCE, NULL); \
@@ -199,7 +204,7 @@ extern void cl_buffer_destroy(void);
 extern void cl_ocl_destroy(void);
 
 /* Release kernel and program */
-extern void cl_kernel_destroy(void);
+extern void cl_kernel_destroy(bool needDestroyProgram = true);
 
 /* Release everything allocated in cl_test_init */
 extern void cl_test_destroy(void);
@@ -216,5 +221,11 @@ extern void cl_write_bmp(const int *data, int width, int height, const char *fil
 /* Check data from img against bmp file located at "bmp" */
 extern int cl_check_image(const int *img, int w, int h, const char *bmp);
 
+/* Calculate the ULP (unit in the last place) of a float value */
+extern const float cl_FLT_ULP(float float_number);
+
+/* Calculate the ULP (unit in the last place) of an int value */
+extern const int cl_INT_ULP(int int_number);
+
 #endif /* __UTEST_HELPER_HPP__ */
 
diff --git a/utests/utest_math_gen.py b/utests/utest_math_gen.py
new file mode 100755
index 0000000..f268739
--- /dev/null
+++ b/utests/utest_math_gen.py
@@ -0,0 +1,519 @@
+#!/usr/bin/python
+from utest_generator import *
+import os,sys
+
+#base_input_values = [80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0,6,-6,1500.24,-1500.24]
+#extend_input_values = [FLT_MAX_POSI,FLT_MIN_NEGA,FLT_MIN_POSI,FLT_MAX_NEGA,80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0,6,-6,1500.24,-1500.24]
+
+#func:
+#    gpufuncName 
+#    cpuFuncName
+#    fileName: 'builtin_'+name
+#    inputtype: a 2-D list because there're more than one input data
+#    outputtype: a list
+#    values
+#    ulp
+
+base_input_values = [ 0, 1, 3.14]
+def main():
+  ##### gentype acos(gentype)
+  acos_input_values = base_input_values
+  acos_input_type = ['float','float2','float4','float8','float16']
+  acos_output_type = ['float','float2','float4','float8','float16']
+  acosUtests = func('acos','acos',[acos_input_type],acos_output_type,[acos_input_values],'4 * FLT_ULP')
+  
+  ##### gentype acosh(gentype)
+  acosh_input_values = base_input_values
+  acosh_input_type = ['float','float2','float4','float8','float16']
+  acosh_output_type = ['float','float2','float4','float8','float16']
+  acoshUtests = func('acosh','acosh',[acosh_input_type],acosh_output_type,[acosh_input_values],'4 * FLT_ULP')
+  
+  ##### gentype acospi(gentype x)
+  acospi_input_values = base_input_values
+  acospi_input_type = ['float','float2','float4','float8','float16']
+  acospi_output_type = ['float','float2','float4','float8','float16']
+  acospi_cpu_func='''
+static float acospi(float x){
+  return acos(x)/M_PI;
+} '''
+  acospiUtests = func('acospi','acospi',[acospi_input_type],acospi_output_type,[acospi_input_values],'4 * FLT_ULP',acospi_cpu_func)
+  
+  ##### gentype asin(gentype)
+  asin_input_values = base_input_values
+  asin_input_type = ['float','float2','float4','float8','float16']
+  asin_output_type = ['float','float2','float4','float8','float16']
+  asinUtests = func('asin','asin',[asin_input_type],asin_output_type,[asin_input_values],'4 * FLT_ULP')
+  
+  ##### gentype asinh(gentype)
+  asinh_input_values = base_input_values
+  asinh_input_type = ['float','float2','float4','float8','float16']
+  asinh_output_type = ['float','float2','float4','float8','float16']
+  asinhUtests = func('asinh','asinh',[asinh_input_type],asinh_output_type,[asinh_input_values],'4 * FLT_ULP')
+  
+  ##### gentype asinpi(gentype x)
+  asinpi_input_values = base_input_values
+  asinpi_input_type = ['float','float2','float4','float8','float16']
+  asinpi_output_type = ['float','float2','float4','float8','float16']
+  asinpi_cpu_func='''
+static float asinpi(float x){
+  return asin(x)/M_PI;
+} '''
+  asinpiUtests = func('asinpi','asinpi',[asinpi_input_type],asinpi_output_type,[asinpi_input_values],'4 * FLT_ULP',asinpi_cpu_func)
+  
+  ##### gentype atan(gentype y_over_x)
+  atan_input_values = base_input_values
+  atan_input_type = ['float','float2','float4','float8','float16']
+  atan_output_type = ['float','float2','float4','float8','float16']
+  atanUtests = func('atan','atan',[atan_input_type],atan_output_type,[atan_input_values],'5 * FLT_ULP')
+  
+  ##### gentype atan2(gentype y, gentype x)
+  atan2_base_values = base_input_values
+  atan2_input_values1 = []
+  atan2_input_values2 = []
+  atan2_input_values1,atan2_input_values2=gene2ValuesLoop(atan2_input_values1,atan2_input_values2,atan2_base_values)
+  atan2_input_type1 = ['float','float2','float4','float8','float16']
+  atan2_input_type2 = ['float','float2','float4','float8','float16']
+  atan2_output_type = ['float','float2','float4','float8','float16']
+  atan2Utests = func('atan2','atan2',[atan2_input_type1,atan2_input_type2],atan2_output_type,[atan2_input_values1,atan2_input_values2],'6 * FLT_ULP')
+  
+  ##### gentype atanh(gentype)
+  atanh_input_values = base_input_values
+  atanh_input_type = ['float','float2','float4','float8','float16']
+  atanh_output_type = ['float','float2','float4','float8','float16']
+  atanhUtests = func('atanh','atanh',[atanh_input_type],atanh_output_type,[atanh_input_values],'5 * FLT_ULP')
+  
+  ##### gentype atanpi(gentype x)
+  atanpi_input_values = base_input_values
+  atanpi_input_type = ['float','float2','float4','float8','float16']
+  atanpi_output_type = ['float','float2','float4','float8','float16']
+  atanpi_cpu_func='''
+static float atanpi(float x){
+  return atan(x)/M_PI;
+} '''
+  atanpiUtests = func('atanpi','atanpi',[atanpi_input_type],atanpi_output_type,[atanpi_input_values],'4 * FLT_ULP',atanpi_cpu_func)
+  
+#  ##### gentype atan2pi(gentype y, gentype x)
+#  atan2pi_base_values = base_input_values
+#  atan2pi_input_values1 = []
+#  atan2pi_input_values2 = []
+#  atan2pi_input_values1,atan2pi_input_values2=gene2ValuesLoop(atan2pi_input_values1,atan2pi_input_values2,atan2pi_base_values)
+#  atan2pi_input_type1 = ['float','float2','float4','float8','float16']
+#  atan2pi_input_type2 = ['float','float2','float4','float8','float16']
+#  atan2pi_output_type = ['float','float2','float4','float8','float16']
+#  atan2pi_cpu_func='''
+#static float atan2pi(float y, float x){
+#  return atan2(y,x)/M_PI;
+#} '''
+#  atan2piUtests = func('atan2pi','atan2pi',[atan2pi_input_type1,atan2pi_input_type2],atan2pi_output_type,[atan2pi_input_values1,atan2pi_input_values2],'6 * FLT_ULP',atan2pi_cpu_func)
+  
+  ##### gentype cbrt(gentype)
+  cbrt_input_values = base_input_values
+  cbrt_input_type = ['float','float2','float4','float8','float16']
+  cbrt_output_type = ['float','float2','float4','float8','float16']
+  cbrtUtests = func('cbrt','cbrt',[cbrt_input_type],cbrt_output_type,[cbrt_input_values],'4 * FLT_ULP')
+  
+  ##### gentype ceil(gentype)
+  ceil_input_values = base_input_values
+  ceil_input_type = ['float','float2','float4','float8','float16']
+  ceil_output_type = ['float','float2','float4','float8','float16']
+  ceilUtests = func('ceil','ceil',[ceil_input_type],ceil_output_type,[ceil_input_values],'0 * FLT_ULP')
+  
+  ##### gentype copysign(gentype x, gentype y)
+  copysign_base_values = base_input_values
+  copysign_input_values1 = []
+  copysign_input_values2 = []
+  copysign_input_values1,copysign_input_values2=gene2ValuesLoop(copysign_input_values1,copysign_input_values2,copysign_base_values)
+  copysign_input_type1 = ['float','float2','float4','float8','float16']
+  copysign_input_type2 = ['float','float2','float4','float8','float16']
+  copysign_output_type = ['float','float2','float4','float8','float16']
+  copysignUtests = func('copysign','copysign',[copysign_input_type1,copysign_input_type2],copysign_output_type,[copysign_input_values1,copysign_input_values2],'0 * FLT_ULP')
+  
+  ##### gentype cos(gentype)
+  cos_input_values = base_input_values
+  cos_input_type = ['float','float2','float4','float8','float16']
+  cos_output_type = ['float','float2','float4','float8','float16']
+  cosUtests = func('cos','cos',[cos_input_type],cos_output_type,[cos_input_values],'4 * FLT_ULP')
+  
+  ##### gentype cosh(gentype)
+  cosh_input_values = base_input_values
+  cosh_input_type = ['float','float2','float4','float8','float16']
+  cosh_output_type = ['float','float2','float4','float8','float16']
+  coshUtests = func('cosh','cosh',[cosh_input_type],cosh_output_type,[cosh_input_values],'4 * FLT_ULP')
+  
+  ##### gentype cospi(gentype x)
+  cospi_input_values = base_input_values
+  cospi_input_type = ['float','float2','float4','float8','float16']
+  cospi_output_type = ['float','float2','float4','float8','float16']
+  cospi_cpu_func='''
+static float cospi(float x){
+  return cos(M_PI * x);
+} '''
+  cospiUtests = func('cospi','cospi',[cospi_input_type],cospi_output_type,[cospi_input_values],'2 * FLT_ULP',cospi_cpu_func)
+  
+  ##### gentype erf(gentype)
+  erf_input_values = base_input_values
+  erf_input_type = ['float','float2','float4','float8','float16']
+  erf_output_type = ['float','float2','float4','float8','float16']
+  erfUtests = func('erf','erf',[erf_input_type],erf_output_type,[erf_input_values],'16 * FLT_ULP')
+  
+  ##### gentype erfc(gentype)
+  erfc_input_values = base_input_values
+  erfc_input_type = ['float','float2','float4','float8','float16']
+  erfc_output_type = ['float','float2','float4','float8','float16']
+  erfcUtests = func('erfc','erfc',[erfc_input_type],erfc_output_type,[erfc_input_values],'16 * FLT_ULP')
+  
+  ##### gentype exp(gentype x)
+  exp_input_values = base_input_values
+  exp_input_type = ['float','float2','float4','float8','float16']
+  exp_output_type = ['float','float2','float4','float8','float16']
+  expUtests = func('exp','exp',[exp_input_type],exp_output_type,[exp_input_values],'4 * FLT_ULP')
+  
+  ##### gentype exp2(gentype)
+  exp2_input_values = base_input_values
+  exp2_input_type = ['float','float2','float4','float8','float16']
+  exp2_output_type = ['float','float2','float4','float8','float16']
+  exp2Utests = func('exp2','exp2',[exp2_input_type],exp2_output_type,[exp2_input_values],'4 * FLT_ULP')
+  
+  ##### gentype exp10(gentype)
+  exp10_input_values = base_input_values
+  exp10_input_type = ['float','float2','float4','float8','float16']
+  exp10_output_type = ['float','float2','float4','float8','float16']
+  exp10Utests = func('exp10','exp10',[exp10_input_type],exp10_output_type,[exp10_input_values],'4 * FLT_ULP')
+  
+  ##### gentype expm1(gentype x)
+  expm1_input_values = base_input_values
+  expm1_input_type = ['float','float2','float4','float8','float16']
+  expm1_output_type = ['float','float2','float4','float8','float16']
+  expm1Utests = func('expm1','expm1',[expm1_input_type],expm1_output_type,[expm1_input_values],'4 * FLT_ULP')
+  
+  ##### gentype fabs(gentype)
+  fabs_input_values = base_input_values
+  fabs_input_type = ['float','float2','float4','float8','float16']
+  fabs_output_type = ['float','float2','float4','float8','float16']
+  fabsUtests = func('fabs','fabs',[fabs_input_type],fabs_output_type,[fabs_input_values],'0 * FLT_ULP')
+  
+  ##### gentype fdim(gentype x, gentype y)
+  fdim_base_values = base_input_values
+  fdim_input_values1 = []
+  fdim_input_values2 = []
+  fdim_input_values1,fdim_input_values2=gene2ValuesLoop(fdim_input_values1,fdim_input_values2,fdim_base_values)
+  fdim_input_type1 = ['float','float2','float4','float8','float16']
+  fdim_input_type2 = ['float','float2','float4','float8','float16']
+  fdim_output_type = ['float','float2','float4','float8','float16']
+  fdimUtests = func('fdim','fdim',[fdim_input_type1,fdim_input_type2],fdim_output_type,[fdim_input_values1,fdim_input_values2],'0 * FLT_ULP')
+  
+  ##### gentype floor(gentype)
+  floor_input_values = base_input_values
+  floor_input_type = ['float','float2','float4','float8','float16']
+  floor_output_type = ['float','float2','float4','float8','float16']
+  floorUtests = func('floor','floor',[floor_input_type],floor_output_type,[floor_input_values],'0 * FLT_ULP')
+  
+  ##### gentype fmax(gentype x, gentype y)
+  fmax_base_values = base_input_values
+  fmax_input_values1 = []
+  fmax_input_values2 = []
+  fmax_input_values1,fmax_input_values2=gene2ValuesLoop(fmax_input_values1,fmax_input_values2,fmax_base_values)
+  fmax_input_type1 = ['float','float2','float4','float8','float16']
+  fmax_input_type2 = ['float','float2','float4','float8','float16']
+  fmax_output_type = ['float','float2','float4','float8','float16']
+  fmaxUtests = func('fmax','fmax',[fmax_input_type1,fmax_input_type2],fmax_output_type,[fmax_input_values1,fmax_input_values2],'0 * FLT_ULP')
+  
+  ##### gentypef fmax(gentypef x, float y)
+#  fmax_gentypef_base_values = base_input_values
+#  fmax_gentypef_input_values1 = []
+#  fmax_gentypef_input_values2 = []
+#  fmax_gentypef_input_values2,fmax_gentypef_input_values1=gene2ValuesLoop(fmax_gentypef_input_values1,fmax_gentypef_input_values2,fmax_gentypef_base_values)
+#  fmax_gentypef_input_type1 = ['float','float2','float4','float8','float16']
+#  fmax_gentypef_input_type2 = ['float','float','float','float','float']
+#  fmax_gentypef_output_type = ['float','float2','float4','float8','float16']
+#  ##### gentypef fmax(gentypef x, float y)
+#  fmax_gentypefUtests = func('gentypef_fmax','gentypef_fmax',[fmax_gentypef_input_type1,fmax_gentypef_input_type2],fmax_gentypef_output_type,[fmax_gentypef_input_values1,fmax_gentypef_input_values2],'0 * FLT_ULP')
+  
+  ##### gentype fmin(gentype x, gentype y)
+  fmin_base_values = base_input_values
+  fmin_input_values1 = []
+  fmin_input_values2 = []
+  fmin_input_values1,fmin_input_values2=gene2ValuesLoop(fmin_input_values1,fmin_input_values2,fmin_base_values)
+  fmin_input_type1 = ['float','float2','float4','float8','float16']
+  fmin_input_type2 = ['float','float2','float4','float8','float16']
+  fmin_output_type = ['float','float2','float4','float8','float16']
+  fminUtests = func('fmin','fmin',[fmin_input_type1,fmin_input_type2],fmin_output_type,[fmin_input_values1,fmin_input_values2],'0 * FLT_ULP')
+  
+#  ##### gentypef fmin(gentypef x, float y)
+#  fmin_gentypef_base_values = base_input_values
+#  fmin_gentypef_input_values1 = []
+#  fmin_gentypef_input_values2 = []
+#  fmin_gentypef_input_values2,fmin_gentypef_input_values1=gene2ValuesLoop(fmin_gentypef_input_values1,fmin_gentypef_input_values2,fmin_gentypef_base_values)
+#  fmin_gentypef_input_type1 = ['float','float2','float4','float8','float16']
+#  fmin_gentypef_input_type2 = ['float','float','float','float','float']
+#  fmin_gentypef_output_type = ['float','float2','float4','float8','float16']
+#  ##### gentypef fmin(gentypef x, float y)
+#  fmin_gentypefUtests = func('gentypef_fmin','gentypef_fmin',[fmin_gentypef_input_type1,fmin_gentypef_input_type2],fmin_gentypef_output_type,[fmin_gentypef_input_values1,fmin_gentypef_input_values2],'0 * FLT_ULP')
+#  
+  ##### gentype fmod(gentype x, gentype y)
+  fmod_base_values = base_input_values
+  fmod_input_values1 = []
+  fmod_input_values2 = []
+  fmod_input_values1,fmod_input_values2=gene2ValuesLoop(fmod_input_values1,fmod_input_values2,fmod_base_values)
+  fmod_input_type1 = ['float','float2','float4','float8','float16']
+  fmod_input_type2 = ['float','float2','float4','float8','float16']
+  fmod_output_type = ['float','float2','float4','float8','float16']
+  fmodUtests = func('fmod','fmod',[fmod_input_type1,fmod_input_type2],fmod_output_type,[fmod_input_values1,fmod_input_values2],'0 * FLT_ULP')
+  
+  ##### gentype hypot(gentype x, gentype y)
+  hypot_base_values = base_input_values
+  hypot_input_values1 = []
+  hypot_input_values2 = []
+  hypot_input_values1,hypot_input_values2=gene2ValuesLoop(hypot_input_values1,hypot_input_values2,hypot_base_values)
+  hypot_input_type1 = ['float','float2','float4','float8','float16']
+  hypot_input_type2 = ['float','float2','float4','float8','float16']
+  hypot_output_type = ['float','float2','float4','float8','float16']
+  hypotUtests = func('hypot','hypot',[hypot_input_type1,hypot_input_type2],hypot_output_type,[hypot_input_values1,hypot_input_values2],'4 * FLT_ULP')
+  
+  ##### intn ilogb(floatn x)
+  ilogb_input_values = base_input_values
+  ilogb_input_type = ['float','float2','float4','float8','float16']
+  ilogb_output_type = ['int','int2','int4','int8','int16']
+  ilogbUtests = func('ilogb','ilogb',[ilogb_input_type],ilogb_output_type,[ilogb_input_values],'0 * INT_ULP')
+
+  ##### gentype lgamma(gentype x)
+  lgamma_input_values = base_input_values
+  lgamma_input_type = ['float','float2','float4','float8','float16']
+  lgamma_output_type = ['float','float2','float4','float8','float16']
+  lgammaUtests = func('lgamma','lgamma',[lgamma_input_type],lgamma_output_type,[lgamma_input_values],'4 * FLT_ULP')
+
+  ##### gentype log(gentype)
+  log_input_values = base_input_values
+  log_input_type = ['float','float2','float4','float8','float16']
+  log_output_type = ['float','float2','float4','float8','float16']
+  logUtests = func('log','log',[log_input_type],log_output_type,[log_input_values],'4 * FLT_ULP')
+  
+  ##### gentype log2(gentype)
+  log2_input_values = base_input_values
+  log2_input_type = ['float','float2','float4','float8','float16']
+  log2_output_type = ['float','float2','float4','float8','float16']
+  log2Utests = func('log2','log2',[log2_input_type],log2_output_type,[log2_input_values],'4 * FLT_ULP')
+  
+  ##### gentype log10(gentype)
+  log10_input_values = base_input_values
+  log10_input_type = ['float','float2','float4','float8','float16']
+  log10_output_type = ['float','float2','float4','float8','float16']
+  log10Utests = func('log10','log10',[log10_input_type],log10_output_type,[log10_input_values],'4 * FLT_ULP')
+  
+  ##### gentype log1p(gentype x)
+  log1p_input_values = base_input_values
+  log1p_input_type = ['float','float2','float4','float8','float16']
+  log1p_output_type = ['float','float2','float4','float8','float16']
+  log1pUtests = func('log1p','log1p',[log1p_input_type],log1p_output_type,[log1p_input_values],'4 * FLT_ULP')
+  
+  ##### gentype logb(gentype x)
+  logb_input_values = base_input_values
+  logb_input_type = ['float','float2','float4','float8','float16']
+  logb_output_type = ['float','float2','float4','float8','float16']
+  logbUtests = func('logb','logb',[logb_input_type],logb_output_type,[logb_input_values],'0 * FLT_ULP')
+  
+  ##### gentype maxmag(gentype x, gentype y)
+  maxmag_base_values = base_input_values
+  maxmag_input_values1 = []
+  maxmag_input_values2 = []
+  maxmag_input_values1,maxmag_input_values2=gene2ValuesLoop(maxmag_input_values1,maxmag_input_values2,maxmag_base_values)
+  maxmag_input_type1 = ['float','float2','float4','float8','float16']
+  maxmag_input_type2 = ['float','float2','float4','float8','float16']
+  maxmag_output_type = ['float','float2','float4','float8','float16']
+  maxmag_cpu_func='''
+static float maxmag(float x, float y){
+  if(fabs(x) > fabs(y))
+    return x;
+  else if (fabs(x) < fabs(y))
+    return y;
+  else
+    return fmax(x,y);
+} '''
+  maxmagUtests = func('maxmag','maxmag',[maxmag_input_type1,maxmag_input_type2],maxmag_output_type,[maxmag_input_values1,maxmag_input_values2],'0 * FLT_ULP',maxmag_cpu_func)
+  
+  ##### gentype minmag(gentype x, gentype y)
+  minmag_base_values = base_input_values
+  minmag_input_values1 = []
+  minmag_input_values2 = []
+  minmag_input_values1,minmag_input_values2=gene2ValuesLoop(minmag_input_values1,minmag_input_values2,minmag_base_values)
+  minmag_input_type1 = ['float','float2','float4','float8','float16']
+  minmag_input_type2 = ['float','float2','float4','float8','float16']
+  minmag_output_type = ['float','float2','float4','float8','float16']
+  minmag_cpu_func='''
+static float minmag(float x, float y){
+  if(fabs(x) < fabs(y))
+    return x;
+  else if (fabs(x) > fabs(y))
+    return y;
+  else
+    return fmin(x,y);
+} '''
+  minmagUtests = func('minmag','minmag',[minmag_input_type1,minmag_input_type2],minmag_output_type,[minmag_input_values1,minmag_input_values2],'0 * FLT_ULP',minmag_cpu_func)
+  
+#  ##### floatn nan(uintn nancode)
+#  nan_input_values = base_input_values
+#  nan_input_type = ['uint','uint2','uint4','uint8','uint16']
+#  nan_output_type = ['float','float2','float4','float8','float16']
+#  nanUtests = func('nan','nan',[nan_input_type],nan_output_type,[nan_input_values],'0 * FLT_ULP')
+  
+  ##### gentype nextafter(gentype x, gentype y)
+  nextafter_base_values = base_input_values
+  nextafter_input_values1 = []
+  nextafter_input_values2 = []
+  nextafter_input_values1,nextafter_input_values2=gene2ValuesLoop(nextafter_input_values1,nextafter_input_values2,nextafter_base_values)
+  nextafter_input_type1 = ['float','float2','float4','float8','float16']
+  nextafter_input_type2 = ['float','float2','float4','float8','float16']
+  nextafter_output_type = ['float','float2','float4','float8','float16']
+  nextafterUtests = func('nextafter','nextafter',[nextafter_input_type1,nextafter_input_type2],nextafter_output_type,[nextafter_input_values1,nextafter_input_values2],'0 * FLT_ULP')
+  
+  ##### gentype pow(gentype x, gentype y)
+  pow_base_values = base_input_values
+  pow_input_values1 = []
+  pow_input_values2 = []
+  pow_input_values1,pow_input_values2=gene2ValuesLoop(pow_input_values1,pow_input_values2,pow_base_values)
+  pow_input_type1 = ['float','float2','float4','float8','float16']
+  pow_input_type2 = ['float','float2','float4','float8','float16']
+  pow_output_type = ['float','float2','float4','float8','float16']
+  powUtests = func('pow','pow',[pow_input_type1,pow_input_type2],pow_output_type,[pow_input_values1,pow_input_values2],'16 * FLT_ULP')
+  
+  ##### floatn pown(floatn x, intn y)
+  pown_input_values1 = [FLT_MAX_POSI,FLT_MIN_NEGA,FLT_MIN_POSI,FLT_MAX_NEGA,80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0,6,-6,1500.24,-1500.24]
+  pown_input_values2 = [-1,-2,-3,4,5,6,7,8,9,10,11,12,13,14,15,16,12]
+  pown_input_type1 = ['float','float2','float4','float8','float16']
+  pown_input_type2 = ['int','int2','int4','int8','int16']
+  pown_output_type = ['float','float2','float4','float8','float16']
+  pown_cpu_func='''
+static float pown(float x, int y){
+    return pow(x,y);
+} '''
+  pownUtests = func('pown','pown',[pown_input_type1,pown_input_type2],pown_output_type,[pown_input_values1,pown_input_values2],'16 * FLT_ULP', pown_cpu_func)
+  
+  ##### gentype powr(gentype x, gentype y)
+  powr_input_values1 = [FLT_MAX_POSI,FLT_MIN_NEGA,FLT_MIN_POSI,FLT_MAX_NEGA,80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0,6,-6,1500.24,-1500.24]
+  powr_input_values2 = [1,2,3.14,4,5,6,7,8,9.889,10,11,12,13,14.33,15,0,12]
+  powr_input_type1 = ['float','float2','float4','float8','float16']
+  powr_input_type2 = ['float','float2','float4','float8','float16']
+  powr_output_type = ['float','float2','float4','float8','float16']
+  powr_cpu_func='''
+static float powr(float x, int y){
+    return pow(x,y);
+} '''
+  powrUtests = func('powr','powr',[powr_input_type1,powr_input_type2],powr_output_type,[powr_input_values1,powr_input_values2],'16 * FLT_ULP', powr_cpu_func)
+  
+  ##### gentype remainder(gentype x, gentype y)
+  remainder_base_values = base_input_values
+  remainder_input_values1 = []
+  remainder_input_values2 = []
+  remainder_input_values1,remainder_input_values2=gene2ValuesLoop(remainder_input_values1,remainder_input_values2,remainder_base_values)
+  remainder_input_type1 = ['float','float2','float4','float8','float16']
+  remainder_input_type2 = ['float','float2','float4','float8','float16']
+  remainder_output_type = ['float','float2','float4','float8','float16']
+  remainderUtests = func('remainder','remainder',[remainder_input_type1,remainder_input_type2],remainder_output_type,[remainder_input_values1,remainder_input_values2],'0 * FLT_ULP')
+  
+  ##### gentype rint(gentype x)
+  rint_input_values = base_input_values
+  rint_input_type = ['float','float2','float4','float8','float16']
+  rint_output_type = ['float','float2','float4','float8','float16']
+  rintUtests = func('rint','rint',[rint_input_type],rint_output_type,[rint_input_values],'0 * FLT_ULP')
+  
+  ##### floatn rootn(floatn x, intn y)
+  rootn_input_values1 = [FLT_MAX_POSI,FLT_MIN_NEGA,FLT_MIN_POSI,FLT_MAX_NEGA,80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0,6,-6,1500.24,-1500.24,2,3,4]
+  rootn_input_values2 = [-1,-2,-3,2,3,6,7,8,9,2,11,12,13,14,15,16,2,2,2,2]
+  rootn_input_type1 = ['float','float2','float4','float8','float16']
+  rootn_input_type2 = ['int','int2','int4','int8','int16']
+  rootn_output_type = ['float','float2','float4','float8','float16']
+  rootn_cpu_func='''
+static float rootn(float x, int y){
+    return pow(x,1.0/y);
+} '''
+  rootnUtests = func('rootn','rootn',[rootn_input_type1,rootn_input_type2],rootn_output_type,[rootn_input_values1,rootn_input_values2],'4 * FLT_ULP',rootn_cpu_func)
+  
+  ##### gentype round(gentype x)
+  round_input_values = base_input_values
+  round_input_type = ['float','float2','float4','float8','float16']
+  round_output_type = ['float','float2','float4','float8','float16']
+  roundUtests = func('round','round',[round_input_type],round_output_type,[round_input_values],'0 * FLT_ULP')
+  
+  ##### gentype rsqrt(gentype)
+  rsqrt_input_values = base_input_values
+  rsqrt_input_type = ['float','float2','float4','float8','float16']
+  rsqrt_output_type = ['float','float2','float4','float8','float16']
+  rsqrt_cpu_func='''
+static float rsqrt(float x)
+{ return 1/sqrt(x);} '''
+  rsqrtUtests = func('rsqrt','rsqrt',[rsqrt_input_type],rsqrt_output_type,[rsqrt_input_values],'4 * FLT_ULP', rsqrt_cpu_func)
+
+ 
+  ##### gentype sin(gentype)
+  sin_input_values = base_input_values
+  sin_input_type = ['float','float2','float4','float8','float16']
+  sin_output_type = ['float','float2','float4','float8','float16']
+  sinUtests = func('sin','sin',[sin_input_type],sin_output_type,[sin_input_values],'4 * FLT_ULP')
+  
+#  ##### gentype sincos(gentype)
+#  sincos_input_values1 = [FLT_MAX_POSI,FLT_MIN_NEGA,FLT_MIN_POSI,FLT_MAX_NEGA,80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0,6,-6,1500.24,-1500.24]
+#  sincos_input_values2 = []
+#  sincos_input_type1 = ['float','float2','float4','float8','float16']
+#  sincos_input_type2 = ['float','float2','float4','float8','float16']
+#  sincos_output_type = ['float','float2','float4','float8','float16']
+#  ###### gentype sincos(gentype)
+#  #  sincosUtests = func('sincos','sincos',[sincos_input_type1,sincos_input_type2],sincos_output_type,[sincos_input_values1,sincos_input_values2],'4 * FLT_ULP')
+  
+  ##### gentype sinh(gentype)
+  sinh_input_values = base_input_values
+  sinh_input_type = ['float','float2','float4','float8','float16']
+  sinh_output_type = ['float','float2','float4','float8','float16']
+  sinhUtests = func('sinh','sinh',[sinh_input_type],sinh_output_type,[sinh_input_values],'4 * FLT_ULP')
+  
+  ##### gentype sinpi(gentype x)
+  sinpi_input_values = base_input_values
+  sinpi_input_type = ['float','float2','float4','float8','float16']
+  sinpi_output_type = ['float','float2','float4','float8','float16']
+  sinpi_cpu_func='''
+static float sinpi(float x){
+  return sin(M_PI*x);
+} '''
+  sinpiUtests = func('sinpi','sinpi',[sinpi_input_type],sinpi_output_type,[sinpi_input_values],'4 * FLT_ULP',sinpi_cpu_func)
+  
+  ##### gentype sqrt(gentype)
+  sqrt_input_values = base_input_values
+  sqrt_input_type = ['float','float2','float4','float8','float16']
+  sqrt_output_type = ['float','float2','float4','float8','float16']
+  sqrtUtests = func('sqrt','sqrt',[sqrt_input_type],sqrt_output_type,[sqrt_input_values],'4 * FLT_ULP')
+  
+  ##### gentype tan(gentype)
+  tan_input_values = base_input_values
+  tan_input_type = ['float','float2','float4','float8','float16']
+  tan_output_type = ['float','float2','float4','float8','float16']
+  tanUtests = func('tan','tan',[tan_input_type],tan_output_type,[tan_input_values],'5 * FLT_ULP')
+  
+  ##### gentype tanh(gentype)
+  tanh_input_values = base_input_values
+  tanh_input_type = ['float','float2','float4','float8','float16']
+  tanh_output_type = ['float','float2','float4','float8','float16']
+  tanhUtests = func('tanh','tanh',[tanh_input_type],tanh_output_type,[tanh_input_values],'5 * FLT_ULP')
+  
+  ##### gentype tanpi(gentype x)
+  tanpi_input_values = base_input_values
+  tanpi_input_type = ['float','float2','float4','float8','float16']
+  tanpi_output_type = ['float','float2','float4','float8','float16']
+  tanpi_cpu_func='''
+static float tanpi(float x){
+  return tan(M_PI*x);
+} '''
+  tanpiUtests = func('tanpi','tanpi',[tanpi_input_type],tanpi_output_type,[tanpi_input_values],'4 * FLT_ULP',tanpi_cpu_func)
+  
+  ##### gentype tgamma(gentype)
+  tgamma_input_values = base_input_values
+  tgamma_input_type = ['float','float2','float4','float8','float16']
+  tgamma_output_type = ['float','float2','float4','float8','float16']
+  tgammaUtests = func('tgamma','tgamma',[tgamma_input_type],tgamma_output_type,[tgamma_input_values],'16 * FLT_ULP')
+  
+  ##### gentype trunc(gentype)
+  trunc_input_values = base_input_values
+  trunc_input_type = ['float','float2','float4','float8','float16']
+  trunc_output_type = ['float','float2','float4','float8','float16']
+  truncUtests = func('trunc','trunc',[trunc_input_type],trunc_output_type,[trunc_input_values],'0 * FLT_ULP')
+
+if __name__ == "__main__":
+  main()
diff --git a/utests/utest_run.cpp b/utests/utest_run.cpp
index 94fbbee..cd4356a 100644
--- a/utests/utest_run.cpp
+++ b/utests/utest_run.cpp
@@ -69,7 +69,7 @@ int main(int argc, char *argv[])
     optarg = argv[1];
   }
 
-  {
+  do {
     switch (c)
     {
       case 'c':
@@ -111,7 +111,7 @@ int main(int argc, char *argv[])
         usage();
         exit(1);
     }
-  } while ((c = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1)
+  } while ((c = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1);
 
   cl_ocl_destroy();
 }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git



More information about the Pkg-opencl-devel mailing list