[Pkg-opencl-devel] [beignet] 01/01: imported dfsg-cleaned upstream version 0.9.3~dfsg

Andreas Beckmann anbe at moszumanska.debian.org
Fri Oct 31 21:38:21 UTC 2014


This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to tag upstream/0.9.3_dfsg
in repository beignet.

commit 02e7a3971ff6e85c15e2075e4b296858a2c18cde
Author: Andreas Beckmann <anbe at debian.org>
Date:   Fri Oct 31 20:34:28 2014 +0100

    imported dfsg-cleaned upstream version 0.9.3~dfsg
    
    some files were removed due to DFSG violations; for details see
    https://bugs.debian.org/767387
    http://lists.freedesktop.org/archives/beignet/2014-October/004343.html
    
    undistributable - derived from the Len(n)a standard test image
    (https://bugs.debian.org/758442)
     kernels/compiler_box_blur_float_ref.bmp
     kernels/compiler_box_blur_ref.bmp
     kernels/lenna128x128.bmp
    
    unclear license - derived from Shadertoy shaders (as stated in
    utests/compiler_shader_toy.cpp); these particular shaders are no longer
    on Shadertoy, but the default license there is CC-BY-NC-SA
    (https://www.shadertoy.com/terms)
     kernels/compiler_chocolux.cl
     kernels/compiler_chocolux_ref.bmp
     kernels/compiler_clod.cl
     kernels/compiler_clod_function_call.cl
     kernels/compiler_clod_ref.bmp
     kernels/compiler_julia.cl
     kernels/compiler_julia_function_call.cl
     kernels/compiler_julia_no_break.cl
     kernels/compiler_julia_no_break_ref.bmp
     kernels/compiler_julia_ref.bmp
     kernels/compiler_menger_sponge.cl
     kernels/compiler_menger_sponge_no_shadow.cl
     kernels/compiler_menger_sponge_no_shadow_ref.bmp
     kernels/compiler_menger_sponge_ref.bmp
     kernels/compiler_nautilus.cl
     kernels/compiler_nautilus_ref.bmp
     kernels/compiler_ribbon.cl
     kernels/compiler_ribbon_ref.bmp
---
 .gitignore                                         |     5 +
 CMake/CMakeConfigTemplate.hpp                      |    28 +
 CMake/FindLLVM.cmake                               |   107 +
 CMake/FindMesaSrc.cmake                            |    26 +
 CMake/FindOCLIcd.cmake                             |    24 +
 CMakeLists.txt                                     |   171 +
 COPYING                                            |   502 +
 NEWS.mdwn                                          |     1 +
 README.md                                          |     1 +
 backend/CMakeLists.txt                             |   108 +
 backend/kernels/compile.sh                         |     6 +
 backend/src/.gitignore                             |     7 +
 backend/src/CMakeLists.txt                         |   236 +
 backend/src/GBEConfig.h.in                         |     7 +
 backend/src/backend/context.cpp                    |   585 +
 backend/src/backend/context.hpp                    |   149 +
 backend/src/backend/gen/gen_mesa_disasm.c          |  1302 ++
 backend/src/backend/gen/gen_mesa_disasm.h          |    45 +
 backend/src/backend/gen75_context.cpp              |   112 +
 backend/src/backend/gen75_context.hpp              |    62 +
 backend/src/backend/gen75_encoder.cpp              |   269 +
 backend/src/backend/gen75_encoder.hpp              |    60 +
 backend/src/backend/gen_context.cpp                |  1911 ++
 backend/src/backend/gen_context.hpp                |   224 +
 backend/src/backend/gen_defs.hpp                   |   974 ++
 backend/src/backend/gen_encoder.cpp                |  1311 ++
 backend/src/backend/gen_encoder.hpp                |   241 +
 backend/src/backend/gen_insn_compact.cpp           |   523 +
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |    42 +
 backend/src/backend/gen_insn_scheduling.cpp        |   722 +
 backend/src/backend/gen_insn_scheduling.hpp        |    42 +
 backend/src/backend/gen_insn_selection.cpp         |  4032 +++++
 backend/src/backend/gen_insn_selection.hpp         |   290 +
 backend/src/backend/gen_insn_selection.hxx         |    86 +
 backend/src/backend/gen_program.cpp                |   444 +
 backend/src/backend/gen_program.h                  |    38 +
 backend/src/backend/gen_program.hpp                |    86 +
 backend/src/backend/gen_reg_allocation.cpp         |  1218 ++
 backend/src/backend/gen_reg_allocation.hpp         |    73 +
 backend/src/backend/gen_register.hpp               |  1060 ++
 backend/src/backend/program.cpp                    |  1317 ++
 backend/src/backend/program.h                      |   358 +
 backend/src/backend/program.hpp                    |   320 +
 backend/src/builtin_vector_proto.def               |   295 +
 backend/src/gbe_bin_generater.cpp                  |   437 +
 backend/src/gbe_bin_interpreter.cpp                |    80 +
 backend/src/gen_as.sh                              |   101 +
 backend/src/gen_builtin_vector.py                  |   384 +
 backend/src/gen_convert.sh                         |   553 +
 backend/src/genconfig.sh                           |    11 +
 backend/src/ir/constant.cpp                        |   141 +
 backend/src/ir/constant.hpp                        |   134 +
 backend/src/ir/context.cpp                         |   182 +
 backend/src/ir/context.hpp                         |   252 +
 backend/src/ir/function.cpp                        |   359 +
 backend/src/ir/function.hpp                        |   400 +
 backend/src/ir/image.cpp                           |   278 +
 backend/src/ir/image.hpp                           |   102 +
 backend/src/ir/immediate.cpp                       |   263 +
 backend/src/ir/immediate.hpp                       |   264 +
 backend/src/ir/instruction.cpp                     |  1684 ++
 backend/src/ir/instruction.hpp                     |   687 +
 backend/src/ir/instruction.hxx                     |    95 +
 backend/src/ir/liveness.cpp                        |   240 +
 backend/src/ir/liveness.hpp                        |   148 +
 backend/src/ir/lowering.cpp                        |   396 +
 backend/src/ir/lowering.hpp                        |    94 +
 backend/src/ir/printf.cpp                          |   222 +
 backend/src/ir/printf.hpp                          |   244 +
 backend/src/ir/profile.cpp                         |   106 +
 backend/src/ir/profile.hpp                         |    86 +
 backend/src/ir/register.cpp                        |    67 +
 backend/src/ir/register.hpp                        |   170 +
 backend/src/ir/sampler.cpp                         |   139 +
 backend/src/ir/sampler.hpp                         |    94 +
 backend/src/ir/type.cpp                            |    51 +
 backend/src/ir/type.hpp                            |    97 +
 backend/src/ir/unit.cpp                            |    61 +
 backend/src/ir/unit.hpp                            |    92 +
 backend/src/ir/value.cpp                           |   607 +
 backend/src/ir/value.hpp                           |   266 +
 backend/src/llvm/llvm_barrier_nodup.cpp            |   115 +
 backend/src/llvm/llvm_gen_backend.cpp              |  3628 ++++
 backend/src/llvm/llvm_gen_backend.hpp              |   105 +
 backend/src/llvm/llvm_gen_ocl_function.hxx         |   196 +
 backend/src/llvm/llvm_intrinsic_lowering.cpp       |   170 +
 backend/src/llvm/llvm_loadstore_optimization.cpp   |   272 +
 backend/src/llvm/llvm_passes.cpp                   |   399 +
 backend/src/llvm/llvm_printf_parser.cpp            |   851 +
 backend/src/llvm/llvm_scalarize.cpp                |   878 +
 backend/src/llvm/llvm_to_gen.cpp                   |   252 +
 backend/src/llvm/llvm_to_gen.hpp                   |    40 +
 backend/src/ocl_as.h                               |  3086 ++++
 backend/src/ocl_barrier.ll                         |    39 +
 backend/src/ocl_common_defines.h                   |   126 +
 backend/src/ocl_convert.h                          | 17415 +++++++++++++++++++
 backend/src/ocl_memcpy.ll                          |   336 +
 backend/src/ocl_memset.ll                          |   127 +
 backend/src/ocl_stdlib.tmpl.h                      |  5160 ++++++
 backend/src/sys/alloc.cpp                          |   359 +
 backend/src/sys/alloc.hpp                          |   342 +
 backend/src/sys/assert.cpp                         |    81 +
 backend/src/sys/assert.hpp                         |    35 +
 backend/src/sys/atomic.hpp                         |    56 +
 backend/src/sys/cvar.cpp                           |    65 +
 backend/src/sys/cvar.hpp                           |    80 +
 backend/src/sys/exception.hpp                      |    56 +
 backend/src/sys/fixed_array.hpp                    |    84 +
 backend/src/sys/hash_map.hpp                       |    82 +
 backend/src/sys/intrinsics.hpp                     |   209 +
 backend/src/sys/intrusive_list.cpp                 |    66 +
 backend/src/sys/intrusive_list.hpp                 |   176 +
 backend/src/sys/list.hpp                           |    65 +
 backend/src/sys/map.hpp                            |    75 +
 backend/src/sys/mutex.cpp                          |    48 +
 backend/src/sys/mutex.hpp                          |    74 +
 backend/src/sys/platform.cpp                       |    79 +
 backend/src/sys/platform.hpp                       |   441 +
 backend/src/sys/set.hpp                            |    70 +
 backend/src/sys/vector.hpp                         |    79 +
 backend/src/update.sh                              |     3 +
 backend/src/update_as.sh                           |    11 +
 backend/src/update_blob_ocl_header.py              |    65 +
 backend/src/update_convert.sh                      |    12 +
 benchmark/CMakeLists.txt                           |    21 +
 benchmark/benchmark_run.cpp                        |   117 +
 benchmark/enqueue_copy_buf.cpp                     |    69 +
 docs/Beignet.mdwn                                  |   230 +
 docs/Beignet/Backend.mdwn                          |    96 +
 docs/Beignet/Backend/TODO.mdwn                     |   110 +
 docs/Beignet/Backend/compiler_backend.mdwn         |   118 +
 docs/Beignet/Backend/gen_ir.mdwn                   |   254 +
 docs/Beignet/Backend/mixed_buffer_pointer.mdwn     |    46 +
 docs/Beignet/Backend/unstructured_branches.mdwn    |   271 +
 docs/NEWS.mdwn                                     |    16 +
 docs/howto/cross-compiler-howto.mdwn               |    60 +
 docs/optimization-guide.mdwn                       |    28 +
 include/CL/cl.h                                    |  1214 ++
 include/CL/cl.hpp                                  | 12452 +++++++++++++
 include/CL/cl_d3d10.h                              |   126 +
 include/CL/cl_d3d11.h                              |   126 +
 include/CL/cl_dx9_media_sharing.h                  |   127 +
 include/CL/cl_egl.h                                |   133 +
 include/CL/cl_ext.h                                |   316 +
 include/CL/cl_gl.h                                 |   162 +
 include/CL/cl_gl_ext.h                             |    69 +
 include/CL/cl_intel.h                              |   141 +
 include/CL/cl_platform.h                           |  1278 ++
 include/CL/opencl.h                                |    54 +
 include/CMakeLists.txt                             |     5 +
 intel-beignet.icd.in                               |     1 +
 kernels/buildin_work_dim.cl                        |     3 +
 kernels/builtin_acos_asin.cl                       |    10 +
 kernels/builtin_atan2.cl                           |     4 +
 kernels/builtin_bitselect.cl                       |     4 +
 kernels/builtin_convert_sat.cl                     |    48 +
 kernels/builtin_exp.cl                             |    10 +
 kernels/builtin_frexp.cl                           |     4 +
 kernels/builtin_global_id.cl                       |     4 +
 kernels/builtin_global_size.cl                     |     3 +
 kernels/builtin_lgamma.cl                          |     4 +
 kernels/builtin_lgamma_r.cl                        |     4 +
 kernels/builtin_local_id.cl                        |     6 +
 kernels/builtin_local_size.cl                      |     3 +
 kernels/builtin_mad_sat.cl                         |     4 +
 kernels/builtin_modf.cl                            |     6 +
 kernels/builtin_nextafter.cl                       |     4 +
 kernels/builtin_num_groups.cl                      |     3 +
 kernels/builtin_pow.cl                             |     7 +
 kernels/builtin_remquo.cl                          |     6 +
 kernels/builtin_shuffle.cl                         |     8 +
 kernels/builtin_shuffle2.cl                        |    13 +
 kernels/builtin_sign.cl                            |     4 +
 kernels/builtin_sinpi.cl                           |     4 +
 kernels/builtin_tgamma.cl                          |     4 +
 kernels/compare_image_2d_and_1d_array.cl           |    13 +
 kernels/compiler_abs.cl                            |    28 +
 kernels/compiler_abs_diff.cl                       |    30 +
 kernels/compiler_address_space.cl                  |     9 +
 kernels/compiler_argument_structure.cl             |     9 +
 kernels/compiler_argument_structure_indirect.cl    |     9 +
 kernels/compiler_arith_shift_right.cl              |     4 +
 kernels/compiler_array.cl                          |    14 +
 kernels/compiler_array0.cl                         |    16 +
 kernels/compiler_array1.cl                         |    15 +
 kernels/compiler_array2.cl                         |    13 +
 kernels/compiler_array3.cl                         |    14 +
 kernels/compiler_async_copy.cl                     |    24 +
 kernels/compiler_async_copy_and_prefetch.cl        |     9 +
 kernels/compiler_async_stride_copy.cl              |    16 +
 kernels/compiler_atomic_functions.cl               |    50 +
 kernels/compiler_basic_arithmetic.cl               |    53 +
 kernels/compiler_bool_cross_basic_block.cl         |    21 +
 kernels/compiler_box_blur.cl                       |    80 +
 kernels/compiler_box_blur_float.cl                 |    48 +
 kernels/compiler_box_blur_image.cl                 |    18 +
 kernels/compiler_byte_scatter.cl                   |     7 +
 kernels/compiler_ceil.cl                           |     4 +
 kernels/compiler_clz_int.cl                        |     5 +
 kernels/compiler_clz_short.cl                      |     5 +
 kernels/compiler_constant_expr.cl                  |    23 +
 kernels/compiler_convert_uchar_sat.cl              |     4 +
 kernels/compiler_data_types.cl                     |    80 +
 kernels/compiler_degrees.cl                        |     4 +
 kernels/compiler_displacement_map_element.cl       |    11 +
 kernels/compiler_double.cl                         |     9 +
 kernels/compiler_double_2.cl                       |     9 +
 kernels/compiler_double_3.cl                       |     6 +
 kernels/compiler_double_4.cl                       |     5 +
 kernels/compiler_event.cl                          |     6 +
 kernels/compiler_fabs.cl                           |     5 +
 kernels/compiler_function_argument.cl              |     7 +
 kernels/compiler_function_argument0.cl             |     7 +
 kernels/compiler_function_argument1.cl             |     7 +
 kernels/compiler_function_argument2.cl             |    12 +
 kernels/compiler_function_argument3.cl             |    71 +
 kernels/compiler_function_constant.cl              |     6 +
 kernels/compiler_function_constant0.cl             |     6 +
 kernels/compiler_function_qualifiers.cl            |     9 +
 kernels/compiler_gather_register_file.cl           |    10 +
 kernels/compiler_gather_register_file0.cl          |    10 +
 kernels/compiler_gather_register_file1.cl          |    11 +
 kernels/compiler_geometric_builtin.cl              |    11 +
 kernels/compiler_getelementptr_bitcast.cl          |    18 +
 kernels/compiler_global_constant.cl                |    76 +
 kernels/compiler_global_constant_2.cl              |    20 +
 kernels/compiler_global_memory_barrier.cl          |     7 +
 kernels/compiler_group_size.cl                     |    29 +
 kernels/compiler_hadd.cl                           |     4 +
 kernels/compiler_if_else.cl                        |    14 +
 kernels/compiler_insert_to_constant.cl             |     6 +
 kernels/compiler_insert_vector.cl                  |    11 +
 kernels/compiler_insn_selection_masked_min_max.cl  |    11 +
 kernels/compiler_insn_selection_max.cl             |     7 +
 kernels/compiler_insn_selection_min.cl             |     7 +
 kernels/compiler_integer_builtin.cl                |    23 +
 kernels/compiler_integer_division.cl               |     6 +
 kernels/compiler_integer_remainder.cl              |     6 +
 kernels/compiler_load_bool_imm.cl                  |    12 +
 kernels/compiler_local_memory_barrier.cl           |     6 +
 kernels/compiler_local_memory_barrier_2.cl         |     7 +
 kernels/compiler_local_memory_barrier_wg64.cl      |     6 +
 kernels/compiler_local_memory_two_ptr.cl           |    10 +
 kernels/compiler_local_slm.cl                      |    24 +
 kernels/compiler_long.cl                           |     8 +
 kernels/compiler_long_2.cl                         |    20 +
 kernels/compiler_long_asr.cl                       |     7 +
 kernels/compiler_long_cmp.cl                       |    29 +
 kernels/compiler_long_convert.cl                   |    19 +
 kernels/compiler_long_mult.cl                      |     7 +
 kernels/compiler_long_shl.cl                       |     7 +
 kernels/compiler_long_shr.cl                       |     7 +
 kernels/compiler_lower_return0.cl                  |     8 +
 kernels/compiler_lower_return1.cl                  |     8 +
 kernels/compiler_lower_return2.cl                  |    11 +
 kernels/compiler_mad24.cl                          |     4 +
 kernels/compiler_mad_hi.cl                         |     4 +
 kernels/compiler_mandelbrot.cl                     |    47 +
 kernels/compiler_mandelbrot_alternate.cl           |    38 +
 kernels/compiler_mandelbrot_alternate_ref.bmp      |   Bin 0 -> 196662 bytes
 kernels/compiler_mandelbrot_ref.bmp                |   Bin 0 -> 196662 bytes
 kernels/compiler_math.cl                           |    40 +
 kernels/compiler_math_2op.cl                       |    19 +
 kernels/compiler_math_3op.cl                       |     9 +
 kernels/compiler_math_builtin.cl                   |    82 +
 kernels/compiler_math_constants.cl                 |    23 +
 kernels/compiler_mem_fence.cl                      |    10 +
 kernels/compiler_mixed_pointer.cl                  |    23 +
 kernels/compiler_mul24.cl                          |     4 +
 kernels/compiler_mul_hi.cl                         |     4 +
 kernels/compiler_multiple_kernels.cl               |     7 +
 kernels/compiler_obread.cl                         |     8 +
 kernels/compiler_obwrite.cl                        |     8 +
 kernels/compiler_preprocessor_macros.cl            |    13 +
 kernels/compiler_private_data_overflow.cl          |    10 +
 kernels/compiler_radians.cl                        |     4 +
 kernels/compiler_region.cl                         |    10 +
 kernels/compiler_region0.cl                        |    11 +
 kernels/compiler_region1.cl                        |     9 +
 kernels/compiler_relational_builtin.cl             |    24 +
 kernels/compiler_rhadd.cl                          |     4 +
 kernels/compiler_rotate.cl                         |     5 +
 kernels/compiler_sampler.cl                        |    25 +
 kernels/compiler_saturate.cl                       |    16 +
 kernels/compiler_saturate_sub.cl                   |    16 +
 kernels/compiler_shift_right.cl                    |     4 +
 kernels/compiler_short_scatter.cl                  |     7 +
 kernels/compiler_simd_all.cl                       |    12 +
 kernels/compiler_simd_any.cl                       |    15 +
 kernels/compiler_smoothstep.cl                     |     4 +
 kernels/compiler_step.cl                           |    38 +
 kernels/compiler_structure_attributes.cl           |    17 +
 kernels/compiler_switch.cl                         |    14 +
 kernels/compiler_type_casting.cl                   |    19 +
 kernels/compiler_uint16_copy.cl                    |     8 +
 kernels/compiler_uint2_copy.cl                     |     7 +
 kernels/compiler_uint3_copy.cl                     |     7 +
 kernels/compiler_uint3_unaligned_copy.cl           |     8 +
 kernels/compiler_uint8_copy.cl                     |     7 +
 kernels/compiler_unstructured_branch0.cl           |    14 +
 kernels/compiler_unstructured_branch1.cl           |    14 +
 kernels/compiler_unstructured_branch2.cl           |    18 +
 kernels/compiler_unstructured_branch3.cl           |    16 +
 kernels/compiler_upsample_int.cl                   |     4 +
 kernels/compiler_upsample_long.cl                  |     4 +
 kernels/compiler_vect_compare.cl                   |     7 +
 kernels/compiler_vector_inc.cl                     |    13 +
 kernels/compiler_vector_load_store.cl              |    40 +
 kernels/compiler_volatile.cl                       |     4 +
 kernels/compiler_vote_all.cl                       |    10 +
 kernels/compiler_vote_any.cl                       |    10 +
 kernels/compiler_workitem_builtin.cl               |    12 +
 kernels/compiler_write_only_bytes.cl               |     6 +
 kernels/compiler_write_only_shorts.cl              |     6 +
 kernels/double_precision_check.cl                  |    11 +
 kernels/empty.cl                                   |     1 +
 kernels/image_1D_buffer.cl                         |    13 +
 kernels/include/runtime_compile_link_inc.h         |     4 +
 kernels/my_test.cl                                 |    26 +
 kernels/null_kernel_arg.cl                         |     9 +
 kernels/runtime_compile_link.h                     |     1 +
 kernels/runtime_compile_link_a.cl                  |    13 +
 kernels/runtime_compile_link_b.cl                  |     9 +
 kernels/test_cl_finish.cl                          |    12 +
 kernels/test_copy_buffer.cl                        |     6 +
 kernels/test_copy_buffer_row.cl                    |     8 +
 kernels/test_copy_image.cl                         |    10 +
 kernels/test_copy_image1.cl                        |    33 +
 kernels/test_copy_image_1d.cl                      |     9 +
 kernels/test_copy_image_3d.cl                      |    28 +
 kernels/test_fill_gl_image.cl                      |    11 +
 kernels/test_fill_image.cl                         |    13 +
 kernels/test_fill_image0.cl                        |     9 +
 kernels/test_fill_image_1d.cl                      |     8 +
 kernels/test_fill_image_3d.cl                      |    14 +
 kernels/test_fill_image_3d_2.cl                    |    10 +
 kernels/test_get_arg_info.cl                       |     8 +
 kernels/test_get_image_info.cl                     |    13 +
 kernels/test_get_image_info_array.cl               |    25 +
 kernels/test_movforphi_undef.cl                    |    18 +
 kernels/test_printf.cl                             |    38 +
 kernels/test_write_only.cl                         |     6 +
 setup_fulsim_hsw.sh                                |     5 +
 setup_fulsim_ivb.sh                                |     5 +
 setup_perfsim_ivb.sh                               |     4 +
 src/.gitignore                                     |     2 +
 src/CMakeLists.txt                                 |   126 +
 src/OCLConfig.h.in                                 |     6 +
 src/cl_alloc.c                                     |    88 +
 src/cl_alloc.h                                     |    47 +
 src/cl_api.c                                       |  3341 ++++
 src/cl_command_queue.c                             |   622 +
 src/cl_command_queue.h                             |   109 +
 src/cl_command_queue_gen7.c                        |   394 +
 src/cl_context.c                                   |   372 +
 src/cl_context.h                                   |   166 +
 src/cl_device_data.h                               |   194 +
 src/cl_device_id.c                                 |   617 +
 src/cl_device_id.h                                 |   145 +
 src/cl_driver.cpp                                  |    40 +
 src/cl_driver.h                                    |   383 +
 src/cl_driver_defs.c                               |    95 +
 src/cl_driver_type.h                               |    24 +
 src/cl_enqueue.c                                   |   472 +
 src/cl_enqueue.h                                   |    73 +
 src/cl_event.c                                     |   650 +
 src/cl_event.h                                     |   106 +
 src/cl_extensions.c                                |   107 +
 src/cl_extensions.h                                |    99 +
 src/cl_gbe_loader.cpp                              |   328 +
 src/cl_gbe_loader.h                                |    80 +
 src/cl_gen75_device.h                              |    30 +
 src/cl_gen7_device.h                               |    29 +
 src/cl_gl_api.c                                    |   153 +
 src/cl_gt_device.h                                 |   124 +
 src/cl_image.c                                     |   229 +
 src/cl_image.h                                     |    44 +
 src/cl_internals.h                                 |    36 +
 src/cl_kernel.c                                    |   431 +
 src/cl_kernel.h                                    |   116 +
 src/cl_khr_icd.c                                   |   174 +
 src/cl_khr_icd.h                                   |    34 +
 src/cl_mem.c                                       |  1903 ++
 src/cl_mem.h                                       |   290 +
 src/cl_mem_gl.c                                    |    97 +
 src/cl_mem_gl.h                                    |    17 +
 src/cl_platform_id.c                               |   112 +
 src/cl_platform_id.h                               |    72 +
 src/cl_program.c                                   |   851 +
 src/cl_program.h                                   |   136 +
 src/cl_sampler.c                                   |   142 +
 src/cl_sampler.h                                   |    57 +
 src/cl_thread.c                                    |   265 +
 src/cl_thread.h                                    |    47 +
 src/cl_utils.h                                     |   316 +
 src/intel/intel_batchbuffer.c                      |   191 +
 src/intel/intel_batchbuffer.h                      |   152 +
 src/intel/intel_defines.h                          |   339 +
 src/intel/intel_dri_resource_sharing.c             |   208 +
 src/intel/intel_dri_resource_sharing.h             |    39 +
 src/intel/intel_dri_resource_sharing_int.h         |   143 +
 src/intel/intel_driver.c                           |   744 +
 src/intel/intel_driver.h                           |   125 +
 src/intel/intel_gpgpu.c                            |  1513 ++
 src/intel/intel_gpgpu.h                            |    34 +
 src/intel/intel_structs.h                          |   461 +
 src/kernels/cl_internal_copy_buf_align16.cl        |    12 +
 src/kernels/cl_internal_copy_buf_align4.cl         |     8 +
 src/kernels/cl_internal_copy_buf_rect.cl           |    15 +
 .../cl_internal_copy_buf_unalign_dst_offset.cl     |    28 +
 .../cl_internal_copy_buf_unalign_same_offset.cl    |    19 +
 .../cl_internal_copy_buf_unalign_src_offset.cl     |    29 +
 src/kernels/cl_internal_copy_buffer_to_image_2d.cl |    18 +
 src/kernels/cl_internal_copy_buffer_to_image_3d.cl |    19 +
 src/kernels/cl_internal_copy_image_1d_to_1d.cl     |    19 +
 src/kernels/cl_internal_copy_image_2d_to_2d.cl     |    21 +
 src/kernels/cl_internal_copy_image_2d_to_3d.cl     |    22 +
 src/kernels/cl_internal_copy_image_2d_to_buffer.cl |    19 +
 src/kernels/cl_internal_copy_image_3d_to_2d.cl     |    22 +
 src/kernels/cl_internal_copy_image_3d_to_3d.cl     |    23 +
 src/kernels/cl_internal_copy_image_3d_to_buffer.cl |    22 +
 src/kernels/cl_internal_fill_buf_align128.cl       |     9 +
 src/kernels/cl_internal_fill_buf_align2.cl         |     8 +
 src/kernels/cl_internal_fill_buf_align4.cl         |     8 +
 src/kernels/cl_internal_fill_buf_align8.cl         |    14 +
 src/kernels/cl_internal_fill_buf_unalign.cl        |     8 +
 src/kernels/cl_internal_fill_image_1d.cl           |    14 +
 src/kernels/cl_internal_fill_image_1d_array.cl     |    15 +
 src/kernels/cl_internal_fill_image_2d.cl           |    15 +
 src/kernels/cl_internal_fill_image_2d_array.cl     |    16 +
 src/kernels/cl_internal_fill_image_3d.cl           |    16 +
 src/performance.c                                  |   324 +
 src/performance.h                                  |    12 +
 src/x11/dricommon.c                                |   330 +
 src/x11/dricommon.h                                |    99 +
 src/x11/mesa_egl_extension.c                       |   307 +
 src/x11/mesa_egl_extension.h                       |    20 +
 src/x11/mesa_egl_res_share.c                       |   135 +
 src/x11/mesa_egl_res_share.h                       |    44 +
 src/x11/va_dri2.c                                  |   327 +
 src/x11/va_dri2.h                                  |    89 +
 src/x11/va_dri2str.h                               |   211 +
 src/x11/va_dri2tokens.h                            |    66 +
 utests/.gitignore                                  |    15 +
 utests/CMakeLists.txt                              |   241 +
 utests/buildin_work_dim.cpp                        |    37 +
 utests/builtin_acos_asin.cpp                       |    87 +
 utests/builtin_atan2.cpp                           |    43 +
 utests/builtin_bitselect.cpp                       |    50 +
 utests/builtin_convert_sat.cpp                     |    80 +
 utests/builtin_exp.cpp                             |   102 +
 utests/builtin_frexp.cpp                           |    50 +
 utests/builtin_global_id.cpp                       |    77 +
 utests/builtin_global_size.cpp                     |   108 +
 utests/builtin_kernel_max_global_size.cpp          |    30 +
 utests/builtin_lgamma.cpp                          |    40 +
 utests/builtin_lgamma_r.cpp                        |    46 +
 utests/builtin_local_id.cpp                        |    81 +
 utests/builtin_local_size.cpp                      |    88 +
 utests/builtin_mad_sat.cpp                         |    44 +
 utests/builtin_modf.cpp                            |    56 +
 utests/builtin_nextafter.cpp                       |    60 +
 utests/builtin_num_groups.cpp                      |    85 +
 utests/builtin_pow.cpp                             |    92 +
 utests/builtin_remquo.cpp                          |    65 +
 utests/builtin_shuffle.cpp                         |    45 +
 utests/builtin_shuffle2.cpp                        |    45 +
 utests/builtin_sign.cpp                            |    47 +
 utests/builtin_sinpi.cpp                           |   104 +
 utests/builtin_tgamma.cpp                          |    42 +
 utests/cl_create_kernel.cpp                        |    16 +
 utests/compare_image_2d_and_1d_array.cpp           |    79 +
 utests/compiler_abs.cpp                            |   254 +
 utests/compiler_abs_diff.cpp                       |   295 +
 utests/compiler_address_space.cpp                  |    10 +
 utests/compiler_argument_structure.cpp             |    28 +
 utests/compiler_argument_structure_indirect.cpp    |    29 +
 utests/compiler_arith_shift_right.cpp              |    43 +
 utests/compiler_array.cpp                          |    28 +
 utests/compiler_array0.cpp                         |    54 +
 utests/compiler_array1.cpp                         |    52 +
 utests/compiler_array2.cpp                         |    50 +
 utests/compiler_array3.cpp                         |    51 +
 utests/compiler_async_copy.cpp                     |    55 +
 utests/compiler_async_copy_and_prefetch.cpp        |    10 +
 utests/compiler_async_stride_copy.cpp              |    45 +
 utests/compiler_atomic_functions.cpp               |    97 +
 utests/compiler_basic_arithmetic.cpp               |   115 +
 utests/compiler_bool_cross_basic_block.cpp         |    55 +
 utests/compiler_box_blur.cpp                       |    43 +
 utests/compiler_box_blur_float.cpp                 |    65 +
 utests/compiler_box_blur_image.cpp                 |    52 +
 utests/compiler_byte_scatter.cpp                   |    24 +
 utests/compiler_ceil.cpp                           |    43 +
 utests/compiler_cl_finish.cpp                      |    50 +
 utests/compiler_clz_int.cpp                        |    31 +
 utests/compiler_clz_short.cpp                      |    31 +
 utests/compiler_constant_expr.cpp                  |    35 +
 utests/compiler_convert_uchar_sat.cpp              |    44 +
 utests/compiler_copy_buffer.cpp                    |    32 +
 utests/compiler_copy_buffer_row.cpp                |    40 +
 utests/compiler_copy_image.cpp                     |    58 +
 utests/compiler_copy_image1.cpp                    |    83 +
 utests/compiler_copy_image_1d.cpp                  |    52 +
 utests/compiler_copy_image_3d.cpp                  |    77 +
 utests/compiler_data_types.cpp                     |     9 +
 utests/compiler_degrees.cpp                        |    32 +
 utests/compiler_displacement_map_element.cpp       |    64 +
 utests/compiler_double.cpp                         |    46 +
 utests/compiler_double_2.cpp                       |    47 +
 utests/compiler_double_3.cpp                       |    46 +
 utests/compiler_double_4.cpp                       |    40 +
 utests/compiler_double_precision.cpp               |    43 +
 utests/compiler_fabs.cpp                           |    44 +
 utests/compiler_fill_gl_image.cpp                  |    76 +
 utests/compiler_fill_image.cpp                     |    44 +
 utests/compiler_fill_image0.cpp                    |    42 +
 utests/compiler_fill_image_1d.cpp                  |    50 +
 utests/compiler_fill_image_3d.cpp                  |    50 +
 utests/compiler_fill_image_3d_2.cpp                |    48 +
 utests/compiler_function_argument.cpp              |    27 +
 utests/compiler_function_argument0.cpp             |    26 +
 utests/compiler_function_argument1.cpp             |    31 +
 utests/compiler_function_argument2.cpp             |    57 +
 utests/compiler_function_argument3.cpp             |    45 +
 utests/compiler_function_constant.cpp              |    34 +
 utests/compiler_function_constant0.cpp             |    40 +
 utests/compiler_function_constant1.cpp             |    47 +
 utests/compiler_function_qualifiers.cpp            |    20 +
 utests/compiler_geometric_builtin.cpp              |     9 +
 utests/compiler_get_image_info.cpp                 |    50 +
 utests/compiler_get_image_info_array.cpp           |    64 +
 utests/compiler_getelementptr_bitcast.cpp          |    45 +
 utests/compiler_global_constant.cpp                |   104 +
 utests/compiler_global_constant_2.cpp              |    59 +
 utests/compiler_global_memory_barrier.cpp          |    28 +
 utests/compiler_group_size.cpp                     |   141 +
 utests/compiler_hadd.cpp                           |    40 +
 utests/compiler_if_else.cpp                        |    64 +
 utests/compiler_insert_to_constant.cpp             |    30 +
 utests/compiler_insert_vector.cpp                  |    18 +
 utests/compiler_insn_selection_masked_min_max.cpp  |    42 +
 utests/compiler_insn_selection_max.cpp             |    37 +
 utests/compiler_insn_selection_min.cpp             |    36 +
 utests/compiler_integer_builtin.cpp                |     9 +
 utests/compiler_integer_division.cpp               |    44 +
 utests/compiler_integer_remainder.cpp              |    44 +
 utests/compiler_load_bool_imm.cpp                  |    29 +
 utests/compiler_local_memory_barrier.cpp           |    46 +
 utests/compiler_local_memory_barrier_2.cpp         |    29 +
 utests/compiler_local_memory_barrier_wg64.cpp      |    46 +
 utests/compiler_local_memory_two_ptr.cpp           |    50 +
 utests/compiler_local_slm.cpp                      |    33 +
 utests/compiler_long.cpp                           |    60 +
 utests/compiler_long_2.cpp                         |    51 +
 utests/compiler_long_asr.cpp                       |    41 +
 utests/compiler_long_cmp.cpp                       |   122 +
 utests/compiler_long_convert.cpp                   |   158 +
 utests/compiler_long_mult.cpp                      |    49 +
 utests/compiler_long_shl.cpp                       |    41 +
 utests/compiler_long_shr.cpp                       |    41 +
 utests/compiler_lower_return0.cpp                  |    54 +
 utests/compiler_lower_return1.cpp                  |    47 +
 utests/compiler_lower_return2.cpp                  |    48 +
 utests/compiler_mad24.cpp                          |    41 +
 utests/compiler_mad_hi.cpp                         |    46 +
 utests/compiler_mandelbrot.cpp                     |    48 +
 utests/compiler_mandelbrot_alternate.cpp           |    54 +
 utests/compiler_math.cpp                           |    89 +
 utests/compiler_math_2op.cpp                       |    80 +
 utests/compiler_math_3op.cpp                       |    64 +
 utests/compiler_math_builtin.cpp                   |     9 +
 utests/compiler_math_constants.cpp                 |     9 +
 utests/compiler_mem_fence.cpp                      |     9 +
 utests/compiler_mixed_pointer.cpp                  |   119 +
 utests/compiler_movforphi_undef.cpp                |    61 +
 utests/compiler_mul24.cpp                          |    36 +
 utests/compiler_mul_hi.cpp                         |    40 +
 utests/compiler_multiple_kernels.cpp               |     8 +
 utests/compiler_preprocessor_macros.cpp            |     9 +
 utests/compiler_private_data_overflow.cpp          |    15 +
 utests/compiler_program_objects.cpp                |    64 +
 utests/compiler_radians.cpp                        |    32 +
 utests/compiler_relational_builtin.cpp             |     9 +
 utests/compiler_rhadd.cpp                          |    41 +
 utests/compiler_rotate.cpp                         |    40 +
 utests/compiler_sampler.cpp                        |    41 +
 utests/compiler_saturate.cpp                       |   114 +
 utests/compiler_saturate_sub.cpp                   |   114 +
 utests/compiler_shader_toy.cpp                     |    87 +
 utests/compiler_shift_right.cpp                    |    45 +
 utests/compiler_short_scatter.cpp                  |    25 +
 utests/compiler_simd_all.cpp                       |    43 +
 utests/compiler_simd_any.cpp                       |    43 +
 utests/compiler_smoothstep.cpp                     |    58 +
 utests/compiler_step.cpp                           |   342 +
 utests/compiler_structure_attributes.cpp           |     9 +
 utests/compiler_switch.cpp                         |    48 +
 utests/compiler_type_casting.cpp                   |    10 +
 utests/compiler_uint16_copy.cpp                    |    35 +
 utests/compiler_uint2_copy.cpp                     |    31 +
 utests/compiler_uint3_copy.cpp                     |    40 +
 utests/compiler_uint3_unaligned_copy.cpp           |    42 +
 utests/compiler_uint8_copy.cpp                     |    35 +
 utests/compiler_unstructured_branch0.cpp           |    55 +
 utests/compiler_unstructured_branch1.cpp           |    54 +
 utests/compiler_unstructured_branch2.cpp           |    68 +
 utests/compiler_unstructured_branch3.cpp           |    58 +
 utests/compiler_upsample_int.cpp                   |    37 +
 utests/compiler_upsample_long.cpp                  |    38 +
 utests/compiler_vect_compare.cpp                   |    44 +
 utests/compiler_vector_inc.cpp                     |    46 +
 utests/compiler_vector_load_store.cpp              |    63 +
 utests/compiler_volatile.cpp                       |     9 +
 utests/compiler_workitem_builtin.cpp               |     9 +
 utests/compiler_write_only.cpp                     |    43 +
 utests/compiler_write_only_bytes.cpp               |    23 +
 utests/compiler_write_only_shorts.cpp              |    24 +
 utests/enqueue_built_in_kernels.cpp                |    19 +
 utests/enqueue_copy_buf.cpp                        |    66 +
 utests/enqueue_copy_buf_unaligned.cpp              |   118 +
 utests/enqueue_fill_buf.cpp                        |    90 +
 utests/get_arg_info.cpp                            |    85 +
 utests/get_cl_info.cpp                             |   641 +
 utests/image_1D_buffer.cpp                         |    80 +
 utests/load_program_from_bin_file.cpp              |    77 +
 utests/load_program_from_gen_bin.cpp               |    93 +
 utests/my_test.cpp                                 |    99 +
 utests/new_data.txt                                |   256 +
 utests/profiling_exec.cpp                          |   102 +
 utests/runtime_barrier_list.cpp                    |    75 +
 utests/runtime_compile_link.cpp                    |   162 +
 utests/runtime_createcontext.cpp                   |    14 +
 utests/runtime_event.cpp                           |    60 +
 utests/runtime_flat_address_space.cpp              |    75 +
 utests/runtime_marker_list.cpp                     |    75 +
 utests/runtime_null_kernel_arg.cpp                 |    27 +
 utests/setenv.sh.in                                |     7 +
 utests/sub_buffer.cpp                              |   135 +
 utests/test_printf.cpp                             |    18 +
 utests/utest.cpp                                   |   183 +
 utests/utest.hpp                                   |   139 +
 utests/utest_assert.cpp                            |    41 +
 utests/utest_assert.hpp                            |    44 +
 utests/utest_error.c                               |    76 +
 utests/utest_error.h                               |    26 +
 utests/utest_exception.hpp                         |    48 +
 utests/utest_file_map.cpp                          |   117 +
 utests/utest_file_map.hpp                          |    84 +
 utests/utest_generator.py                          |   387 +
 utests/utest_helper.cpp                            |   674 +
 utests/utest_helper.hpp                            |   234 +
 utests/utest_math_gen.py                           |   577 +
 utests/utest_run.cpp                               |   118 +
 654 files changed, 120515 insertions(+)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..90fd161
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+*.o
+CMakeCache.txt
+CMakeFiles/
+Makefile
+cmake_install.cmake
diff --git a/CMake/CMakeConfigTemplate.hpp b/CMake/CMakeConfigTemplate.hpp
new file mode 100644
index 0000000..7702c54
--- /dev/null
+++ b/CMake/CMakeConfigTemplate.hpp
@@ -0,0 +1,28 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef CMAKE_CONFIG_HPP
+#define CMAKE_CONFIG_HPP
+
+#define ON true
+#define OFF false
+#define GEN_INSTALLATION_PATH "${CMAKE_INSTALL_PREFIX}/lib/i965/"
+
+#endif /* CMAKE_CONFIG_HPP */
+
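
CMakeConfigTemplate.hpp is a configure_file template: the ${CMAKE_INSTALL_PREFIX}
reference in GEN_INSTALLATION_PATH is substituted at configure time. A minimal
sketch of how a build script might instantiate it; the output header name
CMakeConfig.hpp and its location are assumptions, not taken from this commit:

    # Substitute ${CMAKE_INSTALL_PREFIX} and emit the concrete header into the build tree
    configure_file(
      "${CMAKE_SOURCE_DIR}/CMake/CMakeConfigTemplate.hpp"
      "${CMAKE_BINARY_DIR}/CMakeConfig.hpp"
    )
    # Let sources #include "CMakeConfig.hpp" from the build directory
    include_directories(${CMAKE_BINARY_DIR})
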
diff --git a/CMake/FindLLVM.cmake b/CMake/FindLLVM.cmake
new file mode 100644
index 0000000..556b3a9
--- /dev/null
+++ b/CMake/FindLLVM.cmake
@@ -0,0 +1,107 @@
+# Find the native LLVM includes and library
+#
+# LLVM_INCLUDE_DIR - where to find llvm include files
+# LLVM_LIBRARY_DIR - where to find llvm libs
+# LLVM_CFLAGS      - llvm compiler flags
+# LLVM_LFLAGS      - llvm linker flags
+# LLVM_MODULE_LIBS - list of llvm libs for working with modules.
+# LLVM_FOUND       - True if llvm found.
+if (LLVM_INSTALL_DIR)
+  find_program(LLVM_CONFIG_EXECUTABLE
+               NAMES llvm-config-33 llvm-config-3.3 llvm-config-35 llvm-config-3.5 llvm-config-34 llvm-config-3.4 llvm-config
+               DOC "llvm-config executable"
+               PATHS ${LLVM_INSTALL_DIR} NO_DEFAULT_PATH)
+else (LLVM_INSTALL_DIR)
+  find_program(LLVM_CONFIG_EXECUTABLE
+               NAMES llvm-config-33 llvm-config-3.3 llvm-config-35 llvm-config-3.5 llvm-config-34 llvm-config-3.4 llvm-config
+               DOC "llvm-config executable")
+endif (LLVM_INSTALL_DIR)
+
+if (LLVM_CONFIG_EXECUTABLE)
+  message(STATUS "LLVM llvm-config found at: ${LLVM_CONFIG_EXECUTABLE}")
+else (LLVM_CONFIG_EXECUTABLE)
+  message(FATAL_ERROR "Could NOT find LLVM executable, please add -DLLVM_INSTALL_DIR=/path/to/llvm-config/ in cmake command")
+endif (LLVM_CONFIG_EXECUTABLE)
+
+if (LLVM_FIND_VERSION_MAJOR AND LLVM_FIND_VERSION_MINOR)
+  SET(LLVM_FIND_VERSION_NODOT "${LLVM_FIND_VERSION_MAJOR}${LLVM_FIND_VERSION_MINOR}")
+  execute_process(
+    COMMAND ${LLVM_CONFIG_EXECUTABLE} --version
+    OUTPUT_VARIABLE LLVM_VERSION
+  )
+  string(REGEX REPLACE "([0-9])\\.([0-9]*).*" "\\1\\2 " LLVM_VERSION_NODOT ${LLVM_VERSION})
+  if (LLVM_VERSION_NODOT VERSION_LESS LLVM_FIND_VERSION_NODOT)
+    message(FATAL_ERROR "imcompatible LLVM version ${LLVM_VERSION} required ${LLVM_FIND_VERSION}")
+  else (LLVM_VERSION_NODOT VERSION_LESS LLVM_FIND_VERSION_NODOT)
+    if (LLVM_VERSION_NODOT VERSION_EQUAL LLVM_FIND_VERSION_NODOT)
+      message(STATUS "find stable LLVM version ${LLVM_VERSION}")
+    else (LLVM_VERSION_NODOT VERSION_EQUAL LLVM_FIND_VERSION_NODOT)
+      message(STATUS "find unstable LLVM version ${LLVM_VERSION}")
+    endif (LLVM_VERSION_NODOT VERSION_EQUAL LLVM_FIND_VERSION_NODOT)
+    add_definitions("-DLLVM_${LLVM_VERSION_NODOT}")
+  endif (LLVM_VERSION_NODOT VERSION_LESS LLVM_FIND_VERSION_NODOT)
+endif (LLVM_FIND_VERSION_MAJOR AND LLVM_FIND_VERSION_MINOR)
+
+execute_process(
+  COMMAND ${LLVM_CONFIG_EXECUTABLE} --includedir
+  OUTPUT_VARIABLE LLVM_INCLUDE_DIR
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+execute_process(
+  COMMAND ${LLVM_CONFIG_EXECUTABLE} --libdir
+  OUTPUT_VARIABLE LLVM_LIBRARY_DIR
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+execute_process(
+  COMMAND ${LLVM_CONFIG_EXECUTABLE} --cppflags
+  OUTPUT_VARIABLE LLVM_CFLAGS
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+execute_process(
+  COMMAND ${LLVM_CONFIG_EXECUTABLE} --ldflags
+  OUTPUT_VARIABLE LLVM_LFLAGS
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+execute_process(
+  COMMAND ${LLVM_CONFIG_EXECUTABLE} --libs
+  OUTPUT_VARIABLE LLVM_MODULE_LIBS
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+if (LLVM_VERSION_NODOT VERSION_GREATER 34)
+execute_process(
+  COMMAND ${LLVM_CONFIG_EXECUTABLE} --system-libs
+  OUTPUT_VARIABLE LLVM_SYSTEM_LIBS_ORIG
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+string(REGEX REPLACE " *\n" "" LLVM_SYSTEM_LIBS ${LLVM_SYSTEM_LIBS_ORIG})
+endif (LLVM_VERSION_NODOT VERSION_GREATER 34)
+
+macro(add_one_lib name)
+  FIND_LIBRARY(CLANG_LIB
+    NAMES ${name}
+    PATHS ${LLVM_LIBRARY_DIR} NO_DEFAULT_PATH)
+  set(CLANG_LIBRARIES ${CLANG_LIBRARIES} ${CLANG_LIB})
+	unset(CLANG_LIB CACHE)
+endmacro()
+
+#Assume clang lib path same as llvm lib path
+add_one_lib("clangFrontend")
+add_one_lib("clangSerialization")
+add_one_lib("clangDriver")
+add_one_lib("clangCodeGen")
+add_one_lib("clangSema")
+add_one_lib("clangStaticAnalyzerFrontend")
+add_one_lib("clangStaticAnalyzerCheckers")
+add_one_lib("clangStaticAnalyzerCore")
+add_one_lib("clangAnalysis")
+add_one_lib("clangEdit")
+add_one_lib("clangAST")
+add_one_lib("clangParse")
+add_one_lib("clangSema")
+add_one_lib("clangLex")
+add_one_lib("clangBasic")
diff --git a/CMake/FindMesaSrc.cmake b/CMake/FindMesaSrc.cmake
new file mode 100644
index 0000000..978cb4e
--- /dev/null
+++ b/CMake/FindMesaSrc.cmake
@@ -0,0 +1,26 @@
+#
+# Try to find mesa source code
+# Once done this will define
+#
+# MESA_SOURCE_FOUND
+# MESA_SOURCE_INCLUDES
+#
+
+# Find mesa source code.
+FIND_PATH(MESA_SOURCE_PREFIX src/mesa/main/texobj.c
+  $ENV{MESA_SOURCE_DIR}
+  ${MAKE_CURRENT_SOURCE_DIR}/../mesa
+  ~/mesa
+  DOC "The mesa source directory which is needed for cl_khr_gl_sharing.")
+
+IF(MESA_SOURCE_PREFIX)
+SET(MESA_SOURCE_INCLUDES ${MESA_SOURCE_PREFIX}/src/mesa
+                         ${MESA_SOURCE_PREFIX}/include
+                         ${MESA_SOURCE_PREFIX}/src/mapi
+                         ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/i965/
+                         ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/i915/
+                         ${MESA_SOURCE_PREFIX}/src/mesa/drivers/dri/common/)
+SET(MESA_SOURCE_FOUND 1 CACHE STRING "Set to 1 if mesa source code is found, 0 otherwise")
+ELSE(MESA_SOURCE_PREFIX)
+SET(MESA_SOURCE_FOUND 0 CACHE STRING "Set to 1 if mesa source code is found, 0 otherwise")
+ENDIF(MESA_SOURCE_PREFIX)
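
A minimal sketch of how the two outputs of this module (MESA_SOURCE_FOUND and
MESA_SOURCE_INCLUDES) might be consumed by the runtime build; the HAS_GL_SHARING
macro below is illustrative only and not taken from this commit:

    Find_Package(MesaSrc)
    if (MESA_SOURCE_FOUND)
      # mesa headers are needed to implement cl_khr_gl_sharing
      include_directories(${MESA_SOURCE_INCLUDES})
      add_definitions(-DHAS_GL_SHARING)   # illustrative switch; the real flag may differ
    endif (MESA_SOURCE_FOUND)
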
diff --git a/CMake/FindOCLIcd.cmake b/CMake/FindOCLIcd.cmake
new file mode 100644
index 0000000..b0a8ad7
--- /dev/null
+++ b/CMake/FindOCLIcd.cmake
@@ -0,0 +1,24 @@
+#
+# Try to find ocl_icd library and include path.
+# Once done this will define
+#
+# OCLIcd_FOUND
+# OCLIcd_INCLUDE_PATH
+#
+
+FIND_PATH(OCLIcd_INCLUDE_PATH ocl_icd.h
+  ~/include/
+  /usr/include/
+  /usr/local/include/
+  /sw/include/
+  /opt/local/include/
+  DOC "The directory where ocl_icd.h resides")
+
+IF(OCLIcd_INCLUDE_PATH)
+  INCLUDE_DIRECTORIES(${OCLIcd_INCLUDE_PATH})
+  SET(OCLIcd_FOUND 1 CACHE STRING "Set to 1 if OCLIcd is found, 0 otherwise")
+ELSE(OCLIcd_INCLUDE_PATH)
+  SET(OCLIcd_FOUND 0 CACHE STRING "Set to 1 if OCLIcd is found, 0 otherwise")
+ENDIF(OCLIcd_INCLUDE_PATH)
+
+MARK_AS_ADVANCED(OCLIcd_FOUND)
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..ac59859
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,171 @@
+
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0)
+PROJECT(OCL)
+set (LIBCL_DRIVER_VERSION_MAJOR 0)
+set (LIBCL_DRIVER_VERSION_MINOR 9)
+set (LIBCL_DRIVER_VERSION_PATCH 3)
+set (LIBCL_C_VERSION_MAJOR 1)
+set (LIBCL_C_VERSION_MINOR 2)
+
+configure_file (
+  "src/OCLConfig.h.in"
+  "src/OCLConfig.h"
+)
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
+
+INCLUDE (FindPkgConfig)
+
+SET(CMAKE_VERBOSE_MAKEFILE "false")
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMake/")
+if (NOT LIB_INSTALL_DIR)
+  set (LIB_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/lib")
+endif (NOT LIB_INSTALL_DIR)
+if (NOT BEIGNET_INSTALL_DIR)
+  set (BEIGNET_INSTALL_DIR "${LIB_INSTALL_DIR}/beignet/")
+endif (NOT BEIGNET_INSTALL_DIR)
+SET(EMULATE_IVB false CACHE BOOL "To emulate IVB")
+SET(EMULATE_SNB false CACHE BOOL "To emulate SNB")
+SET(EMULATE_HSW false CACHE BOOL "To emulate HSW")
+ADD_DEFINITIONS(-D__$(USER)__)
+
+# Force Release with debug info
+if (NOT CMAKE_BUILD_TYPE)
+  set (CMAKE_BUILD_TYPE RelWithDebInfo)
+endif (NOT CMAKE_BUILD_TYPE)
+set (CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "assure config" FORCE)
+message(STATUS "Building mode: " ${CMAKE_BUILD_TYPE})
+
+SET(CMAKE_CXX_FLAGS_DEBUGO0 "-O0 -g")
+SET(CMAKE_C_FLAGS_DEBUGO0 "-O0 -g")
+
+IF (EMULATE_HSW)
+  SET (USE_FULSIM "true")
+  ADD_DEFINITIONS(-DEMULATE_GEN=75)
+ELSEIF (EMULATE_IVB)
+  SET (USE_FULSIM "true")
+  ADD_DEFINITIONS(-DEMULATE_GEN=7)
+ELSEIF (EMULATE_SNB)
+  SET (USE_FULSIM "true")
+  ADD_DEFINITIONS(-DEMULATE_GEN=6)
+ELSE (EMULATE_IVB)
+  SET (USE_FULSIM "false")
+  ADD_DEFINITIONS(-DEMULATE_GEN=0)
+ENDIF (EMULATE_HSW)
+
+# XXX now hard coded to enable the clamp to border workaround for IVB.
+ADD_DEFINITIONS(-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
+
+IF (USE_FULSIM)
+  ADD_DEFINITIONS(-DUSE_FULSIM=1)
+ELSE (USE_FULSIM)
+  ADD_DEFINITIONS(-DUSE_FULSIM=0)
+ENDIF (USE_FULSIM)
+
+SET(CMAKE_CXX_FLAGS "-Wall -Wno-invalid-offsetof -mfpmath=sse -fno-rtti -Wcast-align -std=c++0x -msse2 -msse3 -mssse3 -msse4.1 ${CMAKE_CXX_FLAGS}")
+SET(CMAKE_C_FLAGS "-Wall -mfpmath=sse -msse2 -Wcast-align -msse2 -msse3 -mssse3 -msse4.1 ${CMAKE_C_FLAGS}")
+
+# Front end stuff we need
+#INCLUDE(CMake/FindLLVM.cmake)
+Find_Package(LLVM 3.3)
+
+# XLib
+Find_Package(X11)
+IF(X11_FOUND)
+  MESSAGE(STATUS "Looking for XLib - found")
+ELSE(X11_FOUND)
+  MESSAGE(STATUS "Looking for XLib - not found")
+ENDIF(X11_FOUND)
+
+# DRM
+pkg_check_modules(DRM REQUIRED libdrm)
+IF(DRM_FOUND)
+  MESSAGE(STATUS "Looking for DRM - found at ${DRM_PREFIX}")
+  INCLUDE_DIRECTORIES(${DRM_INCLUDE_DIRS})
+ELSE(DRM_FOUND)
+  MESSAGE(STATUS "Looking for DRM - not found")
+ENDIF(DRM_FOUND)
+
+# DRM Intel
+pkg_check_modules(DRM_INTEL REQUIRED libdrm_intel)
+IF(DRM_INTEL_FOUND)
+  INCLUDE_DIRECTORIES(${DRM_INTEL_INCLUDE_DIRS})
+  MESSAGE(STATUS "Looking for DRM Intel - found at ${DRM_INTEL_PREFIX}")
+ELSE(DRM_INTEL_FOUND)
+  MESSAGE(STATUS "Looking for DRM Intel - not found")
+ENDIF(DRM_INTEL_FOUND)
+
+# Threads
+Find_Package(Threads)
+
+IF(X11_FOUND)
+# OpenGL (not use cmake helper)
+pkg_check_modules(OPENGL gl)
+IF(OPENGL_FOUND)
+  INCLUDE_DIRECTORIES(${OPENGL_INCLUDE_DIRS})
+  MESSAGE(STATUS "Looking for OpenGL - found at ${OPENGL_PREFIX}")
+ELSE(OPENGL_FOUND)
+  MESSAGE(STATUS "Looking for OpenGL - not found")
+ENDIF(OPENGL_FOUND)
+
+# Xext
+pkg_check_modules(XEXT REQUIRED xext)
+IF(XEXT_FOUND)
+  INCLUDE_DIRECTORIES(${XEXT_INCLUDE_DIRS})
+  MESSAGE(STATUS "Looking for Xext - found at ${XEXT_PREFIX}")
+ELSE(XEXT_FOUND)
+  MESSAGE(STATUS "Looking for Xext - not found")
+ENDIF(XEXT_FOUND)
+
+# Xfixes
+pkg_check_modules(XFIXES REQUIRED xfixes)
+IF(XFIXES_FOUND)
+  INCLUDE_DIRECTORIES(${XFIXES_INCLUDE_DIRS})
+  MESSAGE(STATUS "Looking for Xfixes - found at ${XFIXES_PREFIX}")
+ELSE(XFIXES_FOUND)
+  MESSAGE(STATUS "Looking for Xfixes - not found")
+ENDIF(XFIXES_FOUND)
+ENDIF(X11_FOUND)
+
+pkg_check_modules(EGL egl)
+IF(EGL_FOUND)
+  MESSAGE(STATUS "Looking for EGL - found at ${EGL_PREFIX}")
+ELSE(EGL_FOUND)
+  MESSAGE(STATUS "Looking for EGL - not found")
+ENDIF(EGL_FOUND)
+
+# cl_khr_gl_sharing requires to build with mesa source
+Find_Package(MesaSrc)
+IF(MESA_SOURCE_FOUND)
+  MESSAGE(STATUS "Looking for mesa source code - found at ${MESA_SOURCE_PREFIX}")
+ELSE(MESA_SOURCE_FOUND)
+  MESSAGE(STATUS "Looking for mesa source code - not found, cl_khr_gl_sharing will be disabled.")
+ENDIF(MESA_SOURCE_FOUND)
+
+Find_Package(OCLIcd)
+IF(OCLIcd_FOUND)
+  MESSAGE(STATUS "Looking for OCL ICD header file - found")
+  configure_file (
+    "intel-beignet.icd.in"
+    "intel-beignet.icd"
+  )
+  install (FILES ${CMAKE_CURRENT_BINARY_DIR}/intel-beignet.icd DESTINATION /etc/OpenCL/vendors)
+ELSE(OCLIcd_FOUND)
+  MESSAGE(STATUS "Looking for OCL ICD header file - not found")
+ENDIF(OCLIcd_FOUND)
+
+Find_Package(PythonInterp)
+
+ADD_SUBDIRECTORY(include)
+ADD_SUBDIRECTORY(backend)
+ADD_SUBDIRECTORY(src)
+ADD_SUBDIRECTORY(utests)
+ADD_SUBDIRECTORY(benchmark)
+
+SET(CPACK_PACKAGE_VERSION_MAJOR "${LIBCL_DRIVER_VERSION_MAJOR}")
+SET(CPACK_PACKAGE_VERSION_MINOR "${LIBCL_DRIVER_VERSION_MINOR}")
+SET(CPACK_PACKAGE_VERSION_PATCH "${LIBCL_DRIVER_VERSION_PATCH}")
+SET(CPACK_SOURCE_GENERATOR "TGZ;TZ")
+SET(CPACK_PACKAGE_NAME "Beignet")
+SET(CPACK_PACKAGE_VENDOR "Intel Open Source Technology Center")
+INCLUDE(CPack)
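
The top-level CMakeLists.txt exposes several cache options (EMULATE_IVB,
EMULATE_SNB, EMULATE_HSW, LIB_INSTALL_DIR, BEIGNET_INSTALL_DIR), and
FindLLVM.cmake additionally honors LLVM_INSTALL_DIR. A minimal sketch of an
initial-cache file that could be passed as "cmake -C beignet-init.cmake <srcdir>"
to pre-seed those options; the file name and all paths below are assumptions:

    # beignet-init.cmake (hypothetical)
    set(CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "build type")
    set(EMULATE_IVB true CACHE BOOL "To emulate IVB")
    set(LLVM_INSTALL_DIR "/opt/llvm-3.3/bin" CACHE PATH "directory containing llvm-config")
    set(BEIGNET_INSTALL_DIR "/usr/local/lib/beignet/" CACHE PATH "beignet runtime install directory")
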
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..4362b49
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,502 @@
+                  GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+

+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard.  To achieve this, non-free programs must be
+allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+

+                  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+

+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+

+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+

+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at
+    least three years, to give the same user the materials
+    specified in Subsection 6a, above, for a charge no more
+    than the cost of performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+

+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+

+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded.  In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+

+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+                            NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+

+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms of the
+ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.  It is
+safest to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
diff --git a/NEWS.mdwn b/NEWS.mdwn
new file mode 120000
index 0000000..dc4cb4b
--- /dev/null
+++ b/NEWS.mdwn
@@ -0,0 +1 @@
+docs/NEWS.mdwn
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 120000
index 0000000..b9f23a8
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+docs/Beignet.mdwn
\ No newline at end of file
diff --git a/backend/CMakeLists.txt b/backend/CMakeLists.txt
new file mode 100644
index 0000000..6a31c68
--- /dev/null
+++ b/backend/CMakeLists.txt
@@ -0,0 +1,108 @@
+project (GBE)
+set (LIBGBE_VERSION_MAJOR 0)
+set (LIBGBE_VERSION_MINOR 2)
+cmake_minimum_required (VERSION 2.6.0)
+
+set (GBE_CMAKE_DIR "${GBE_SOURCE_DIR}/cmake")
+set (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${GBE_CMAKE_DIR}")
+
+##############################################################
+# Compilation directives
+##############################################################
+
+set (GBE_DEBUG_MEMORY false CACHE bool "Activate the memory debugger")
+set (GBE_USE_BLOB false CACHE bool "Compile everything from one big file")
+
+##############################################################
+# Compiler
+##############################################################
+if (UNIX)
+  set (COMPILER "GCC" CACHE INT "Compiler to choose on Linux (GCC,ICC,CLANG)")
+endif (UNIX)
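+# COMPILER, like GBE_DEBUG_MEMORY and GBE_USE_BLOB above, is a plain CMake cache
+# entry, so it can be chosen at configure time, e.g. "cmake -DCOMPILER=CLANG ..".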
+
+# Force Release with debug info
+if (NOT CMAKE_BUILD_TYPE)
+  set (CMAKE_BUILD_TYPE RelWithDebInfo)
+endif (NOT CMAKE_BUILD_TYPE)
+set (CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "assure config" FORCE)
+message(STATUS "Building mode: " ${CMAKE_BUILD_TYPE})
+
+if (GBE_DEBUG_MEMORY)
+  set (GBE_DEBUG_MEMORY_FLAG "-DGBE_DEBUG_MEMORY=1")
+else (GBE_DEBUG_MEMORY)
+  set (GBE_DEBUG_MEMORY_FLAG "-DGBE_DEBUG_MEMORY=0")
+endif (GBE_DEBUG_MEMORY)
+
+# Hide all symbols and allow only symbols explicitly declared as visible to be exported
+set (CMAKE_C_CXX_FLAGS "-fvisibility=hidden -DGBE_COMPILER_AVAILABLE=1 ${CMAKE_C_CXX_FLAGS}")
+
+if (COMPILER STREQUAL "GCC")
+  set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -funroll-loops -Wstrict-aliasing=2 -fstrict-aliasing -msse2 -msse3 -mssse3 -msse4.1 -fPIC -Wall")
+  set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS}  ${LLVM_CFLAGS}")
+  set (CMAKE_CXX_FLAGS "${CMAKE_C_CXX_FLAGS}  -Wno-invalid-offsetof -fno-rtti -std=c++0x")
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_DEBUG_MEMORY_FLAG}")
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_COMPILE_UTESTS_FLAG}")
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-E")
+  set (CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-undefined ${LLVM_LFLAGS}")
+  set (CMAKE_CXX_FLAGS_DEBUG          "-g -DGBE_DEBUG=1")
+  set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
+  set (CMAKE_CXX_FLAGS_MINSIZEREL     "-Os -DNDEBUG -DGBE_DEBUG=0")
+  set (CMAKE_CXX_FLAGS_RELEASE        "-O2 -DNDEBUG -DGBE_DEBUG=0")
+  set (CMAKE_C_FLAGS "${CMAKE_C_CXX_FLAGS}")
+  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GBE_DEBUG_MEMORY_FLAG}")
+  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GBE_COMPILE_UTESTS_FLAG}")
+  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wl,-E")
+  set (CMAKE_C_FLAGS_DEBUG          "-g -DGBE_DEBUG=1")
+  set (CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
+  set (CMAKE_C_FLAGS_MINSIZEREL     "-Os -DNDEBUG -DGBE_DEBUG=0")
+  set (CMAKE_C_FLAGS_RELEASE        "-O2 -DNDEBUG -DGBE_DEBUG=0")
+elseif (COMPILER STREQUAL "CLANG")
+  set (CMAKE_C_COMPILER             "clang")
+  set (CMAKE_C_FLAGS                "-Wall -std=c99")
+  set (CMAKE_C_FLAGS_DEBUG          "-g -DGBE_DEBUG=1")
+  set (CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
+  set (CMAKE_C_FLAGS_MINSIZEREL     "-Os -DNDEBUG -DGBE_DEBUG=0")
+  set (CMAKE_C_FLAGS_RELEASE        "-O2 -DNDEBUG -DGBE_DEBUG=0")
+  set (CMAKE_CXX_COMPILER             "clang++")
+  set (CMAKE_CXX_FLAGS "-fstrict-aliasing -msse2 -fPIC -Wall -Wno-format-security -Wno-invalid-offsetof -std=c++0x")
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_DEBUG_MEMORY_FLAG}")
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_COMPILE_UTESTS_FLAG}")
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${VISIBILITY_FLAG}")
+  set (CMAKE_CXX_FLAGS_DEBUG          "-g -DGBE_DEBUG=1")
+  set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
+  set (CMAKE_CXX_FLAGS_MINSIZEREL     "-Os -DNDEBUG -DGBE_DEBUG=0")
+  set (CMAKE_CXX_FLAGS_RELEASE        "-O2 -DNDEBUG -DGBE_DEBUG=0")
+  set (CMAKE_AR      "/usr/bin/llvm-ar")
+  set (CMAKE_LINKER  "/usr/bin/llvm-ld")
+  set (CMAKE_NM      "/usr/bin/llvm-nm")
+  set (CMAKE_OBJDUMP "/usr/bin/llvm-objdump")
+  set (CMAKE_RANLIB  "ranlib")
+elseif (COMPILER STREQUAL "ICC")
+  set (CMAKE_CXX_COMPILER "icpc")
+  set (CMAKE_C_COMPILER "icc")
+  set (CMAKE_CXX_FLAGS "-std=c++0x -wd2928 -Wall -fPIC -fstrict-aliasing -fp-model fast -xSSE2")
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_DEBUG_MEMORY_FLAG}")
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_COMPILE_UTESTS_FLAG}")
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${VISIBILITY_FLAG} -Wl,-E")
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_DEBUG_MODE_FLAG}")
+  set (CMAKE_CXX_FLAGS_DEBUG "-g -O0 -DGBE_DEBUG=1")
+  set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "-g -O2 -DGBE_DEBUG=1")
+  set (CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG -O2 -DGBE_DEBUG=0")
+  set (CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DGBE_DEBUG=0")
+  set (CMAKE_EXE_LINKER_FLAGS "")
+endif ()
+
+include_directories (${CMAKE_CURRENT_BINARY_DIR})
+##############################################################
+# Project source code
+##############################################################
+add_subdirectory (src)
+set(LOCAL_PCH_OBJECT_DIR ${LOCAL_PCH_OBJECT_DIR} PARENT_SCOPE)
+set(LOCAL_PCM_OBJECT_DIR ${LOCAL_PCM_OBJECT_DIR} PARENT_SCOPE)
+set(LOCAL_GBE_OBJECT_DIR ${LOCAL_GBE_OBJECT_DIR} PARENT_SCOPE)
+set(LOCAL_INTERP_OBJECT_DIR ${LOCAL_INTERP_OBJECT_DIR} PARENT_SCOPE)
+
+set (GBE_BIN_GENERATER
+     OCL_PCM_PATH=${LOCAL_PCM_OBJECT_DIR} OCL_PCH_PATH=${LOCAL_PCH_OBJECT_DIR} LD_LIBRARY_PATH=${CMAKE_CURRENT_BINARY_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src/gbe_bin_generater
+     PARENT_SCOPE)
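+# GBE_BIN_GENERATER is exported to the parent scope as the full command line
+# (environment variables plus binary path) that other parts of the build can use
+# to run the freshly built gbe_bin_generater against the in-tree PCH/PCM files.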
+
diff --git a/backend/kernels/compile.sh b/backend/kernels/compile.sh
new file mode 100755
index 0000000..f6bb834
--- /dev/null
+++ b/backend/kernels/compile.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
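+# Usage: ./compile.sh some_kernel.cl
+# Compiles the OpenCL source to LLVM bitcode for the nvptx target, disassembles
+# it with llvm-dis, and leaves the readable IR next to the input as some_kernel.cl.ll.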
+clang -emit-llvm -O3 -target nvptx -c $1 -o $1.o
+llvm-dis $1.o
+rm $1.o
+mv $1.o.ll $1.ll
+
diff --git a/backend/src/.gitignore b/backend/src/.gitignore
new file mode 100644
index 0000000..d0ee832
--- /dev/null
+++ b/backend/src/.gitignore
@@ -0,0 +1,7 @@
+GBEConfig.h
+libgbe.so
+ocl_common_defines_str.cpp
+ocl_stdlib.h
+ocl_stdlib.h.pch
+ocl_stdlib_str.cpp
+ocl_vector.h
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
new file mode 100644
index 0000000..a3818ab
--- /dev/null
+++ b/backend/src/CMakeLists.txt
@@ -0,0 +1,236 @@
+set (ocl_vector_spec_file ${GBE_SOURCE_DIR}/src/builtin_vector_proto.def)
+set (ocl_vector_file ${GBE_SOURCE_DIR}/src/ocl_vector.h)
+set (ocl_as_file ${GBE_SOURCE_DIR}/src/ocl_as.h)
+set (ocl_convert_file ${GBE_SOURCE_DIR}/src/ocl_convert.h)
+set (ocl_stdlib_tmpl_file ${GBE_SOURCE_DIR}/src/ocl_stdlib.tmpl.h)
+set (ocl_common_header_file ${GBE_SOURCE_DIR}/src/ocl_common_defines.h)
+set (ocl_blob_file ${CMAKE_CURRENT_BINARY_DIR}${BEIGNET_INSTALL_DIR}ocl_stdlib.h)
+set (ocl_blob_cpp_file ${GBE_SOURCE_DIR}/src/ocl_stdlib_str.cpp)
+set (ocl_gen_blob_cmd ${GBE_SOURCE_DIR}/src/update_blob_ocl_header.py)
+set (ocl_gen_vector_cmd ${GBE_SOURCE_DIR}/src/gen_builtin_vector.py)
+
+set (string_header "\\\"string\\\"")
+add_custom_command(
+    OUTPUT ${ocl_blob_cpp_file}
+    COMMAND rm -rf ${ocl_blob_cpp_file}
+    COMMAND echo "\\\#include ${string_header}" >> ${ocl_blob_cpp_file}
+    COMMAND echo "namespace gbe {" >> ${ocl_blob_cpp_file}
+    COMMAND echo "std::string ocl_stdlib_str = " >> ${ocl_blob_cpp_file}
+    # Yeah!!! welcome to back slash hell
+    COMMAND cat ${ocl_blob_file} |sed 's/\\\\/\\\\\\\\/g' | sed 's/\\\"/\\\\\\\"/g' | awk '{ printf \(\"\\"%s\\\\n\\"\\n\", $$0\) }' >> ${ocl_blob_cpp_file}
+    COMMAND echo "\;" >> ${ocl_blob_cpp_file}
+    COMMAND echo "}" >> ${ocl_blob_cpp_file}
+    COMMAND echo "" >> ${ocl_blob_cpp_file}
+    DEPENDS ${ocl_blob_file})
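+# The command above generates ocl_stdlib_str.cpp, which embeds the expanded
+# ocl_stdlib.h as a C++ string (gbe::ocl_stdlib_str) compiled into libgbe; the
+# sed/awk pipeline only escapes backslashes and quotes and wraps each input line
+# in a quoted "...\n" string literal.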
+
+set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES "ocl_vector.h;${ocl_blob_file}")
+
+add_custom_command(
+  OUTPUT ${ocl_vector_file}
+  COMMAND ${PYTHON_EXECUTABLE} ${ocl_gen_vector_cmd} ${ocl_vector_spec_file} ${ocl_vector_file}
+  DEPENDS ${ocl_gen_vector_cmd} ${ocl_vector_spec_file}
+  )
+
+add_custom_command(
+  OUTPUT ${ocl_blob_file}
+  COMMAND mkdir -p ${CMAKE_CURRENT_BINARY_DIR}/${BEIGNET_INSTALL_DIR}
+  COMMAND ${PYTHON_EXECUTABLE} ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_blob_file}
+  DEPENDS ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_common_header_file} ${ocl_vector_file} ${ocl_as_file} ${ocl_convert_file}
+  )
+
+set (pch_object ${ocl_blob_file}.pch)
+set (local_pch_object ${ocl_blob_file}.local.pch)
+# generate pch object
+if (LLVM_VERSION_NODOT VERSION_GREATER 32)
+    set (clang_cmd -cc1 -x cl -triple spir -ffp-contract=off -cl-kernel-arg-info)
+else (LLVM_VERSION_NODOT VERSION_GREATER 32)
+    if (LLVM_VERSION_NODOT VERSION_GREATER 31)
+        set (clang_cmd -cc1 -x cl -triple nvptx -ffp-contract=off)
+    else (LLVM_VERSION_NODOT VERSION_GREATER 31)
+        set (clang_cmd -cc1 -x cl -triple ptx32)
+    endif (LLVM_VERSION_NODOT VERSION_GREATER 31)
+endif (LLVM_VERSION_NODOT VERSION_GREATER 32)
+set (clang_cmd ${clang_cmd} -cl-std=CL1.2 -fno-builtin -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
+
+add_custom_command(
+     OUTPUT ${pch_object}
+     COMMAND rm -f ${pch_object}
+     COMMAND ${LLVM_INSTALL_DIR}clang ${clang_cmd} --relocatable-pch -emit-pch -isysroot ${CMAKE_CURRENT_BINARY_DIR} ${ocl_blob_file} -o ${pch_object}
+     COMMAND ${LLVM_INSTALL_DIR}clang ${clang_cmd} -emit-pch ${ocl_blob_file} -o ${local_pch_object}
+     DEPENDS ${ocl_blob_file}
+     )
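+# Two PCH variants are produced: a relocatable one (${pch_object}) that is
+# installed next to libgbe, and a local one (${local_pch_object}) exported below
+# via LOCAL_PCH_OBJECT_DIR for in-tree use by libcl and the utests.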
+
+add_custom_target(pch_object
+                  DEPENDS ${pch_object})
+
+macro(ll_add_library ll_lib ll_sources)
+  foreach (ll ${${ll_sources}})
+  add_custom_command(
+       OUTPUT  ${ll}.bc
+       COMMAND rm -f ${ll}.bc
+       COMMAND ${LLVM_INSTALL_DIR}llvm-as -o ${ll}.bc ${GBE_SOURCE_DIR}/src/${ll}
+       DEPENDS ${ll}
+       )
+  set (ll_objects ${ll_objects} ${ll}.bc)
+  endforeach (ll ${ll_sources})
+  add_custom_command(
+       OUTPUT ${ll_lib}
+       COMMAND ${LLVM_INSTALL_DIR}llvm-link -o ${ll_lib} ${ll_objects}
+       DEPENDS ${ll_objects}
+       )
+  add_custom_target(${ll_lib}
+                    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${ll_lib})
+endmacro(ll_add_library)
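+# ll_add_library assembles each LLVM IR (.ll) source with llvm-as and links the
+# resulting bitcode files into a single module with llvm-link; it is used below
+# to build the precompiled module beignet.bc from ocl_barrier.ll, ocl_memset.ll
+# and ocl_memcpy.ll.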
+
+if (GBE_USE_BLOB)
+  set (GBE_SRC
+       blob.cpp
+       backend/gen/gen_mesa_disasm.c)
+else (GBE_USE_BLOB)
+  set (GBE_SRC
+    ${ocl_blob_file}
+    ocl_stdlib_str.cpp  # this file is auto-generated.
+    sys/vector.hpp
+    sys/hash_map.hpp
+    sys/map.hpp
+    sys/set.hpp
+    sys/intrusive_list.hpp
+    sys/intrusive_list.cpp
+    sys/exception.hpp
+    sys/assert.cpp
+    sys/assert.hpp
+    sys/alloc.cpp
+    sys/alloc.hpp
+    sys/mutex.cpp
+    sys/mutex.hpp
+    sys/platform.cpp
+    sys/platform.hpp
+    sys/cvar.cpp
+    sys/cvar.hpp
+    ir/context.cpp
+    ir/context.hpp
+    ir/profile.cpp
+    ir/profile.hpp
+    ir/type.cpp
+    ir/type.hpp
+    ir/unit.cpp
+    ir/unit.hpp
+    ir/constant.cpp
+    ir/constant.hpp
+    ir/sampler.cpp
+    ir/sampler.hpp
+    ir/image.cpp
+    ir/image.hpp
+    ir/instruction.cpp
+    ir/instruction.hpp
+    ir/liveness.cpp
+    ir/register.cpp
+    ir/register.hpp
+    ir/function.cpp
+    ir/function.hpp
+    ir/value.cpp
+    ir/value.hpp
+    ir/lowering.cpp
+    ir/lowering.hpp
+    ir/printf.cpp
+    ir/printf.hpp
+    ir/immediate.hpp
+    ir/immediate.cpp
+    backend/context.cpp
+    backend/context.hpp
+    backend/program.cpp
+    backend/program.hpp
+    backend/program.h
+    llvm/llvm_gen_backend.cpp
+    llvm/llvm_passes.cpp
+    llvm/llvm_scalarize.cpp
+    llvm/llvm_intrinsic_lowering.cpp
+    llvm/llvm_barrier_nodup.cpp
+    llvm/llvm_printf_parser.cpp
+    llvm/llvm_to_gen.cpp
+    llvm/llvm_loadstore_optimization.cpp
+    llvm/llvm_gen_backend.hpp
+    llvm/llvm_gen_ocl_function.hxx
+    llvm/llvm_to_gen.hpp
+    backend/gen/gen_mesa_disasm.c
+    backend/gen_insn_selection.cpp
+    backend/gen_insn_selection.hpp
+    backend/gen_insn_scheduling.cpp
+    backend/gen_insn_scheduling.hpp
+    backend/gen_reg_allocation.cpp
+    backend/gen_reg_allocation.hpp
+    backend/gen_context.cpp
+    backend/gen75_context.hpp
+    backend/gen75_context.cpp
+    backend/gen_program.cpp
+    backend/gen_program.hpp
+    backend/gen_program.h
+    backend/gen_defs.hpp
+    backend/gen_insn_compact.cpp
+    backend/gen_encoder.hpp
+    backend/gen_encoder.cpp
+    backend/gen75_encoder.hpp
+    backend/gen75_encoder.cpp
+    )
+
+endif (GBE_USE_BLOB)
+
+include_directories (.)
+link_directories (${LLVM_LIBRARY_DIRS} ${DRM_LIBDIR})
+include_directories(${LLVM_INCLUDE_DIRS})
+add_library (gbe SHARED ${GBE_SRC})
+
+# For the precompiled module (PCM) library.
+set (pcm_lib "beignet.bc")
+set (pcm_sources ocl_barrier.ll ocl_memset.ll ocl_memcpy.ll)
+ll_add_library (${pcm_lib} pcm_sources)
+
+ADD_DEPENDENCIES (gbe pch_object ${pcm_lib})
+target_link_libraries(
+                      gbe
+                      ${DRM_INTEL_LIBRARIES}
+                      ${DRM_LIBRARIES}
+                      ${CLANG_LIBRARIES}
+                      ${LLVM_MODULE_LIBS}
+                      ${LLVM_SYSTEM_LIBS}
+                      ${CMAKE_THREAD_LIBS_INIT}
+                      ${CMAKE_DL_LIBS})
+
+add_library(gbeinterp SHARED gbe_bin_interpreter.cpp)
+
+if (LLVM_VERSION_NODOT VERSION_EQUAL 34)
+  find_library(TERMINFO NAMES tinfo ncurses)
+  if (${TERMINFO} STREQUAL TERMINFO-NOTFOUND)
+    message(FATAL_ERROR "no libtinfo or libncurses is found in system")
+  else (${TERMINFO} STREQUAL TERMINFO-NOTFOUND)
+    target_link_libraries(gbe ${TERMINFO})
+    message(STATUS "use ${TERMINFO} as terminal control library")
+  endif (${TERMINFO} STREQUAL TERMINFO-NOTFOUND)
+endif(LLVM_VERSION_NODOT VERSION_EQUAL 34)
+
+link_directories (${LLVM_LIBRARY_DIR} ${DRM_LIBDIR})
+ADD_EXECUTABLE(gbe_bin_generater gbe_bin_generater.cpp)
+TARGET_LINK_LIBRARIES(gbe_bin_generater gbe)
+
+install (TARGETS gbe LIBRARY DESTINATION ${BEIGNET_INSTALL_DIR})
+install (TARGETS gbeinterp LIBRARY DESTINATION ${BEIGNET_INSTALL_DIR})
+#install (FILES backend/program.h DESTINATION include/gen)
+install (FILES ${ocl_blob_file} DESTINATION ${BEIGNET_INSTALL_DIR})
+install (FILES ${pch_object} DESTINATION ${BEIGNET_INSTALL_DIR})
+install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${pcm_lib} DESTINATION ${BEIGNET_INSTALL_DIR})
+# When building beignet itself, we need to export the local precompiled header file and precompiled
+# module file to libcl and utests.
+set (LOCAL_PCH_OBJECT_DIR "${local_pch_object}:${BEIGNET_INSTALL_DIR}/ocl_stdlib.h.pch" PARENT_SCOPE)
+set (LOCAL_PCM_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/${pcm_lib}:${BEIGNET_INSTALL_DIR}/${pcm_lib}" PARENT_SCOPE)
+set (LOCAL_GBE_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/libgbe.so" PARENT_SCOPE)
+set (LOCAL_INTERP_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/libgbeinterp.so" PARENT_SCOPE)
+
+set (PCH_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/ocl_stdlib.h.pch")
+set (PCM_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/${pcm_lib}")
+set (GBE_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbe.so")
+set (INTERP_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbeinterp.so")
+configure_file (
+  "GBEConfig.h.in"
+  "GBEConfig.h"
+)
diff --git a/backend/src/GBEConfig.h.in b/backend/src/GBEConfig.h.in
new file mode 100644
index 0000000..f5c69c6
--- /dev/null
+++ b/backend/src/GBEConfig.h.in
@@ -0,0 +1,7 @@
+// the configured options and settings for LIBGBE
+#define LIBGBE_VERSION_MAJOR @LIBGBE_VERSION_MAJOR@
+#define LIBGBE_VERSION_MINOR @LIBGBE_VERSION_MINOR@
+#define PCH_OBJECT_DIR "@PCH_OBJECT_DIR@"
+#define PCM_OBJECT_DIR "@PCM_OBJECT_DIR@"
+#define GBE_OBJECT_DIR "@GBE_OBJECT_DIR@"
+#define INTERP_OBJECT_DIR "@INTERP_OBJECT_DIR@"
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
new file mode 100644
index 0000000..e09a309
--- /dev/null
+++ b/backend/src/backend/context.cpp
@@ -0,0 +1,585 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file context.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "backend/context.hpp"
+#include "backend/program.hpp"
+#include "backend/gen_encoder.hpp"
+#include "ir/unit.hpp"
+#include "ir/function.hpp"
+#include "ir/profile.hpp"
+#include "ir/liveness.hpp"
+#include "ir/value.hpp"
+#include "ir/image.hpp"
+#include "sys/cvar.hpp"
+#include <algorithm>
+
+namespace gbe
+{
+  class SimpleAllocator
+  {
+  public:
+    SimpleAllocator(int16_t startOffset, int16_t size, bool _assertFail);
+    ~SimpleAllocator(void);
+
+    /*! Allocate some memory from the pool.
+     */
+    int16_t allocate(int16_t size, int16_t alignment, bool bFwd=false);
+
+    /*! Free the given register file piece */
+    void deallocate(int16_t offset);
+
+    /*! Split a block into two blocks */
+    void splitBlock(int16_t offset, int16_t subOffset);
+
+  protected:
+    /*! Double chained list of free spaces */
+    struct Block {
+      Block(int16_t offset, int16_t size) :
+        prev(NULL), next(NULL), offset(offset), size(size) {}
+      Block *prev, *next; //!< Previous and next free blocks
+      int16_t offset;        //!< Where the free block starts
+      int16_t size;          //!< Size of the free block
+    };
+
+    /*! Try to coalesce two blocks (left and right). They must be in that order.
+     *  If the coalescing was done, the left block is deleted.
+     */
+    void coalesce(Block *left, Block *right);
+    /*! Highest end offset reached by any allocation (high-water mark) */
+    int16_t maxOffset;
+    /*! Whether to trigger an assertion on allocation failure */
+    bool assertFail;
+    /*! Head and tail of the free list */
+    Block *head;
+    Block *tail;
+    /*! Handle free list element allocation */
+    DECL_POOL(Block, blockPool);
+    /*! Track allocated memory blocks <offset, size> */
+    map<int16_t, int16_t> allocatedBlocks;
+    /*! Use custom allocators */
+    GBE_CLASS(SimpleAllocator);
+  };
+
+  /*! Structure that keeps track of allocation in the register file. This is
+   *  actually needed by Context (and not only by GenContext) because both the
+   *  simulator and the hardware have to deal with constant pushing, which uses
+   *  the register file.
+   *
+   *  Since Gen is pretty flexible, we just reuse the SimpleAllocator.
+   */
+
+  class RegisterAllocator: public SimpleAllocator {
+  public:
+    RegisterAllocator(int16_t offset, int16_t size): SimpleAllocator(offset, size, false) {}
+
+    GBE_CLASS(RegisterAllocator);
+  };
+
+  /*!
+   * An allocator for scratch memory. Scratch memory is used for register spilling.
+   * You can query how much scratch memory is needed through getMaxScatchMemUsed().
+   */
+
+  class ScratchAllocator: public SimpleAllocator {
+  public:
+    ScratchAllocator(int16_t size): SimpleAllocator(0, size, true) {}
+    int16_t getMaxScatchMemUsed() { return maxOffset; }
+
+    GBE_CLASS(ScratchAllocator);
+  };
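+  /* Illustrative usage of these allocators (not part of the upstream code):
+   *
+   *   ScratchAllocator scratch(12*KB);
+   *   const int16_t off = scratch.allocate(64, 32, true); // 64 bytes, 32-byte aligned, forward scan
+   *   scratch.deallocate(off);
+   *
+   * allocate() returns an aligned offset inside the managed range and records
+   * the <offset, size> pair in allocatedBlocks so that deallocate() can recover
+   * the size later; ScratchAllocator then reports the high-water mark through
+   * getMaxScatchMemUsed().
+   */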
+
+  SimpleAllocator::SimpleAllocator(int16_t startOffset,
+                                   int16_t size,
+                                   bool _assertFail)
+                                  : maxOffset(0),
+                                  assertFail(_assertFail){
+    tail = head = this->newBlock(startOffset, size);
+  }
+
+  SimpleAllocator::~SimpleAllocator(void) {
+    while (this->head) {
+      Block *next = this->head->next;
+      this->deleteBlock(this->head);
+      this->head = next;
+    }
+  }
+
+  int16_t SimpleAllocator::allocate(int16_t size, int16_t alignment, bool bFwd)
+  {
+    // Make it simple and just use the first block we find
+    Block *list = bFwd ? head : tail;
+    while (list) {
+      int16_t aligned;
+      int16_t spaceOnLeft;
+      int16_t spaceOnRight;
+      if(bFwd) {
+        aligned = ALIGN(list->offset, alignment);
+        spaceOnLeft = aligned - list->offset;
+        spaceOnRight = list->size - size - spaceOnLeft;
+
+        // Not enough space in this block
+        if (spaceOnRight < 0) {
+          list = list->next;
+          continue;
+        }
+      } else {
+        int16_t unaligned = list->offset + list->size - size - (alignment-1);
+        if(unaligned < 0) {
+          list = list->prev;
+          continue;
+        }
+        aligned = ALIGN(unaligned, alignment);   //alloc from block's tail
+        spaceOnLeft = aligned - list->offset;
+        spaceOnRight = list->size - size - spaceOnLeft;
+
+        // Not enough space in this block
+        if (spaceOnLeft < 0) {
+          list = list->prev;
+          continue;
+        }
+      }
+
+      // Cool we can use this block
+      Block *left = list->prev;
+      Block *right = list->next;
+
+      // If we left a hole on the left, create a new block
+      if (spaceOnLeft) {
+        Block *newBlock = this->newBlock(list->offset, spaceOnLeft);
+        if (left) {
+          left->next = newBlock;
+          newBlock->prev = left;
+        }
+        if (right) {
+          newBlock->next = right;
+          right->prev = newBlock;
+        }
+        left = newBlock;
+      }
+
+      // If we left a hole on the right, create a new block as well
+      if (spaceOnRight) {
+        Block *newBlock = this->newBlock(aligned + size, spaceOnRight);
+        if (left) {
+          left->next = newBlock;
+          newBlock->prev = left;
+        }
+        if (right) {
+          right->prev = newBlock;
+          newBlock->next = right;
+        }
+        right = newBlock;
+      }
+
+      // Chain both successors and predecessors when the entire block was
+      // allocated
+      if (spaceOnLeft == 0 && spaceOnRight == 0) {
+        if (left) left->next = right;
+        if (right) right->prev = left;
+      }
+
+      // Update the head of the free blocks
+      if (list == head) {
+        if (left)
+          head = left;
+        else if (right)
+          head = right;
+        else
+          head = NULL;
+      }
+
+      // Update the tail of the free blocks
+      if (list == tail) {
+        if (right)
+          tail = right;
+        else if (left)
+          tail = left;
+        else
+          tail = NULL;
+      }
+      // Free the block and check the consistency
+      this->deleteBlock(list);
+      if (head && head->next) GBE_ASSERT(head->next->prev == head);
+      if (tail && tail->prev) GBE_ASSERT(tail->prev->next == tail);
+
+      // Track the allocation to retrieve the size later
+      allocatedBlocks.insert(std::make_pair(aligned, size));
+      // update max offset
+      if(aligned + size > maxOffset) maxOffset = aligned + size;
+      // We have a valid offset now
+      return aligned;
+    }
+    GBE_ASSERT( !assertFail );
+    return 0;
+  }
+
+  void SimpleAllocator::deallocate(int16_t offset)
+  {
+    // Retrieve the size in the allocation map
+    auto it = allocatedBlocks.find(offset);
+    GBE_ASSERT(it != allocatedBlocks.end());
+    const int16_t size = it->second;
+
+    // Find the two blocks where to insert the new block
+    Block *list = tail, *next = NULL;
+    while (list != NULL) {
+      if (list->offset < offset)
+        break;
+      next = list;
+      list = list->prev;
+    }
+
+    // Create the block and insert it
+    Block *newBlock = this->newBlock(offset, size);
+    if (list) {
+      GBE_ASSERT(list->offset + list->size <= offset);
+      list->next = newBlock;
+      newBlock->prev = list;
+    } else
+      this->head = newBlock;  // list == NULL means newBlock becomes the new head.
+
+    if (next) {
+      GBE_ASSERT(offset + size <= next->offset);
+      next->prev = newBlock;
+      newBlock->next = next;
+    } else
+      this->tail = newBlock;  // next == NULL means newBlock becomes the new tail.
+
+    if (list != NULL || next != NULL)
+    {
+      // Coalesce the blocks if possible
+      this->coalesce(list, newBlock);
+      this->coalesce(newBlock, next);
+    }
+
+    // Do not track this allocation anymore
+    allocatedBlocks.erase(it);
+  }
+
+  void SimpleAllocator::coalesce(Block *left, Block *right) {
+    if (left == NULL || right == NULL) return;
+    GBE_ASSERT(left->offset < right->offset);
+    GBE_ASSERT(left->next == right);
+    GBE_ASSERT(right->prev == left);
+    if (left->offset + left->size == right->offset) {
+      right->offset = left->offset;
+      right->size += left->size;
+      if (left->prev) left->prev->next = right;
+      right->prev = left->prev;
+      if (left == this->head)
+        this->head = right;
+      this->deleteBlock(left);
+    }
+  }
+
+  void SimpleAllocator::splitBlock(int16_t offset, int16_t subOffset) {
+    // Retrieve the size in the allocation map
+    auto it = allocatedBlocks.find(offset);
+    GBE_ASSERT(it != allocatedBlocks.end());
+
+    while(subOffset > it->second) {
+      subOffset -= it->second;
+      offset += it->second;
+      it = allocatedBlocks.find(offset);
+      GBE_ASSERT(it != allocatedBlocks.end());
+    }
+
+    if(subOffset == 0)
+      return;
+    int16_t size = it->second;
+    allocatedBlocks.erase(it);
+    // Track the allocation to retrieve the size later
+    allocatedBlocks.insert(std::make_pair(offset, subOffset));
+    allocatedBlocks.insert(std::make_pair(offset + subOffset, size - subOffset));
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Generic Context (shared by the simulator and the HW context)
+  ///////////////////////////////////////////////////////////////////////////
+  IVAR(OCL_SIMD_WIDTH, 8, 15, 16);
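+  // OCL_SIMD_WIDTH is a tunable cvar (the arguments presumably being min/default/max
+  // = 8/15/16).  startNewCG() below treats the default value 15 as "no override"
+  // and keeps the requested simdWidth; any other value forces the width to
+  // nextHighestPowerOf2(OCL_SIMD_WIDTH), i.e. 8 or 16.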
+
+  Context::Context(const ir::Unit &unit, const std::string &name) :
+    unit(unit), fn(*unit.getFunction(name)), name(name), liveness(NULL), dag(NULL)
+  {
+    GBE_ASSERT(unit.getPointerSize() == ir::POINTER_32_BITS);
+    this->liveness = GBE_NEW(ir::Liveness, const_cast<ir::Function&>(fn));
+    this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
+    // r0 (GEN_REG_SIZE) is always set by the HW and used at the end by EOT
+    this->registerAllocator = NULL; //GBE_NEW(RegisterAllocator, GEN_REG_SIZE, 4*KB - GEN_REG_SIZE);
+    this->scratchAllocator = NULL; //GBE_NEW(ScratchAllocator, 12*KB);
+  }
+
+  Context::~Context(void) {
+    GBE_SAFE_DELETE(this->registerAllocator);
+    GBE_SAFE_DELETE(this->scratchAllocator);
+    GBE_SAFE_DELETE(this->dag);
+    GBE_SAFE_DELETE(this->liveness);
+  }
+
+  void Context::startNewCG(uint32_t simdWidth) {
+    if (simdWidth == 0 || OCL_SIMD_WIDTH != 15)
+      this->simdWidth = nextHighestPowerOf2(OCL_SIMD_WIDTH);
+    else
+      this->simdWidth = simdWidth;
+    GBE_SAFE_DELETE(this->registerAllocator);
+    GBE_SAFE_DELETE(this->scratchAllocator);
+    GBE_ASSERT(dag != NULL && liveness != NULL);
+    this->registerAllocator = GBE_NEW(RegisterAllocator, GEN_REG_SIZE, 4*KB - GEN_REG_SIZE);
+    this->scratchAllocator = GBE_NEW(ScratchAllocator, this->getScratchSize());
+    this->curbeRegs.clear();
+    this->JIPs.clear();
+  }
+
+  Kernel *Context::compileKernel(void) {
+    this->kernel = this->allocateKernel();
+    this->kernel->simdWidth = this->simdWidth;
+    this->buildArgList();
+    if (usedLabels.size() == 0)
+      this->buildUsedLabels();
+    if (JIPs.size() == 0)
+      this->buildJIPs();
+    this->buildStack();
+    this->handleSLM();
+    if (this->emitCode() == false) {
+      GBE_DELETE(this->kernel);
+      this->kernel = NULL;
+    }
+    if(this->kernel != NULL) {
+      this->kernel->scratchSize = this->alignScratchSize(scratchAllocator->getMaxScatchMemUsed());
+      this->kernel->ctx = this;
+    }
+    return this->kernel;
+  }
+
+  int16_t Context::allocate(int16_t size, int16_t alignment) {
+    return registerAllocator->allocate(size, alignment);
+  }
+
+  void Context::deallocate(int16_t offset) { registerAllocator->deallocate(offset); }
+
+  void Context::splitBlock(int16_t offset, int16_t subOffset) {
+    registerAllocator->splitBlock(offset, subOffset);
+  }
+
+  // FIXME/TODO: as we optimize scratch memory usage using register intervals,
+  // we need to add dependencies in the post_reg_alloc scheduler so that reused
+  // scratch memory keeps its original ordering.
+
+  int32_t Context::allocateScratchMem(uint32_t size) {
+    return scratchAllocator->allocate(size, 32, true);
+  }
+  void Context::deallocateScratchMem(int32_t offset) {
+    scratchAllocator->deallocate(offset);
+  }
+
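+  // buildStack() below rounds the stack size up to the smallest power of two
+  // covering fn.getStackSize() (at least 1KB, asserted to stay within 64KB);
+  // e.g. a function reporting 1500 bytes of stack ends up with a 2KB stack.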
+  void Context::buildStack(void) {
+    const auto &stackUse = dag->getUse(ir::ocl::stackptr);
+    if (stackUse.size() == 0)  // no stack is used if stackptr is unused
+      return;
+    // Be sure that the stack pointer is set
+    // GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0);
+    uint32_t stackSize = 1*KB;
+    while (stackSize < fn.getStackSize()) {
+      stackSize <<= 1;
+      GBE_ASSERT(stackSize <= 64*KB);
+    }
+    this->kernel->stackSize = stackSize;
+  }
+
+  uint32_t Context::newCurbeEntry(gbe_curbe_type value,
+                              uint32_t subValue,
+                              uint32_t size,
+                              uint32_t alignment)
+  {
+    alignment = alignment == 0 ? size : alignment;
+    const uint32_t offset = registerAllocator->allocate(size, alignment, 1);
+    GBE_ASSERT(offset >= GEN_REG_SIZE);
+    kernel->patches.push_back(PatchInfo(value, subValue, offset - GEN_REG_SIZE));
+    kernel->curbeSize = std::max(kernel->curbeSize, offset + size - GEN_REG_SIZE);
+    return offset;
+  }
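+  // Note: patch offsets are recorded relative to GEN_REG_SIZE because the
+  // register allocator starts at GEN_REG_SIZE (r0 is reserved by the hardware,
+  // see the constructor above).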
+
+  uint32_t Context::getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size)
+  {
+    int32_t offset = fn.getImageSet()->getInfoOffset(key);
+    if (offset >= 0)
+      return offset + GEN_REG_SIZE;
+    newCurbeEntry(GBE_CURBE_IMAGE_INFO, key.data, size, 4);
+    std::sort(kernel->patches.begin(), kernel->patches.end());
+
+    offset = kernel->getCurbeOffset(GBE_CURBE_IMAGE_INFO, key.data);
+    GBE_ASSERT(offset >= 0); // XXX do we need to spill it out to bo?
+    fn.getImageSet()->appendInfo(key, offset);
+    return offset + GEN_REG_SIZE;
+  }
+
+  void Context::insertCurbeReg(ir::Register reg, uint32_t offset) {
+    curbeRegs.insert(std::make_pair(reg, offset));
+  }
+  ir::Register Context::getSurfaceBaseReg(unsigned char bti) {
+    return fn.getSurfaceBaseReg(bti);
+  }
+
+  void Context::buildArgList(void) {
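+    // Translate the IR function arguments into the kernel argument
+    // descriptors used to launch the kernel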
+    kernel->argNum = fn.argNum();
+    if (kernel->argNum)
+      kernel->args = GBE_NEW_ARRAY_NO_ARG(KernelArgument, kernel->argNum);
+    else
+      kernel->args = NULL;
+    for (uint32_t argID = 0; argID < kernel->argNum; ++argID) {
+      const auto &arg = fn.getArg(argID);
+
+      kernel->args[argID].align = arg.align;
+      kernel->args[argID].info = arg.info;
+      switch (arg.type) {
+        case ir::FunctionArgument::VALUE:
+        case ir::FunctionArgument::STRUCTURE:
+          kernel->args[argID].type = GBE_ARG_VALUE;
+          kernel->args[argID].size = arg.size;
+          break;
+        case ir::FunctionArgument::GLOBAL_POINTER:
+          kernel->args[argID].type = GBE_ARG_GLOBAL_PTR;
+          kernel->args[argID].size = sizeof(void*);
+          kernel->args[argID].bti = arg.bti;
+          break;
+        case ir::FunctionArgument::CONSTANT_POINTER:
+          kernel->args[argID].type = GBE_ARG_CONSTANT_PTR;
+          kernel->args[argID].size = sizeof(void*);
+          break;
+        case ir::FunctionArgument::LOCAL_POINTER:
+          kernel->args[argID].type = GBE_ARG_LOCAL_PTR;
+          kernel->args[argID].size = 0;
+          break;
+        case ir::FunctionArgument::IMAGE:
+          kernel->args[argID].type = GBE_ARG_IMAGE;
+          kernel->args[argID].size = sizeof(void*);
+          break;
+        case ir::FunctionArgument::SAMPLER:
+          kernel->args[argID].type = GBE_ARG_SAMPLER;
+          kernel->args[argID].size = sizeof(void*);
+          break;
+      }
+    }
+  }
+
+  void Context::buildUsedLabels(void) {
+    usedLabels.clear();
+    fn.foreachInstruction([this](const ir::Instruction &insn) {
+      using namespace ir;
+      if (insn.getOpcode() != OP_BRA) return;
+      const LabelIndex index = cast<BranchInstruction>(insn).getLabelIndex();
+      usedLabels.insert(index);
+    });
+  }
+
+  void Context::buildJIPs(void) {
+    using namespace ir;
+
+    // Linearly store the branch target for each block and its own label
+    const LabelIndex noTarget(fn.labelNum());
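+    // noTarget is a sentinel (one past the last valid label) meaning "no
+    // branch target" or "unused block"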
+    vector<std::pair<LabelIndex, LabelIndex>> braTargets;
+    int32_t curr = 0, blockNum = fn.blockNum();
+    braTargets.resize(blockNum);
+
+    // If some blocks are unused we mark them as such by setting their own label
+    // as "invalid" (== noTarget)
+    for (auto &bb : braTargets) bb = std::make_pair(noTarget, noTarget);
+    fn.foreachBlock([&](const BasicBlock &bb) {
+      const LabelIndex ownLabel = bb.getLabelIndex();
+      const Instruction *last = bb.getLastInstruction();
+      if (last->getOpcode() != OP_BRA)
+        braTargets[curr++] = std::make_pair(ownLabel, noTarget);
+      else {
+        const BranchInstruction *bra = cast<BranchInstruction>(last);
+        braTargets[curr++] = std::make_pair(ownLabel, bra->getLabelIndex());
+      }
+    });
+
+    // Backward jumps are special. We must insert the label of the next block
+    // when we hit the "DO", i.e. the target label of the backward branch (as
+    // in do { } while). So we store the backward jumps per target
+    // XXX does not use custom allocator
+    std::multimap<LabelIndex, LabelIndex> bwdTargets;
+    for (int32_t blockID = 0; blockID < blockNum; ++blockID) {
+      const LabelIndex ownLabel = braTargets[blockID].first;
+      const LabelIndex target = braTargets[blockID].second;
+      if (ownLabel == noTarget) continue; // unused block
+      if (target == noTarget) continue; // no branch
+      if (target <= ownLabel) { // This is a backward jump
+        // The last block is just "RET", so a backward branch cannot come from it
+        GBE_ASSERT(blockID < blockNum - 1);
+        const LabelIndex fallThrough = braTargets[blockID+1].first;
+        bwdTargets.insert(std::make_pair(target, fallThrough));
+      }
+    }
+
+    // Stores the current forward targets
+    set<LabelIndex> fwdTargets;
+
+    // Now retraverse the blocks and figure out all JIPs
+    for (int32_t blockID = 0; blockID < blockNum; ++blockID) {
+      const LabelIndex ownLabel = braTargets[blockID].first;
+      const LabelIndex target = braTargets[blockID].second;
+      const BasicBlock &bb = fn.getBlock(ownLabel);
+      const Instruction *label = bb.getFirstInstruction();
+      const Instruction *bra = bb.getLastInstruction();
+
+      // Expire the forward branches that point to us (if any)
+      auto it = fwdTargets.find(ownLabel);
+      if (it != fwdTargets.end()) fwdTargets.erase(it);
+
+      // Insert the fall through of the bwd branches that point to us if any
+      auto ii = bwdTargets.equal_range(ownLabel);
+      for (auto it = ii.first; it != ii.second; ++it)
+        fwdTargets.insert(it->second);
+
+      // If there is an outstanding forward branch, compute a JIP for the label
+      auto lower = fwdTargets.lower_bound(LabelIndex(0));
+      GBE_ASSERT(label->isMemberOf<LabelInstruction>() == true);
+      if (lower != fwdTargets.end())
+        JIPs.insert(std::make_pair(label, *lower));
+
+      // Handle special cases and backward branches first
+      if (ownLabel == noTarget) continue; // unused block
+      if (target == noTarget) continue; // no branch at all
+      GBE_ASSERT(bra->isMemberOf<BranchInstruction>() == true);
+      if (target <= ownLabel) { // bwd branch: we always jump
+        JIPs.insert(std::make_pair(bra, LabelIndex(target)));
+        continue;
+      }
+
+      // This is a forward jump, register it and get the JIP
+      fwdTargets.insert(target);
+      auto jip = fwdTargets.lower_bound(LabelIndex(0));
+      JIPs.insert(std::make_pair(bra, *jip));
+    }
+  }
+
+  void Context::handleSLM(void) {
+    const bool useSLM = fn.getUseSLM();
+    kernel->useSLM = useSLM;
+    kernel->slmSize = fn.getSLMSize();
+  }
+
+} /* namespace gbe */
+
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
new file mode 100644
index 0000000..3faead2
--- /dev/null
+++ b/backend/src/backend/context.hpp
@@ -0,0 +1,149 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_CONTEXT_HPP__
+#define __GBE_CONTEXT_HPP__
+
+#include "ir/instruction.hpp"
+#include "backend/program.h"
+#include "sys/set.hpp"
+#include "sys/map.hpp"
+#include "sys/platform.hpp"
+#include <string>
+
+namespace gbe {
+namespace ir {
+
+  class Unit;        // Contains the complete program
+  class Function;    // We compile a function into a kernel
+  class Liveness;    // Describes liveness of each ir function register
+  class FunctionDAG; // Describes the instruction dependencies
+
+} /* namespace ir */
+} /* namespace gbe */
+
+namespace gbe
+{
+  class Kernel;                 // context creates Kernel
+  class RegisterAllocator;      // allocator for physical register allocation
+  class ScratchAllocator;       // allocator for scratch memory allocation
+
+  /*! Context is the helper structure to build the Gen ISA or simulation code
+   *  from GenIR
+   */
+  class Context : public NonCopyable
+  {
+  public:
+    /*! Create a new context. name is the name of the function we want to
+     *  compile
+     */
+    Context(const ir::Unit &unit, const std::string &name);
+    /*! Release everything allocated by the context */
+    virtual ~Context(void);
+    /*! Start new code generation with a specific SIMD width. */
+    void startNewCG(uint32_t simdWidth);
+    /*! Compile the code */
+    Kernel *compileKernel(void);
+    /*! Tells if the label is used */
+    INLINE bool isLabelUsed(ir::LabelIndex index) const {
+      return usedLabels.contains(index);
+    }
+    /*! Get the function graph */
+    INLINE const ir::FunctionDAG &getFunctionDAG(void) const { return *dag; }
+    /*! Get the liveness information */
+    INLINE const ir::Liveness &getLiveness(void) const { return *liveness; }
+    /*! Tells if the register is used */
+    bool isRegUsed(const ir::Register &reg) const;
+    /*! Get the kernel we are currently compiling */
+    INLINE Kernel *getKernel(void) const { return this->kernel; }
+    /*! Get the function we are currently compiling */
+    INLINE const ir::Function &getFunction(void) const { return this->fn; }
+    /*! Get the target label index for the given instruction */
+    INLINE ir::LabelIndex getLabelIndex(const ir::Instruction *insn) const {
+      GBE_ASSERT(JIPs.find(insn) != JIPs.end());
+      return JIPs.find(insn)->second;
+    }
+    /*! Only GOTO and some LABEL instructions may have JIPs */
+    INLINE bool hasJIP(const ir::Instruction *insn) const {
+      return JIPs.find(insn) != JIPs.end();
+    }
+    /*! Allocate some memory in the register file */
+    int16_t allocate(int16_t size, int16_t alignment);
+    /*! Deallocate previously allocated memory */
+    void deallocate(int16_t offset);
+    /*! Split a block into 2 blocks, for registers that are allocated together but deallocated separately */
+    void splitBlock(int16_t offset, int16_t subOffset);
+    /*! Get the curbe offset for a specific image's information: search for an
+     *  existing entry, or allocate a new one if none is found.
+     */
+    uint32_t getImageInfoCurbeOffset(ir::ImageInfoKey key, size_t size);
+    /*! allocate size scratch memory and return start address */
+    int32_t allocateScratchMem(uint32_t size);
+    /*! deallocate scratch memory at offset */
+    void deallocateScratchMem(int32_t offset);
+    /*! Preallocated curbe register set including special registers. */
+    map<ir::Register, uint32_t> curbeRegs;
+    ir::Register getSurfaceBaseReg(unsigned char bti);
+  protected:
+    /*! Build the instruction stream. Return false if failed */
+    virtual bool emitCode(void) = 0;
+    /*! Align the scratch size to the device's scratch unit size */
+    virtual uint32_t alignScratchSize(uint32_t) = 0;
+    /*! Get the device's max scratch size */
+    virtual uint32_t getScratchSize(void) = 0;
+    /*! Allocate a new empty kernel (to be implemented) */
+    virtual Kernel *allocateKernel(void) = 0;
+    /*! Check whether a stack is needed and allocate it */
+    void buildStack(void);
+    /*! Build the list of arguments to set to launch the kernel */
+    void buildArgList(void);
+    /*! Build the sets of used labels */
+    void buildUsedLabels(void);
+    /*! Build JIPs for each branch and possibly labels. Can be different from
+     *  the branch target due to unstructured branches
+     */
+    void buildJIPs(void);
+    /*! Configure SLM use if needed */
+    void handleSLM(void);
+    /*! Record a curbe register together with its pre-allocated GRF offset */
+    void insertCurbeReg(ir::Register, uint32_t grfOffset);
+    /*! Insert a new entry with the given size in the Curbe and return its
+     *  offset
+     */
+    uint32_t newCurbeEntry(gbe_curbe_type value, uint32_t subValue, uint32_t size, uint32_t alignment = 0);
+    /*! Provides, for each branch and label, the target label index */
+    typedef map<const ir::Instruction*, ir::LabelIndex> JIPMap;
+    const ir::Unit &unit;                 //!< Unit that contains the kernel
+    const ir::Function &fn;               //!< Function to compile
+    std::string name;                     //!< Name of the kernel to compile
+    Kernel *kernel;                       //!< Kernel we are building
+    ir::Liveness *liveness;               //!< Liveness info for the variables
+    ir::FunctionDAG *dag;                 //!< Graph of values on the function
+    RegisterAllocator *registerAllocator; //!< physical register allocation
+    ScratchAllocator *scratchAllocator;   //!< scratch memory allocator
+    set<ir::LabelIndex> usedLabels;       //!< Set of all used labels
+    JIPMap JIPs;                          //!< Jump targets for all labels/branches
+    uint32_t simdWidth;                   //!< Number of lanes per HW thread
+    map<unsigned char, ir::Register> btiRegMap;
+    GBE_CLASS(Context);                   //!< Use custom allocators
+  };
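+
+  /* Rough usage sketch (assuming a concrete subclass such as Gen75Context,
+   * which implements emitCode(), allocateKernel(), alignScratchSize() and
+   * getScratchSize()):
+   *
+   *   Context *ctx = ...;              // e.g. a Gen75Context built from an ir::Unit
+   *   ctx->startNewCG(16);             // pick a SIMD width
+   *   Kernel *kernel = ctx->compileKernel();
+   *   if (kernel == NULL) { ... }      // compilation failed at this SIMD width
+   */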
+
+} /* namespace gbe */
+
+#endif /* __GBE_CONTEXT_HPP__ */
+
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
new file mode 100644
index 0000000..c120b60
--- /dev/null
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -0,0 +1,1302 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ * Copyright © 2008 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <assert.h>
+
+#include "backend/gen_defs.hpp"
+#include "src/cl_device_data.h"
+
+static const struct {
+  const char    *name;
+  int	    nsrc;
+  int	    ndst;
+} opcode[128] = {
+  [GEN_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_FBH] = { .name = "fbh", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_FBL] = { .name = "fbl", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_F16TO32] = { .name = "f16to32", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_F32TO16] = { .name = "f32to16", .nsrc = 1, .ndst = 1 },
+
+  [GEN_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_MAD] = { .name = "mad", .nsrc = 3, .ndst = 1 },
+  [GEN_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 },
+
+  [GEN_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_ADDC] = { .name = "addc", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_SUBB] = { .name = "subb", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
+  [GEN_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
+
+  [GEN_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_SENDC] = { .name = "sendc", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
+  [GEN_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 },
+  [GEN_OPCODE_BRD] = { .name = "brd", .nsrc = 0, .ndst = 0 },
+  [GEN_OPCODE_IF] = { .name = "if", .nsrc = 0, .ndst = 0 },
+  [GEN_OPCODE_BRC] = { .name = "brc", .nsrc = 0, .ndst = 0 },
+  [GEN_OPCODE_WHILE] = { .name = "while", .nsrc = 0, .ndst = 0 },
+  [GEN_OPCODE_ELSE] = { .name = "else", .nsrc = 0, .ndst = 0 },
+  [GEN_OPCODE_BREAK] = { .name = "break", .nsrc = 0, .ndst = 0 },
+  [GEN_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 0, .ndst = 0 },
+  [GEN_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
+  [GEN_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
+  [GEN_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
+  [GEN_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
+  [GEN_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
+  [GEN_OPCODE_ENDIF] = { .name = "endif", .nsrc = 1, .ndst = 0 },
+};
+
+static const char *conditional_modifier[16] = {
+  [GEN_CONDITIONAL_NONE] = "",
+  [GEN_CONDITIONAL_Z] = ".e",
+  [GEN_CONDITIONAL_NZ] = ".ne",
+  [GEN_CONDITIONAL_G] = ".g",
+  [GEN_CONDITIONAL_GE] = ".ge",
+  [GEN_CONDITIONAL_L] = ".l",
+  [GEN_CONDITIONAL_LE] = ".le",
+  [GEN_CONDITIONAL_R] = ".r",
+  [GEN_CONDITIONAL_O] = ".o",
+  [GEN_CONDITIONAL_U] = ".u",
+};
+
+static const char *negate[2] = {
+  [0] = "",
+  [1] = "-",
+};
+
+static const char *_abs[2] = {
+  [0] = "",
+  [1] = "(abs)",
+};
+
+static const char *vert_stride[16] = {
+  [0] = "0",
+  [1] = "1",
+  [2] = "2",
+  [3] = "4",
+  [4] = "8",
+  [5] = "16",
+  [6] = "32",
+  [15] = "VxH",
+};
+
+static const char *width[8] = {
+  [0] = "1",
+  [1] = "2",
+  [2] = "4",
+  [3] = "8",
+  [4] = "16",
+};
+
+static const char *horiz_stride[4] = {
+  [0] = "0",
+  [1] = "1",
+  [2] = "2",
+  [3] = "4"
+};
+
+static const char *chan_sel[4] = {
+  [0] = "x",
+  [1] = "y",
+  [2] = "z",
+  [3] = "w",
+};
+
+static const char *debug_ctrl[2] = {
+  [0] = "",
+  [1] = ".breakpoint"
+};
+
+static const char *saturate[2] = {
+  [0] = "",
+  [1] = ".sat"
+};
+
+static const char *accwr[2] = {
+  [0] = "",
+  [1] = "AccWrEnable"
+};
+
+static const char *wectrl[2] = {
+  [0] = "WE_normal",
+  [1] = "WE_all"
+};
+
+static const char *exec_size[8] = {
+  [0] = "1",
+  [1] = "2",
+  [2] = "4",
+  [3] = "8",
+  [4] = "16",
+  [5] = "32"
+};
+
+static const char *pred_inv[2] = {
+  [0] = "+",
+  [1] = "-"
+};
+
+static const char *pred_ctrl_align16[16] = {
+  [1] = "",
+  [2] = ".x",
+  [3] = ".y",
+  [4] = ".z",
+  [5] = ".w",
+  [6] = ".any4h",
+  [7] = ".all4h",
+};
+
+static const char *pred_ctrl_align1[16] = {
+  [1] = "",
+  [2] = ".anyv",
+  [3] = ".allv",
+  [4] = ".any2h",
+  [5] = ".all2h",
+  [6] = ".any4h",
+  [7] = ".all4h",
+  [8] = ".any8h",
+  [9] = ".all8h",
+  [10] = ".any16h",
+  [11] = ".all16h",
+};
+
+static const char *thread_ctrl[4] = {
+  [0] = "",
+  [2] = "switch"
+};
+
+static const char *dep_ctrl[4] = {
+  [0] = "",
+  [1] = "NoDDClr",
+  [2] = "NoDDChk",
+  [3] = "NoDDClr,NoDDChk",
+};
+
+static const char *mask_ctrl[4] = {
+  [0] = "",
+  [1] = "nomask",
+};
+
+static const char *access_mode[2] = {
+  [0] = "align1",
+  [1] = "align16",
+};
+
+static const char *reg_encoding[8] = {
+  [0] = ":UD",
+  [1] = ":D",
+  [2] = ":UW",
+  [3] = ":W",
+  [4] = ":UB",
+  [5] = ":B",
+  [6] = ":DF",
+  [7] = ":F"
+};
+
+int reg_type_size[8] = {
+  [0] = 4,
+  [1] = 4,
+  [2] = 2,
+  [3] = 2,
+  [4] = 1,
+  [5] = 1,
+  [6] = 8,
+  [7] = 4
+};
+
+static const char *reg_file[4] = {
+  [0] = "A",
+  [1] = "g",
+  [2] = "m",
+  [3] = "imm",
+};
+
+static const char *writemask[16] = {
+  [0x0] = ".",
+  [0x1] = ".x",
+  [0x2] = ".y",
+  [0x3] = ".xy",
+  [0x4] = ".z",
+  [0x5] = ".xz",
+  [0x6] = ".yz",
+  [0x7] = ".xyz",
+  [0x8] = ".w",
+  [0x9] = ".xw",
+  [0xa] = ".yw",
+  [0xb] = ".xyw",
+  [0xc] = ".zw",
+  [0xd] = ".xzw",
+  [0xe] = ".yzw",
+  [0xf] = "",
+};
+
+static const char *end_of_thread[2] = {
+  [0] = "",
+  [1] = "EOT"
+};
+
+static const char *target_function_gen6[16] = {
+  [GEN_SFID_NULL] = "null",
+  [GEN_SFID_MATH] = "math",
+  [GEN_SFID_SAMPLER] = "sampler",
+  [GEN_SFID_MESSAGE_GATEWAY] = "gateway",
+  [GEN_SFID_URB] = "urb",
+  [GEN_SFID_THREAD_SPAWNER] = "thread_spawner",
+  [GEN6_SFID_DATAPORT_SAMPLER_CACHE] = "sampler",
+  [GEN6_SFID_DATAPORT_RENDER_CACHE] = "render",
+  [GEN6_SFID_DATAPORT_CONSTANT_CACHE] = "const",
+  [GEN_SFID_DATAPORT_DATA_CACHE] = "data"
+};
+
+static const char *target_function_gen75[16] = {
+  [GEN_SFID_NULL] = "null",
+  [GEN_SFID_MATH] = "math",
+  [GEN_SFID_SAMPLER] = "sampler",
+  [GEN_SFID_MESSAGE_GATEWAY] = "gateway",
+  [GEN_SFID_URB] = "urb",
+  [GEN_SFID_THREAD_SPAWNER] = "thread_spawner",
+  [GEN6_SFID_DATAPORT_SAMPLER_CACHE] = "sampler",
+  [GEN6_SFID_DATAPORT_RENDER_CACHE] = "render",
+  [GEN6_SFID_DATAPORT_CONSTANT_CACHE] = "const",
+  [GEN_SFID_DATAPORT_DATA_CACHE] = "data (0)",
+  [GEN_SFID_DATAPORT1_DATA_CACHE] = "data (1)"
+};
+
+static const char *gateway_sub_function[8] = {
+  [0] = "open gateway",
+  [1] = "close gateway",
+  [2] = "forward gateway",
+  [3] = "get time stamp",
+  [4] = "barrier",
+  [5] = "update gateway state",
+  [6] = "MMIO R/W",
+  [7] = "reserved"
+};
+
+static const char *math_function[16] = {
+  [GEN_MATH_FUNCTION_INV] = "inv",
+  [GEN_MATH_FUNCTION_LOG] = "log",
+  [GEN_MATH_FUNCTION_EXP] = "exp",
+  [GEN_MATH_FUNCTION_SQRT] = "sqrt",
+  [GEN_MATH_FUNCTION_RSQ] = "rsq",
+  [GEN_MATH_FUNCTION_SIN] = "sin",
+  [GEN_MATH_FUNCTION_COS] = "cos",
+  [GEN_MATH_FUNCTION_FDIV] = "fdiv",
+  [GEN_MATH_FUNCTION_POW] = "pow",
+  [GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod",
+  [GEN_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intdiv",
+  [GEN_MATH_FUNCTION_INT_DIV_REMAINDER] = "intmod",
+};
+
+static const char *math_saturate[2] = {
+  [0] = "",
+  [1] = "sat"
+};
+
+static const char *math_signed[2] = {
+  [0] = "",
+  [1] = "signed"
+};
+
+static const char *math_scalar[2] = {
+  [0] = "",
+  [1] = "scalar"
+};
+
+static const char *math_precision[2] = {
+  [0] = "",
+  [1] = "partial_precision"
+};
+
+static const char *data_port_data_cache_simd_mode[] = {
+  "SIMD4x2",
+  "SIMD16",
+  "SIMD8",
+};
+
+static const char *data_port_data_cache_category[] = {
+  "legacy",
+  "scratch",
+};
+
+static const char *data_port_scratch_block_size[] = {
+  "1 register",
+  "2 registers",
+  "Reserve",
+  "4 registers",
+};
+
+static const char *data_port_scratch_invalidate[] = {
+  "no invalidate",
+  "invalidate cache line",
+};
+
+static const char *data_port_scratch_channel_mode[] = {
+  "Oword",
+  "Dword",
+};
+
+static const char *data_port_scratch_msg_type[] = {
+  "Scratch Read",
+  "Scratch Write",
+};
+
+static const char *data_port_data_cache_msg_type[] = {
+  [0] = "OWord Block Read",
+  [1] = "Unaligned OWord Block Read",
+  [2] = "OWord Dual Block Read",
+  [3] = "DWord Scattered Read",
+  [4] = "Byte Scattered Read",
+  [5] = "Untyped Surface Read",
+  [6] = "Untyped Atomic Operation",
+  [7] = "Memory Fence",
+  [8] = "OWord Block Write",
+  [10] = "OWord Dual Block Write",
+  [11] = "DWord Scattered Write",
+  [12] = "Byte Scattered Write",
+  [13] = "Untyped Surface Write",
+};
+
+static const char *data_port1_data_cache_msg_type[] = {
+  [1] = "Untyped Surface Read",
+  [2] = "Untyped Atomic Operation",
+  [3] = "Untyped Atomic Operation SIMD4x2",
+  [4] = "Media Block Read",
+  [5] = "Typed Surface Read",
+  [6] = "Typed Atomic Operation",
+  [7] = "Typed Atomic Operation SIMD4x2",
+  [9] = "Untyped Surface Write",
+  [10] = "Media Block Write",
+  [11] = "Atomic Counter Operation",
+  [12] = "Atomic Counter Operation 4X2",
+  [13] = "Typed Surface Write",
+};
+
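+/* Current output column, used by pad() to align the disassembly fields. */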
+static int column;
+
+static int string (FILE *file, const char *string)
+{
+  fputs (string, file);
+  column += strlen (string);
+  return 0;
+}
+
+static int format (FILE *f, const char *format, ...)
+{
+  char    buf[1024];
+  va_list	args;
+  va_start (args, format);
+
+  vsnprintf (buf, sizeof (buf) - 1, format, args);
+  va_end (args);
+  string (f, buf);
+  return 0;
+}
+
+static int newline (FILE *f)
+{
+  putc ('\n', f);
+  column = 0;
+  return 0;
+}
+
+static int pad (FILE *f, int c)
+{
+  do
+    string (f, " ");
+  while (column < c);
+  return 0;
+}
+
+static int flag_reg (FILE *file, const int flag_nr, const int flag_sub_reg_nr)
+{
+  if (flag_nr || flag_sub_reg_nr)
+    return format (file, ".f%d.%d", flag_nr, flag_sub_reg_nr);
+  return 0;
+}
+
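+/* Print a control field using the given name table; returns non-zero for an
+   invalid value. */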
+static int control (FILE *file, const char *name, const char *ctrl[], uint32_t id, int *space)
+{
+  if (!ctrl[id]) {
+    fprintf (file, "*** invalid %s value %d ",
+        name, id);
+    return 1;
+  }
+  if (ctrl[id][0])
+  {
+    if (space && *space)
+      string (file, " ");
+    string (file, ctrl[id]);
+    if (space)
+      *space = 1;
+  }
+  return 0;
+}
+
+static int print_opcode (FILE *file, int id)
+{
+  if (!opcode[id].name) {
+    format (file, "*** invalid opcode value %d ", id);
+    return 1;
+  }
+  string (file, opcode[id].name);
+  return 0;
+}
+
+static int reg (FILE *file, uint32_t _reg_file, uint32_t _reg_nr)
+{
+  int	err = 0;
+
+  if (_reg_file == GEN_ARCHITECTURE_REGISTER_FILE) {
+    switch (_reg_nr & 0xf0) {
+      case GEN_ARF_NULL:
+        string (file, "null");
+        return -1;
+      case GEN_ARF_ADDRESS:
+        format (file, "a%d", _reg_nr & 0x0f);
+        break;
+      case GEN_ARF_ACCUMULATOR:
+        format (file, "acc%d", _reg_nr & 0x0f);
+        break;
+      case GEN_ARF_FLAG:
+        format (file, "f%d", _reg_nr & 0x0f);
+        break;
+      case GEN_ARF_MASK:
+        format (file, "mask%d", _reg_nr & 0x0f);
+        break;
+      case GEN_ARF_MASK_STACK:
+        format (file, "msd%d", _reg_nr & 0x0f);
+        break;
+      case GEN_ARF_STATE:
+        format (file, "sr%d", _reg_nr & 0x0f);
+        break;
+      case GEN_ARF_CONTROL:
+        format (file, "cr%d", _reg_nr & 0x0f);
+        break;
+      case GEN_ARF_NOTIFICATION_COUNT:
+        format (file, "n%d", _reg_nr & 0x0f);
+        break;
+      case GEN_ARF_IP:
+        string (file, "ip");
+        return -1;
+        break;
+      default:
+        format (file, "ARF%d", _reg_nr);
+        break;
+    }
+  } else {
+    err  |= control (file, "src reg file", reg_file, _reg_file, NULL);
+    format (file, "%d", _reg_nr);
+  }
+  return err;
+}
+
+static int dest (FILE *file, const union GenNativeInstruction *inst)
+{
+  int	err = 0;
+
+  if (inst->header.access_mode == GEN_ALIGN_1)
+  {
+    if (inst->bits1.da1.dest_address_mode == GEN_ADDRESS_DIRECT)
+    {
+      err |= reg (file, inst->bits1.da1.dest_reg_file, inst->bits1.da1.dest_reg_nr);
+      if (err == -1) {
+        control (file, "dest reg encoding", reg_encoding, inst->bits1.da1.dest_reg_type, NULL);
+        return 0;
+      }
+      if (inst->bits1.da1.dest_subreg_nr)
+        format (file, ".%d", inst->bits1.da1.dest_subreg_nr /
+            reg_type_size[inst->bits1.da1.dest_reg_type]);
+      format (file, "<%s>", horiz_stride[inst->bits1.da1.dest_horiz_stride]);
+      err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da1.dest_reg_type, NULL);
+    }
+    else
+    {
+      string (file, "g[a0");
+      if (inst->bits1.ia1.dest_subreg_nr)
+        format (file, ".%d", inst->bits1.ia1.dest_subreg_nr /
+            reg_type_size[inst->bits1.ia1.dest_reg_type]);
+      if (inst->bits1.ia1.dest_indirect_offset)
+        format (file, " %d", inst->bits1.ia1.dest_indirect_offset);
+      string (file, "]");
+      format (file, "<%s>", horiz_stride[inst->bits1.ia1.dest_horiz_stride]);
+      err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.ia1.dest_reg_type, NULL);
+    }
+  }
+  else
+  {
+    if (inst->bits1.da16.dest_address_mode == GEN_ADDRESS_DIRECT)
+    {
+      err |= reg (file, inst->bits1.da16.dest_reg_file, inst->bits1.da16.dest_reg_nr);
+      if (err == -1)
+        return 0;
+      if (inst->bits1.da16.dest_subreg_nr)
+        format (file, ".%d", inst->bits1.da16.dest_subreg_nr /
+            reg_type_size[inst->bits1.da16.dest_reg_type]);
+      string (file, "<1>");
+      err |= control (file, "writemask", writemask, inst->bits1.da16.dest_writemask, NULL);
+      err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da16.dest_reg_type, NULL);
+    }
+    else
+    {
+      err = 1;
+      string (file, "Indirect align16 address mode not supported");
+    }
+  }
+
+  return 0;
+}
+
+static int dest_3src (FILE *file, const union GenNativeInstruction *inst)
+{
+  int	err = 0;
+  const uint32_t reg_file = GEN_GENERAL_REGISTER_FILE;
+
+  err |= reg (file, reg_file, inst->bits1.da3src.dest_reg_nr);
+  if (err == -1)
+    return 0;
+  if (inst->bits1.da3src.dest_subreg_nr)
+    format (file, ".%d", inst->bits1.da3src.dest_subreg_nr);
+  string (file, "<1>");
+  err |= control (file, "writemask", writemask, inst->bits1.da3src.dest_writemask, NULL);
+  err |= control (file, "dest reg encoding", reg_encoding, GEN_TYPE_F, NULL);
+
+  return 0;
+}
+
+static int src_align1_region (FILE *file,
+    uint32_t _vert_stride, uint32_t _width, uint32_t _horiz_stride)
+{
+  int err = 0;
+  string (file, "<");
+  err |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
+  string (file, ",");
+  err |= control (file, "width", width, _width, NULL);
+  string (file, ",");
+  err |= control (file, "horiz_stride", horiz_stride, _horiz_stride, NULL);
+  string (file, ">");
+  return err;
+}
+
+static int src_da1 (FILE *file, uint32_t type, uint32_t _reg_file,
+    uint32_t _vert_stride, uint32_t _width, uint32_t _horiz_stride,
+    uint32_t reg_num, uint32_t sub_reg_num, uint32_t __abs, uint32_t _negate)
+{
+  int err = 0;
+  err |= control (file, "negate", negate, _negate, NULL);
+  err |= control (file, "abs", _abs, __abs, NULL);
+
+  err |= reg (file, _reg_file, reg_num);
+  if (err == -1)
+    return 0;
+  if (sub_reg_num)
+    format (file, ".%d", sub_reg_num / reg_type_size[type]); /* use formal style like spec */
+  src_align1_region (file, _vert_stride, _width, _horiz_stride);
+  err |= control (file, "src reg encoding", reg_encoding, type, NULL);
+  return err;
+}
+
+static int src_ia1 (FILE *file,
+                    uint32_t type,
+                    uint32_t _reg_file,
+                    int32_t _addr_imm,
+                    uint32_t _addr_subreg_nr,
+                    uint32_t _negate,
+                    uint32_t __abs,
+                    uint32_t _addr_mode,
+                    uint32_t _horiz_stride,
+                    uint32_t _width,
+                    uint32_t _vert_stride)
+{
+  int err = 0;
+  err |= control (file, "negate", negate, _negate, NULL);
+  err |= control (file, "abs", _abs, __abs, NULL);
+
+  string (file, "g[a0");
+  if (_addr_subreg_nr)
+    format (file, ".%d", _addr_subreg_nr);
+  if (_addr_imm)
+    format (file, " %d", _addr_imm);
+  string (file, "]");
+  src_align1_region (file, _vert_stride, _width, _horiz_stride);
+  err |= control (file, "src reg encoding", reg_encoding, type, NULL);
+  return err;
+}
+
+static int src_da16 (FILE *file,
+                     uint32_t _reg_type,
+                     uint32_t _reg_file,
+                     uint32_t _vert_stride,
+                     uint32_t _reg_nr,
+                     uint32_t _subreg_nr,
+                     uint32_t __abs,
+                     uint32_t _negate,
+                     uint32_t swz_x,
+                     uint32_t swz_y,
+                     uint32_t swz_z,
+                     uint32_t swz_w)
+{
+  int err = 0;
+  err |= control (file, "negate", negate, _negate, NULL);
+  err |= control (file, "abs", _abs, __abs, NULL);
+
+  err |= reg (file, _reg_file, _reg_nr);
+  if (err == -1)
+    return 0;
+  if (_subreg_nr)
+    /* bit4 gives the subreg number in byte addressing. Give it the same
+       meaning as in the da1 case, so the output looks consistent. */
+    format (file, ".%d", 16 / reg_type_size[_reg_type]);
+  string (file, "<");
+  err |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
+  string (file, ",4,1>");
+  /*
+   * Three kinds of swizzle display:
+   *  identity - nothing printed
+   *  1->all	 - print the single channel
+   *  1->1     - print the mapping
+   */
+  if (swz_x == GEN_CHANNEL_X &&
+      swz_y == GEN_CHANNEL_Y &&
+      swz_z == GEN_CHANNEL_Z &&
+      swz_w == GEN_CHANNEL_W)
+  {
+    ;
+  }
+  else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
+  {
+    string (file, ".");
+    err |= control (file, "channel select", chan_sel, swz_x, NULL);
+  }
+  else
+  {
+    string (file, ".");
+    err |= control (file, "channel select", chan_sel, swz_x, NULL);
+    err |= control (file, "channel select", chan_sel, swz_y, NULL);
+    err |= control (file, "channel select", chan_sel, swz_z, NULL);
+    err |= control (file, "channel select", chan_sel, swz_w, NULL);
+  }
+  err |= control (file, "src da16 reg type", reg_encoding, _reg_type, NULL);
+  return err;
+}
+
+static int src0_3src (FILE *file, const union GenNativeInstruction *inst)
+{
+  int err = 0;
+  uint32_t swz_x = (inst->bits2.da3src.src0_swizzle >> 0) & 0x3;
+  uint32_t swz_y = (inst->bits2.da3src.src0_swizzle >> 2) & 0x3;
+  uint32_t swz_z = (inst->bits2.da3src.src0_swizzle >> 4) & 0x3;
+  uint32_t swz_w = (inst->bits2.da3src.src0_swizzle >> 6) & 0x3;
+
+  err |= control (file, "negate", negate, inst->bits1.da3src.src0_negate, NULL);
+  err |= control (file, "abs", _abs, inst->bits1.da3src.src0_abs, NULL);
+
+  err |= reg (file, GEN_GENERAL_REGISTER_FILE, inst->bits2.da3src.src0_reg_nr);
+  if (err == -1)
+    return 0;
+  if (inst->bits2.da3src.src0_subreg_nr)
+    format (file, ".%d", inst->bits2.da3src.src0_subreg_nr);
+  string (file, "<4,1,1>");
+  err |= control (file, "src da16 reg type", reg_encoding,
+      GEN_TYPE_F, NULL);
+  /*
+   * Three kinds of swizzle display:
+   *  identity - nothing printed
+   *  1->all	 - print the single channel
+   *  1->1     - print the mapping
+   */
+  if (swz_x == GEN_CHANNEL_X &&
+      swz_y == GEN_CHANNEL_Y &&
+      swz_z == GEN_CHANNEL_Z &&
+      swz_w == GEN_CHANNEL_W)
+  {
+    ;
+  }
+  else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
+  {
+    string (file, ".");
+    err |= control (file, "channel select", chan_sel, swz_x, NULL);
+  }
+  else
+  {
+    string (file, ".");
+    err |= control (file, "channel select", chan_sel, swz_x, NULL);
+    err |= control (file, "channel select", chan_sel, swz_y, NULL);
+    err |= control (file, "channel select", chan_sel, swz_z, NULL);
+    err |= control (file, "channel select", chan_sel, swz_w, NULL);
+  }
+  return err;
+}
+
+static int src1_3src (FILE *file, const union GenNativeInstruction *inst)
+{
+  int err = 0;
+  uint32_t swz_x = (inst->bits2.da3src.src1_swizzle >> 0) & 0x3;
+  uint32_t swz_y = (inst->bits2.da3src.src1_swizzle >> 2) & 0x3;
+  uint32_t swz_z = (inst->bits2.da3src.src1_swizzle >> 4) & 0x3;
+  uint32_t swz_w = (inst->bits2.da3src.src1_swizzle >> 6) & 0x3;
+  uint32_t src1_subreg_nr = (inst->bits2.da3src.src1_subreg_nr_low |
+      (inst->bits3.da3src.src1_subreg_nr_high << 2));
+
+  err |= control (file, "negate", negate, inst->bits1.da3src.src1_negate,
+      NULL);
+  err |= control (file, "abs", _abs, inst->bits1.da3src.src1_abs, NULL);
+
+  err |= reg (file, GEN_GENERAL_REGISTER_FILE,
+      inst->bits3.da3src.src1_reg_nr);
+  if (err == -1)
+    return 0;
+  if (src1_subreg_nr)
+    format (file, ".%d", src1_subreg_nr);
+  string (file, "<4,1,1>");
+  err |= control (file, "src da16 reg type", reg_encoding,
+      GEN_TYPE_F, NULL);
+  /*
+   * Three kinds of swizzle display:
+   *  identity - nothing printed
+   *  1->all	 - print the single channel
+   *  1->1     - print the mapping
+   */
+  if (swz_x == GEN_CHANNEL_X &&
+      swz_y == GEN_CHANNEL_Y &&
+      swz_z == GEN_CHANNEL_Z &&
+      swz_w == GEN_CHANNEL_W)
+  {
+    ;
+  }
+  else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
+  {
+    string (file, ".");
+    err |= control (file, "channel select", chan_sel, swz_x, NULL);
+  }
+  else
+  {
+    string (file, ".");
+    err |= control (file, "channel select", chan_sel, swz_x, NULL);
+    err |= control (file, "channel select", chan_sel, swz_y, NULL);
+    err |= control (file, "channel select", chan_sel, swz_z, NULL);
+    err |= control (file, "channel select", chan_sel, swz_w, NULL);
+  }
+  return err;
+}
+
+
+static int src2_3src (FILE *file, const union GenNativeInstruction *inst)
+{
+  int err = 0;
+  uint32_t swz_x = (inst->bits3.da3src.src2_swizzle >> 0) & 0x3;
+  uint32_t swz_y = (inst->bits3.da3src.src2_swizzle >> 2) & 0x3;
+  uint32_t swz_z = (inst->bits3.da3src.src2_swizzle >> 4) & 0x3;
+  uint32_t swz_w = (inst->bits3.da3src.src2_swizzle >> 6) & 0x3;
+
+  err |= control (file, "negate", negate, inst->bits1.da3src.src2_negate,
+      NULL);
+  err |= control (file, "abs", _abs, inst->bits1.da3src.src2_abs, NULL);
+
+  err |= reg (file, GEN_GENERAL_REGISTER_FILE,
+      inst->bits3.da3src.src2_reg_nr);
+  if (err == -1)
+    return 0;
+  if (inst->bits3.da3src.src2_subreg_nr)
+    format (file, ".%d", inst->bits3.da3src.src2_subreg_nr);
+  string (file, "<4,1,1>");
+  err |= control (file, "src da16 reg type", reg_encoding,
+      GEN_TYPE_F, NULL);
+  /*
+   * Three kinds of swizzle display:
+   *  identity - nothing printed
+   *  1->all	 - print the single channel
+   *  1->1     - print the mapping
+   */
+  if (swz_x == GEN_CHANNEL_X &&
+      swz_y == GEN_CHANNEL_Y &&
+      swz_z == GEN_CHANNEL_Z &&
+      swz_w == GEN_CHANNEL_W)
+  {
+    ;
+  }
+  else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
+  {
+    string (file, ".");
+    err |= control (file, "channel select", chan_sel, swz_x, NULL);
+  }
+  else
+  {
+    string (file, ".");
+    err |= control (file, "channel select", chan_sel, swz_x, NULL);
+    err |= control (file, "channel select", chan_sel, swz_y, NULL);
+    err |= control (file, "channel select", chan_sel, swz_z, NULL);
+    err |= control (file, "channel select", chan_sel, swz_w, NULL);
+  }
+  return err;
+}
+
+static int imm (FILE *file, uint32_t type, const union GenNativeInstruction *inst) {
+  switch (type) {
+    case GEN_TYPE_UD:
+      format (file, "0x%xUD", inst->bits3.ud);
+      break;
+    case GEN_TYPE_D:
+      format (file, "%dD", inst->bits3.d);
+      break;
+    case GEN_TYPE_UW:
+      format (file, "0x%xUW", (uint16_t) inst->bits3.ud);
+      break;
+    case GEN_TYPE_W:
+      format (file, "%dW", (int16_t) inst->bits3.d);
+      break;
+    case GEN_TYPE_UB:
+      format (file, "0x%xUB", (int8_t) inst->bits3.ud);
+      break;
+    case GEN_TYPE_VF:
+      format (file, "Vector Float");
+      break;
+    case GEN_TYPE_V:
+      format (file, "0x%xV", inst->bits3.ud);
+      break;
+    case GEN_TYPE_F:
+      format (file, "%-gF", inst->bits3.f);
+  }
+  return 0;
+}
+
+static int src0 (FILE *file, const union GenNativeInstruction *inst)
+{
+  if (inst->bits1.da1.src0_reg_file == GEN_IMMEDIATE_VALUE)
+    return imm (file, inst->bits1.da1.src0_reg_type,
+        inst);
+  else if (inst->header.access_mode == GEN_ALIGN_1)
+  {
+    if (inst->bits2.da1.src0_address_mode == GEN_ADDRESS_DIRECT)
+    {
+      return src_da1 (file,
+          inst->bits1.da1.src0_reg_type,
+          inst->bits1.da1.src0_reg_file,
+          inst->bits2.da1.src0_vert_stride,
+          inst->bits2.da1.src0_width,
+          inst->bits2.da1.src0_horiz_stride,
+          inst->bits2.da1.src0_reg_nr,
+          inst->bits2.da1.src0_subreg_nr,
+          inst->bits2.da1.src0_abs,
+          inst->bits2.da1.src0_negate);
+    }
+    else
+    {
+      return src_ia1 (file,
+          inst->bits1.ia1.src0_reg_type,
+          inst->bits1.ia1.src0_reg_file,
+          inst->bits2.ia1.src0_indirect_offset,
+          inst->bits2.ia1.src0_subreg_nr,
+          inst->bits2.ia1.src0_negate,
+          inst->bits2.ia1.src0_abs,
+          inst->bits2.ia1.src0_address_mode,
+          inst->bits2.ia1.src0_horiz_stride,
+          inst->bits2.ia1.src0_width,
+          inst->bits2.ia1.src0_vert_stride);
+    }
+  }
+  else
+  {
+    if (inst->bits2.da16.src0_address_mode == GEN_ADDRESS_DIRECT)
+    {
+      return src_da16 (file,
+          inst->bits1.da16.src0_reg_type,
+          inst->bits1.da16.src0_reg_file,
+          inst->bits2.da16.src0_vert_stride,
+          inst->bits2.da16.src0_reg_nr,
+          inst->bits2.da16.src0_subreg_nr,
+          inst->bits2.da16.src0_abs,
+          inst->bits2.da16.src0_negate,
+          inst->bits2.da16.src0_swz_x,
+          inst->bits2.da16.src0_swz_y,
+          inst->bits2.da16.src0_swz_z,
+          inst->bits2.da16.src0_swz_w);
+    }
+    else
+    {
+      string (file, "Indirect align16 address mode not supported");
+      return 1;
+    }
+  }
+}
+
+static int src1 (FILE *file, const union GenNativeInstruction *inst)
+{
+  if (inst->bits1.da1.src1_reg_file == GEN_IMMEDIATE_VALUE)
+    return imm (file, inst->bits1.da1.src1_reg_type,
+        inst);
+  else if (inst->header.access_mode == GEN_ALIGN_1)
+  {
+    if (inst->bits3.da1.src1_address_mode == GEN_ADDRESS_DIRECT)
+    {
+      return src_da1 (file,
+          inst->bits1.da1.src1_reg_type,
+          inst->bits1.da1.src1_reg_file,
+          inst->bits3.da1.src1_vert_stride,
+          inst->bits3.da1.src1_width,
+          inst->bits3.da1.src1_horiz_stride,
+          inst->bits3.da1.src1_reg_nr,
+          inst->bits3.da1.src1_subreg_nr,
+          inst->bits3.da1.src1_abs,
+          inst->bits3.da1.src1_negate);
+    }
+    else
+    {
+      return src_ia1 (file,
+          inst->bits1.ia1.src1_reg_type,
+          inst->bits1.ia1.src1_reg_file,
+          inst->bits3.ia1.src1_indirect_offset,
+          inst->bits3.ia1.src1_subreg_nr,
+          inst->bits3.ia1.src1_negate,
+          inst->bits3.ia1.src1_abs,
+          inst->bits3.ia1.src1_address_mode,
+          inst->bits3.ia1.src1_horiz_stride,
+          inst->bits3.ia1.src1_width,
+          inst->bits3.ia1.src1_vert_stride);
+    }
+  }
+  else
+  {
+    if (inst->bits3.da16.src1_address_mode == GEN_ADDRESS_DIRECT)
+    {
+      return src_da16 (file,
+          inst->bits1.da16.src1_reg_type,
+          inst->bits1.da16.src1_reg_file,
+          inst->bits3.da16.src1_vert_stride,
+          inst->bits3.da16.src1_reg_nr,
+          inst->bits3.da16.src1_subreg_nr,
+          inst->bits3.da16.src1_abs,
+          inst->bits3.da16.src1_negate,
+          inst->bits3.da16.src1_swz_x,
+          inst->bits3.da16.src1_swz_y,
+          inst->bits3.da16.src1_swz_z,
+          inst->bits3.da16.src1_swz_w);
+    }
+    else
+    {
+      string (file, "Indirect align16 address mode not supported");
+      return 1;
+    }
+  }
+}
+
+static const int esize[6] = {
+  [0] = 1,
+  [1] = 2,
+  [2] = 4,
+  [3] = 8,
+  [4] = 16,
+  [5] = 32,
+};
+
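+/* Print the quarter control suffix: 1Q..4Q for SIMD8 instructions, 1H/2H for
+   SIMD16 ones. */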
+static int qtr_ctrl(FILE *file, const union GenNativeInstruction *inst)
+{
+  int qtr_ctl = inst->header.quarter_control;
+  int exec_size = esize[inst->header.execution_size];
+
+  if (exec_size == 8) {
+    switch (qtr_ctl) {
+      case 0:
+        string (file, " 1Q");
+        break;
+      case 1:
+        string (file, " 2Q");
+        break;
+      case 2:
+        string (file, " 3Q");
+        break;
+      case 3:
+        string (file, " 4Q");
+        break;
+    }
+  } else if (exec_size == 16){
+    if (qtr_ctl < 2)
+      string (file, " 1H");
+    else
+      string (file, " 2H");
+  }
+  return 0;
+}
+
+int gen_disasm (FILE *file, const void *opaque_insn, uint32_t deviceID, uint32_t compacted)
+{
+  const union GenNativeInstruction *inst = (const union GenNativeInstruction *) opaque_insn;
+  int	err = 0;
+  int space = 0;
+  int gen = 70;
+  if (IS_IVYBRIDGE(deviceID)) {
+    gen = 70;
+  } else if (IS_HASWELL(deviceID)) {
+    gen = 75;
+  }
+
+  if (inst->header.predicate_control) {
+    string (file, "(");
+    err |= control (file, "predicate inverse", pred_inv, inst->header.predicate_inverse, NULL);
+    format (file, "f%d", inst->bits2.da1.flag_reg_nr);
+    if (inst->bits2.da1.flag_sub_reg_nr)
+      format (file, ".%d", inst->bits2.da1.flag_sub_reg_nr);
+    if (inst->header.access_mode == GEN_ALIGN_1)
+      err |= control (file, "predicate control align1", pred_ctrl_align1,
+          inst->header.predicate_control, NULL);
+    else
+      err |= control (file, "predicate control align16", pred_ctrl_align16,
+          inst->header.predicate_control, NULL);
+    string (file, ") ");
+  }
+
+  err |= print_opcode (file, inst->header.opcode);
+  err |= control (file, "saturate", saturate, inst->header.saturate, NULL);
+  err |= control (file, "debug control", debug_ctrl, inst->header.debug_control, NULL);
+
+  if (inst->header.opcode == GEN_OPCODE_MATH) {
+    string (file, " ");
+    err |= control (file, "function", math_function,
+        inst->header.destreg_or_condmod, NULL);
+  } else if (inst->header.opcode != GEN_OPCODE_SEND &&
+      inst->header.opcode != GEN_OPCODE_SENDC) {
+    err |= control (file, "conditional modifier", conditional_modifier,
+                    inst->header.destreg_or_condmod, NULL);
+    if (inst->header.destreg_or_condmod)
+      err |= flag_reg (file,
+                       inst->bits2.da1.flag_reg_nr,
+                       inst->bits2.da1.flag_sub_reg_nr);
+  }
+
+  if (inst->header.opcode != GEN_OPCODE_NOP) {
+    string (file, "(");
+    err |= control (file, "execution size", exec_size, inst->header.execution_size, NULL);
+    string (file, ")");
+  }
+
+  if (inst->header.opcode == GEN_OPCODE_SEND && gen < 60)
+    format (file, " %d", inst->header.destreg_or_condmod);
+
+  if (opcode[inst->header.opcode].nsrc == 3) {
+    pad (file, 16);
+    err |= dest_3src (file, inst);
+
+    pad (file, 32);
+    err |= src0_3src (file, inst);
+
+    pad (file, 48);
+    err |= src1_3src (file, inst);
+
+    pad (file, 64);
+    err |= src2_3src (file, inst);
+  } else {
+    if (opcode[inst->header.opcode].ndst > 0) {
+      pad (file, 16);
+      err |= dest (file, inst);
+    } else if (gen >= 60 && (inst->header.opcode == GEN_OPCODE_IF ||
+          inst->header.opcode == GEN_OPCODE_ELSE ||
+          inst->header.opcode == GEN_OPCODE_ENDIF ||
+          inst->header.opcode == GEN_OPCODE_WHILE ||
+          inst->header.opcode == GEN_OPCODE_BRD ||
+          inst->header.opcode == GEN_OPCODE_JMPI)) {
+      format(file, " %d", (int16_t)inst->bits3.gen7_branch.jip);
+    } else if (gen >= 60 && (inst->header.opcode == GEN_OPCODE_BREAK ||
+          inst->header.opcode == GEN_OPCODE_CONTINUE ||
+          inst->header.opcode == GEN_OPCODE_HALT ||
+          inst->header.opcode == GEN_OPCODE_BRC)) {
+      format (file, " %d %d", inst->bits3.gen7_branch.jip, inst->bits3.gen7_branch.uip);
+    }/* else if (inst->header.opcode == GEN_OPCODE_JMPI) {
+      format (file, " %d", inst->bits3.d);
+    }*/
+
+    if (opcode[inst->header.opcode].nsrc > 0) {
+      pad (file, 32);
+      err |= src0 (file, inst);
+    }
+    if (opcode[inst->header.opcode].nsrc > 1) {
+      pad (file, 48);
+      err |= src1 (file, inst);
+    }
+  }
+
+  if (inst->header.opcode == GEN_OPCODE_SEND ||
+      inst->header.opcode == GEN_OPCODE_SENDC) {
+    enum GenMessageTarget target = inst->header.destreg_or_condmod;
+
+    newline (file);
+    pad (file, 16);
+    space = 0;
+
+    if(gen == 75) {
+      err |= control (file, "target function", target_function_gen75,
+             target, &space);
+    } else {
+      err |= control (file, "target function", target_function_gen6,
+             target, &space);
+    }
+
+    switch (target) {
+      case GEN_SFID_MATH:
+        err |= control (file, "math function", math_function,
+            inst->bits3.math_gen5.function, &space);
+        err |= control (file, "math saturate", math_saturate,
+            inst->bits3.math_gen5.saturate, &space);
+        err |= control (file, "math signed", math_signed,
+            inst->bits3.math_gen5.int_type, &space);
+        err |= control (file, "math scalar", math_scalar,
+            inst->bits3.math_gen5.data_type, &space);
+        err |= control (file, "math precision", math_precision,
+            inst->bits3.math_gen5.precision, &space);
+        break;
+      case GEN_SFID_SAMPLER:
+        format (file, " (%d, %d, %d, %d)",
+                inst->bits3.sampler_gen7.bti,
+                inst->bits3.sampler_gen7.sampler,
+                inst->bits3.sampler_gen7.msg_type,
+                inst->bits3.sampler_gen7.simd_mode);
+        break;
+      case GEN_SFID_DATAPORT_DATA_CACHE:
+        if(inst->bits3.gen7_untyped_rw.category == 0) {
+          format (file, " (bti: %d, rgba: %d, %s, %s, %s)",
+                  inst->bits3.gen7_untyped_rw.bti,
+                  inst->bits3.gen7_untyped_rw.rgba,
+                  data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode],
+                  data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category],
+                  data_port_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]);
+        } else {
+          format (file, " (addr: %d, blocks: %s, %s, mode: %s, %s)",
+                  inst->bits3.gen7_scratch_rw.offset,
+                  data_port_scratch_block_size[inst->bits3.gen7_scratch_rw.block_size],
+                  data_port_scratch_invalidate[inst->bits3.gen7_scratch_rw.invalidate_after_read],
+                  data_port_scratch_channel_mode[inst->bits3.gen7_scratch_rw.channel_mode],
+                  data_port_scratch_msg_type[inst->bits3.gen7_scratch_rw.msg_type]);
+        }
+        break;
+      case GEN_SFID_DATAPORT1_DATA_CACHE:
+        format (file, " (bti: %d, rgba: %d, %s, %s, %s)",
+                inst->bits3.gen7_untyped_rw.bti,
+                inst->bits3.gen7_untyped_rw.rgba,
+                data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode],
+                data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category],
+                data_port1_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]);
+        break;
+      case GEN6_SFID_DATAPORT_CONSTANT_CACHE:
+        format (file, " (bti: %d, %s)",
+                inst->bits3.gen7_dword_rw.bti,
+                data_port_data_cache_msg_type[inst->bits3.gen7_dword_rw.msg_type]);
+        break;
+      case GEN_SFID_MESSAGE_GATEWAY:
+        format (file, " (subfunc: %s, notify: %d, ackreq: %d)",
+            gateway_sub_function[inst->bits3.gen7_msg_gw.subfunc],
+            inst->bits3.gen7_msg_gw.notify,
+            inst->bits3.gen7_msg_gw.ackreq);
+        break;
+
+      default:
+        format (file, "unsupported target %d", target);
+        break;
+    }
+    if (space)
+      string (file, " ");
+    format (file, "mlen %d", inst->bits3.generic_gen5.msg_length);
+    format (file, " rlen %d", inst->bits3.generic_gen5.response_length);
+  }
+  pad (file, 64);
+  if (inst->header.opcode != GEN_OPCODE_NOP) {
+    string (file, "{");
+    space = 1;
+    err |= control(file, "access mode", access_mode, inst->header.access_mode, &space);
+    if (gen >= 60)
+      err |= control (file, "write enable control", wectrl, inst->header.mask_control, &space);
+    else
+      err |= control (file, "mask control", mask_ctrl, inst->header.mask_control, &space);
+    err |= control (file, "dependency control", dep_ctrl, inst->header.dependency_control, &space);
+
+    err |= qtr_ctrl (file, inst);
+    err |= control (file, "thread control", thread_ctrl, inst->header.thread_control, &space);
+    if (gen >= 60)
+      err |= control (file, "acc write control", accwr, inst->header.acc_wr_control, &space);
+    if (inst->header.opcode == GEN_OPCODE_SEND ||
+        inst->header.opcode == GEN_OPCODE_SENDC)
+      err |= control (file, "end of thread", end_of_thread,
+          inst->bits3.generic_gen5.end_of_thread, &space);
+
+    if(compacted) {
+      string(file, " Compacted");
+    }
+    if (space)
+      string (file, " ");
+    string (file, "}");
+  }
+  string (file, ";");
+  newline (file);
+  return err;
+}
+
diff --git a/backend/src/backend/gen/gen_mesa_disasm.h b/backend/src/backend/gen/gen_mesa_disasm.h
new file mode 100644
index 0000000..ae007a4
--- /dev/null
+++ b/backend/src/backend/gen/gen_mesa_disasm.h
@@ -0,0 +1,45 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file gen_mesa_disasm.h
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ * To decode and print one Gen ISA instruction. The code is directly taken
+ * from Mesa
+ */
+
+#ifndef __GBE_GEN_MESA_DISASM_H__
+#define __GBE_GEN_MESA_DISASM_H__
+
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+extern int gen_disasm(FILE *file, const void *opaque_insn, uint32_t deviceID, uint32_t compacted);
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* __GBE_GEN_MESA_DISASM_H__ */
+
+
diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
new file mode 100644
index 0000000..da0db85
--- /dev/null
+++ b/backend/src/backend/gen75_context.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file gen75_context.cpp
+ */
+
+#include "backend/gen75_context.hpp"
+#include "backend/gen75_encoder.hpp"
+#include "backend/gen_program.hpp"
+#include "backend/gen_defs.hpp"
+#include "backend/gen_encoder.hpp"
+#include "backend/gen_insn_selection.hpp"
+#include "backend/gen_insn_scheduling.hpp"
+#include "backend/gen_reg_allocation.hpp"
+#include "sys/cvar.hpp"
+#include "ir/function.hpp"
+#include "ir/value.hpp"
+#include <cstring>
+
+namespace gbe
+{
+  void Gen75Context::emitSLMOffset(void) {
+    if(kernel->getUseSLM() == false)
+      return;
+
+    const GenRegister slm_offset = ra->genReg(GenRegister::ud1grf(ir::ocl::slmoffset));
+    const GenRegister slm_index = GenRegister::ud1grf(0, 0);
+    // the SLM index is held in r0.0 bits 24-27 in 4K units; shifting the register right by 12 yields the byte offset
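+    // e.g. (illustrative, assuming the bits below bit 24 are zero) an index of 3
+    // makes r0.0 read 3 << 24, and (3 << 24) >> 12 == 3 << 12 == 12288, a 12KB offset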
+    p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->SHR(slm_offset, slm_index, GenRegister::immud(12));
+    p->pop();
+  }
+
+  void Gen75Context::allocSLMOffsetCurbe(void) {
+    if(fn.getUseSLM())
+      allocCurbeReg(ir::ocl::slmoffset, GBE_CURBE_SLM_OFFSET);
+  }
+
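+  // Rounds a non-zero size up to a power of two of at least 2KB; e.g. (illustrative
+  // values) alignScratchSize(1) == 2048 and alignScratchSize(3000) == 4096.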
+  uint32_t Gen75Context::alignScratchSize(uint32_t size){
+    if(size == 0)
+      return 0;
+    uint32_t i = 2048;
+    while(i < size) i *= 2;
+    return i;
+  }
+
+  void Gen75Context::emitStackPointer(void) {
+    using namespace ir;
+
+    // Only emit stack pointer computation if we use a stack
+    if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0)
+      return;
+
+    // Check that everything is consistent in the kernel code
+    const uint32_t perLaneSize = kernel->getStackSize();
+    const uint32_t perThreadSize = perLaneSize * this->simdWidth;
+    GBE_ASSERT(perLaneSize > 0);
+    GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
+    GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
+
+    // Use shifts rather than muls which are limited to 32x16 bit sources
+    const uint32_t perLaneShift = logi2(perLaneSize);
+    const uint32_t perThreadShift = logi2(perThreadSize);
+    const GenRegister selStackPtr = this->simdWidth == 8 ?
+      GenRegister::ud8grf(ir::ocl::stackptr) :
+      GenRegister::ud16grf(ir::ocl::stackptr);
+    const GenRegister stackptr = ra->genReg(selStackPtr);
+    const GenRegister selStackBuffer = GenRegister::ud1grf(ir::ocl::stackbuffer);
+    const GenRegister bufferptr = ra->genReg(selStackBuffer);
+
+    // We compute the per-lane stack pointer here
+    p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      //p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
+      p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x7f));
+      p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180));
+      p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7));
+      p->curr.execWidth = this->simdWidth;
+      p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
+      p->curr.execWidth = 1;
+      p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2));
+      p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4));
+      p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
+      p->curr.execWidth = this->simdWidth;
+      p->ADD(stackptr, stackptr, bufferptr);
+      p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
+    p->pop();
+  }
+
+  void Gen75Context::newSelection(void) {
+    this->sel = GBE_NEW(Selection75, *this);
+  }
+}
diff --git a/backend/src/backend/gen75_context.hpp b/backend/src/backend/gen75_context.hpp
new file mode 100644
index 0000000..6f62b02
--- /dev/null
+++ b/backend/src/backend/gen75_context.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file gen75_context.hpp
+ */
+#ifndef __GBE_GEN75_CONTEXT_HPP__
+#define __GBE_GEN75_CONTEXT_HPP__
+
+#include "backend/gen_context.hpp"
+#include "backend/gen75_encoder.hpp"
+
+namespace gbe
+{
+  /* This class implements the HSW-specific context logic. */
+  class Gen75Context : public GenContext
+  {
+  public:
+    virtual ~Gen75Context(void) { }
+    Gen75Context(const ir::Unit &unit, const std::string &name, uint32_t deviceID, bool relaxMath = false)
+            : GenContext(unit, name, deviceID, relaxMath) {
+    };
+    /*! device's max scratch buffer size */
+    #define GEN75_SCRATCH_SIZE  (2 * KB * KB)
+    /*! Emit the per-lane stack pointer computation */
+    virtual void emitStackPointer(void);
+    /*! Align the scratch size to the device's scratch unit size */
+    virtual uint32_t alignScratchSize(uint32_t size);
+    /*! Get the device's max scratch size */
+    virtual uint32_t getScratchSize(void) {
+      // Because the allocation uses uint16_t, clamp the size here; this needs refinement
+      return std::min(GEN75_SCRATCH_SIZE, 0x7fff);
+    }
+
+  protected:
+    virtual GenEncoder* generateEncoder(void) {
+      return GBE_NEW(Gen75Encoder, this->simdWidth, 75, deviceID);
+    }
+
+  private:
+    virtual void emitSLMOffset(void);
+    virtual void allocSLMOffsetCurbe(void);
+    virtual void newSelection(void);
+  };
+}
+#endif /* __GBE_GEN75_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen75_encoder.cpp b/backend/src/backend/gen75_encoder.cpp
new file mode 100644
index 0000000..69d2de0
--- /dev/null
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -0,0 +1,269 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+
+#include "backend/gen75_encoder.hpp"
+
+static const uint32_t untypedRWMask[] = {
+  GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE|GEN_UNTYPED_GREEN|GEN_UNTYPED_RED,
+  GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE|GEN_UNTYPED_GREEN,
+  GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE,
+  GEN_UNTYPED_ALPHA,
+  0
+};
+
+namespace gbe
+{
+  void Gen75Encoder::setHeader(GenNativeInstruction *insn) {
+    if (this->curr.execWidth == 8)
+      insn->header.execution_size = GEN_WIDTH_8;
+    else if (this->curr.execWidth == 16)
+      insn->header.execution_size = GEN_WIDTH_16;
+    else if (this->curr.execWidth == 1)
+      insn->header.execution_size = GEN_WIDTH_1;
+    else if (this->curr.execWidth == 4)
+      insn->header.execution_size = GEN_WIDTH_4;
+    else
+      NOT_IMPLEMENTED;
+    insn->header.acc_wr_control = this->curr.accWrEnable;
+    insn->header.quarter_control = this->curr.quarterControl;
+    insn->bits1.ia1.nib_ctrl = this->curr.nibControl;
+    insn->header.mask_control = this->curr.noMask;
+    insn->bits2.ia1.flag_reg_nr = this->curr.flag;
+    insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
+    if (this->curr.predicate != GEN_PREDICATE_NONE) {
+      insn->header.predicate_control = this->curr.predicate;
+      insn->header.predicate_inverse = this->curr.inversePredicate;
+    }
+    insn->header.saturate = this->curr.saturate;
+  }
+
+  void Gen75Encoder::setDPUntypedRW(GenNativeInstruction *insn,
+                                    uint32_t bti,
+                                    uint32_t rgba,
+                                    uint32_t msg_type,
+                                    uint32_t msg_length,
+                                    uint32_t response_length)
+  {
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA_CACHE;
+    setMessageDescriptor(insn, sfid, msg_length, response_length);
+    insn->bits3.gen7_untyped_rw.msg_type = msg_type;
+    insn->bits3.gen7_untyped_rw.bti = bti;
+    insn->bits3.gen7_untyped_rw.rgba = rgba;
+    if (curr.execWidth == 8)
+      insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD8;
+    else if (curr.execWidth == 16)
+      insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD16;
+    else
+      NOT_SUPPORTED;
+  }
+
+  void Gen75Encoder::setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
+                                          unsigned char msg_type, uint32_t msg_length, bool header_present)
+  {
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA_CACHE;
+    setMessageDescriptor(insn, sfid, msg_length, 0, header_present);
+    insn->bits3.gen7_typed_rw.bti = bti;
+    insn->bits3.gen7_typed_rw.msg_type = msg_type;
+
+    /* Always using the low 8 slots here. */
+    insn->bits3.gen7_typed_rw.slot = 1;
+  }
+
+  void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+
+    if (this->curr.execWidth == 8) {
+      msg_length = srcNum;
+      response_length = 1;
+    } else if (this->curr.execWidth == 16) {
+      msg_length = 2 * srcNum;
+      response_length = 2;
+    } else
+      NOT_IMPLEMENTED;
+
+    this->setHeader(insn);
+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA_CACHE;
+    setMessageDescriptor(insn, sfid, msg_length, response_length);
+    insn->bits3.gen7_atomic_op.msg_type = GEN75_P1_UNTYPED_ATOMIC_OP;
+    insn->bits3.gen7_atomic_op.bti = bti;
+    insn->bits3.gen7_atomic_op.return_data = 1;
+    insn->bits3.gen7_atomic_op.aop_type = function;
+
+    if (this->curr.execWidth == 8)
+      insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD8;
+    else if (this->curr.execWidth == 16)
+      insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
+    else
+      NOT_SUPPORTED;
+  }
+
+  void Gen75Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    assert(elemNum >= 1 && elemNum <= 4);
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+    if (this->curr.execWidth == 8) {
+      msg_length = 1;
+      response_length = elemNum;
+    } else if (this->curr.execWidth == 16) {
+      msg_length = 2;
+      response_length = 2 * elemNum;
+    } else
+      NOT_IMPLEMENTED;
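+    // e.g. a SIMD16 read with elemNum == 4 sends msg_length == 2 registers of
+    // addresses and returns response_length == 2 * 4 == 8 registers of data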
+
+    this->setHeader(insn);
+    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    setDPUntypedRW(insn,
+                   bti,
+                   untypedRWMask[elemNum],
+                   GEN75_P1_UNTYPED_READ,
+                   msg_length,
+                   response_length);
+  }
+
+  void Gen75Encoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    assert(elemNum >= 1 && elemNum <= 4);
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+    this->setHeader(insn);
+    if (this->curr.execWidth == 8) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+      msg_length = 1 + elemNum;
+    } else if (this->curr.execWidth == 16) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+      msg_length = 2 * (1 + elemNum);
+    }
+    else
+      NOT_IMPLEMENTED;
+    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    setDPUntypedRW(insn,
+                   bti,
+                   untypedRWMask[elemNum],
+                   GEN75_P1_UNTYPED_SURFACE_WRITE,
+                   msg_length,
+                   response_length);
+  }
+
+  void Gen75Encoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
+    union { double d; unsigned u[2]; } u;
+    u.d = value;
+    GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
+    push();
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.noMask = 1;
+    curr.execWidth = 1;
+    MOV(r, GenRegister::immud(u.u[0]));
+    MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[1]));
+    pop();
+    r.type = GEN_TYPE_DF;
+    r.vstride = GEN_VERTICAL_STRIDE_0;
+    r.width = GEN_WIDTH_1;
+    r.hstride = GEN_HORIZONTAL_STRIDE_0;
+    push();
+    uint32_t width = curr.execWidth;
+    curr.execWidth = 8;
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.noMask = 1;
+    curr.quarterControl = GEN_COMPRESSION_Q1;
+    MOV(dest, r);
+    if (width == 16) {
+      curr.quarterControl = GEN_COMPRESSION_Q2;
+      MOV(GenRegister::offset(dest, 2), r);
+    }
+    pop();
+  }
+
+  void Gen75Encoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
+    GBE_ASSERT((src0.type == GEN_TYPE_F && dest.isdf()) || (src0.isdf() && dest.type == GEN_TYPE_F));
+    int w = curr.execWidth;
+    GenRegister r0;
+    r0 = GenRegister::h2(r);
+    push();
+    curr.execWidth = 4;
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.noMask = 1;
+    MOV(r0, src0);
+    MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 4));
+    curr.noMask = 0;
+    curr.quarterControl = 0;
+    curr.nibControl = 0;
+    MOV(dest, r0);
+    curr.nibControl = 1;
+    MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r0, 4));
+    pop();
+    if (w == 16) {
+      push();
+      curr.execWidth = 4;
+      curr.predicate = GEN_PREDICATE_NONE;
+      curr.noMask = 1;
+      MOV(r0, GenRegister::suboffset(src0, 8));
+      MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 12));
+      curr.noMask = 0;
+      curr.quarterControl = 1;
+      curr.nibControl = 0;
+      MOV(GenRegister::suboffset(dest, 8), r0);
+      curr.nibControl = 1;
+      MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r0, 4));
+      pop();
+    }
+  }
+
+  void Gen75Encoder::JMPI(GenRegister src, bool longjmp) {
+    alu2(this, GEN_OPCODE_JMPI, GenRegister::ip(), GenRegister::ip(), src);
+  }
+
+  void Gen75Encoder::patchJMPI(uint32_t insnID, int32_t jumpDistance) {
+    GenNativeInstruction &insn = *(GenNativeInstruction *)&this->store[insnID];
+    GBE_ASSERT(insnID < this->store.size());
+    GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI ||
+               insn.header.opcode == GEN_OPCODE_BRD  ||
+               insn.header.opcode == GEN_OPCODE_ENDIF ||
+               insn.header.opcode == GEN_OPCODE_IF ||
+               insn.header.opcode == GEN_OPCODE_BRC);
+
+    if (insn.header.opcode == GEN_OPCODE_IF) {
+      this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+      return;
+    }
+    else if (insn.header.opcode == GEN_OPCODE_JMPI) {
+      // jumpDistance is in Qwords, but HSW's JMPI offset is in bytes, so multiply by 8
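+      // e.g. a distance of 10 Qwords is encoded as (10 - 2) * 8 == 64 bytes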
+      jumpDistance = (jumpDistance - 2) * 8;
+    }
+
+    this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+  }
+} /* End of the name space. */
diff --git a/backend/src/backend/gen75_encoder.hpp b/backend/src/backend/gen75_encoder.hpp
new file mode 100644
index 0000000..c10dac9
--- /dev/null
+++ b/backend/src/backend/gen75_encoder.hpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file gen75_encoder.hpp
+ */
+#ifndef __GBE_GEN75_ENCODER_HPP__
+#define __GBE_GEN75_ENCODER_HPP__
+
+#include "backend/gen_encoder.hpp"
+
+namespace gbe
+{
+  /* This class implements the HSW-specific encoder logic. */
+  class Gen75Encoder : public GenEncoder
+  {
+  public:
+    /*! exec width of the double data type */    
+    #define GEN75_DOUBLE_EXEC_WIDTH  4
+    virtual ~Gen75Encoder(void) { }
+
+    Gen75Encoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID)
+         : GenEncoder(simdWidth, gen, deviceID) { }
+
+    /*! Jump indexed instruction */
+    virtual void JMPI(GenRegister src, bool longjmp = false);
+    /*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump distance */
+    virtual void patchJMPI(uint32_t insnID, int32_t jumpDistance);
+    /*! Get double/long exec width */
+    virtual int getDoubleExecWidth(void) { return GEN75_DOUBLE_EXEC_WIDTH; }
+    virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
+    virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
+    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
+    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
+    virtual void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
+    virtual void setHeader(GenNativeInstruction *insn);
+    virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
+                   uint32_t msg_type, uint32_t msg_length, uint32_t response_length);
+    virtual void setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
+                                      unsigned char msg_type, uint32_t msg_length,
+                                      bool header_present);
+  };
+}
+#endif /* __GBE_GEN75_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
new file mode 100644
index 0000000..4f697ef
--- /dev/null
+++ b/backend/src/backend/gen_context.cpp
@@ -0,0 +1,1911 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file gen_context.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "backend/gen_context.hpp"
+#include "backend/gen_program.hpp"
+#include "backend/gen_defs.hpp"
+#include "backend/gen_encoder.hpp"
+#include "backend/gen_insn_selection.hpp"
+#include "backend/gen_insn_scheduling.hpp"
+#include "backend/gen_reg_allocation.hpp"
+#include "backend/gen/gen_mesa_disasm.h"
+#include "ir/function.hpp"
+#include "ir/value.hpp"
+#include "sys/cvar.hpp"
+#include <cstring>
+#include <iostream>
+#include <iomanip>
+
+namespace gbe
+{
+  ///////////////////////////////////////////////////////////////////////////
+  // GenContext implementation
+  ///////////////////////////////////////////////////////////////////////////
+  GenContext::GenContext(const ir::Unit &unit, const std::string &name, uint32_t deviceID,
+	     bool relaxMath) :
+    Context(unit, name), deviceID(deviceID), relaxMath(relaxMath)
+  {
+    this->p = NULL;
+    this->sel = NULL;
+    this->ra = NULL;
+    this->ifEndifFix = false;
+  }
+
+  GenContext::~GenContext(void) {
+    GBE_DELETE(this->ra);
+    GBE_DELETE(this->sel);
+    GBE_DELETE(this->p);
+  }
+
+  void GenContext::startNewCG(uint32_t simdWidth, uint32_t reservedSpillRegs, bool limitRegisterPressure) {
+    this->limitRegisterPressure = limitRegisterPressure;
+    this->reservedSpillRegs = reservedSpillRegs;
+    Context::startNewCG(simdWidth);
+    GBE_SAFE_DELETE(ra);
+    GBE_SAFE_DELETE(sel);
+    GBE_SAFE_DELETE(p);
+    this->p = generateEncoder();
+    this->newSelection();
+    this->ra = GBE_NEW(GenRegAllocator, *this);
+    this->branchPos2.clear();
+    this->branchPos3.clear();
+    this->labelPos.clear();
+    this->errCode = NO_ERROR;
+  }
+
+  void GenContext::newSelection(void) {
+    this->sel = GBE_NEW(Selection, *this);
+  }
+
+  uint32_t GenContext::alignScratchSize(uint32_t size){
+    uint32_t i = 0;
+    while(i < size) i+=1024;
+    return i;
+  }
+
+  void GenContext::emitInstructionStream(void) {
+    // Emit Gen ISA
+    for (auto &block : *sel->blockList)
+    for (auto &insn : block.insnList) {
+      const uint32_t opcode = insn.opcode;
+      p->push();
+      // no more virtual registers in this part of the code generation
+      GBE_ASSERT(insn.state.physicalFlag);
+      p->curr = insn.state;
+      switch (opcode) {
+#define DECL_SELECTION_IR(OPCODE, FAMILY) \
+  case SEL_OP_##OPCODE: this->emit##FAMILY(insn); break;
+#include "backend/gen_insn_selection.hxx"
+#undef DECL_INSN
+      }
+      p->pop();
+    }
+    /* per spec, pad the instruction stream with 8 NOPs to keep the
+       instruction prefetcher from prefetching into an invalid page */
+    for(int i = 0; i < 8; i++)
+      p->NOP();
+  }
+
+  bool GenContext::patchBranches(void) {
+    using namespace ir;
+    for (auto pair : branchPos2) {
+      const LabelIndex label = pair.first;
+      const int32_t insnID = pair.second;
+      const int32_t targetID = labelPos.find(label)->second;
+      p->patchJMPI(insnID, (targetID - insnID));
+    }
+    for (auto pair : branchPos3) {
+      const LabelPair labelPair = pair.first;
+      const int32_t insnID = pair.second;
+      const int32_t jip = labelPos.find(labelPair.l0)->second;
+      const int32_t uip = labelPos.find(labelPair.l1)->second;
+      if (((jip - insnID) > 32767 || (jip - insnID) < -32768) ||
+          ((uip - insnID) > 32767 || (uip - insnID) < -32768)) {
+        // The only possible error instruction is if/endif here.
+        errCode = OUT_OF_RANGE_IF_ENDIF; 
+        return false;
+      }
+      p->patchJMPI(insnID, (((uip - insnID)) << 16) | ((jip - insnID)));
+    }
+    return true;
+  }
+
+  void GenContext::clearFlagRegister(void) {
+    // when the group size is not aligned to simdWidth, the flag register needs
+    // clearing so that predication (any8h/any16h) works correctly
+    const GenRegister blockip = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip));
+    const GenRegister zero = ra->genReg(GenRegister::uw1grf(ir::ocl::zero));
+    const GenRegister one = ra->genReg(GenRegister::uw1grf(ir::ocl::one));
+    p->push();
+      p->curr.noMask = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->MOV(blockip, GenRegister::immuw(GEN_MAX_LABEL));
+      p->curr.noMask = 0;
+      p->MOV(blockip, GenRegister::immuw(0));
+      p->curr.execWidth = 1;
+      // FIXME: compute the final use set of zero/one; if there are no users,
+      // the following two instructions are unnecessary.
+      p->MOV(zero, GenRegister::immuw(0));
+      p->MOV(one, GenRegister::immw(-1));
+    p->pop();
+  }
+
+  void GenContext::emitStackPointer(void) {
+    using namespace ir;
+
+    // Only emit stack pointer computation if we use a stack
+    if (kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) <= 0)
+      return;
+
+    // Check that everything is consistent in the kernel code
+    const uint32_t perLaneSize = kernel->getStackSize();
+    const uint32_t perThreadSize = perLaneSize * this->simdWidth;
+    GBE_ASSERT(perLaneSize > 0);
+    GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
+    GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
+
+    // Use shifts rather than muls which are limited to 32x16 bit sources
+    const uint32_t perLaneShift = logi2(perLaneSize);
+    const uint32_t perThreadShift = logi2(perThreadSize);
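+    // e.g. (illustrative values) perLaneSize == 1024 with simdWidth == 16 gives
+    // perThreadSize == 16384, so perLaneShift == 10 and perThreadShift == 14; the
+    // SHLs below stand in for those multiplies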
+    const GenRegister selStackPtr = this->simdWidth == 8 ?
+      GenRegister::ud8grf(ir::ocl::stackptr) :
+      GenRegister::ud16grf(ir::ocl::stackptr);
+    const GenRegister stackptr = ra->genReg(selStackPtr);
+    const GenRegister selStackBuffer = GenRegister::ud1grf(ir::ocl::stackbuffer);
+    const GenRegister bufferptr = ra->genReg(selStackBuffer);
+
+    // We compute the per-lane stack pointer here
+    p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
+      p->curr.execWidth = this->simdWidth;
+      p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
+      p->curr.execWidth = 1;
+      p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
+      p->curr.execWidth = this->simdWidth;
+      p->ADD(stackptr, stackptr, bufferptr);
+      p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
+    p->pop();
+  }
+
+  void GenContext::emitLabelInstruction(const SelectionInstruction &insn) {
+    const ir::LabelIndex label(insn.index);
+    this->labelPos.insert(std::make_pair(label, p->store.size()));
+  }
+
+  void GenContext::emitUnaryInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src = ra->genReg(insn.src(0));
+    switch (insn.opcode) {
+      case SEL_OP_MOV: p->MOV(dst, src, insn.extra.function); break;
+      case SEL_OP_FBH: p->FBH(dst, src); break;
+      case SEL_OP_FBL: p->FBL(dst, src); break;
+      case SEL_OP_NOT: p->NOT(dst, src); break;
+      case SEL_OP_RNDD: p->RNDD(dst, src); break;
+      case SEL_OP_RNDU: p->RNDU(dst, src); break;
+      case SEL_OP_RNDE: p->RNDE(dst, src); break;
+      case SEL_OP_RNDZ: p->RNDZ(dst, src); break;
+      case SEL_OP_F16TO32: p->F16TO32(dst, src); break;
+      case SEL_OP_F32TO16: p->F32TO16(dst, src); break;
+      case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src.value.i64); break;
+      case SEL_OP_CONVI64_TO_I:
+       {
+        p->MOV(dst, src.bottom_half());
+        break;
+       }
+      case SEL_OP_BRC:
+        {
+          const ir::LabelIndex label0(insn.index), label1(insn.index1);
+          const LabelPair labelPair(label0, label1);
+          const GenRegister src = ra->genReg(insn.src(0));
+          this->branchPos3.push_back(std::make_pair(labelPair, p->store.size()));
+          p->BRC(src);
+        }
+        break;
+      case SEL_OP_BRD:
+        insertJumpPos(insn);
+        p->BRD(src);
+        break;
+      case SEL_OP_ENDIF:
+        insertJumpPos(insn);
+        p->ENDIF(src);
+        break;
+      case SEL_OP_IF:
+        {
+          const ir::LabelIndex label0(insn.index), label1(insn.index1);
+          const LabelPair labelPair(label0, label1);
+          const GenRegister src = ra->genReg(insn.src(0));
+          this->branchPos3.push_back(std::make_pair(labelPair, p->store.size()));
+          p->IF(src);
+        }
+        break;
+      default: NOT_IMPLEMENTED;
+    }
+  }
+
+  void GenContext::emitUnaryWithTempInstruction(const SelectionInstruction &insn) {
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister src = ra->genReg(insn.src(0));
+    GenRegister tmp = ra->genReg(insn.dst(1));
+    switch (insn.opcode) {
+      case SEL_OP_LOAD_DF_IMM:
+        p->LOAD_DF_IMM(dst, tmp, src.value.df);
+        break;
+      case SEL_OP_MOV_DF:
+        p->MOV_DF(dst, src, tmp);
+        break;
+      case SEL_OP_CONVI_TO_I64: {
+        GenRegister middle = src;
+        if(src.type == GEN_TYPE_B || src.type == GEN_TYPE_W) {
+          middle = tmp;
+          middle.type = GEN_TYPE_D;
+          p->MOV(middle, src);
+        }
+
+        p->MOV(dst.bottom_half(), middle);
+        if(src.is_signed_int())
+          p->ASR(dst.top_half(this->simdWidth), middle, GenRegister::immud(31));
+        else
+          p->MOV(dst.top_half(this->simdWidth), GenRegister::immud(0));
+        break;
+      }
+      default:
+        NOT_IMPLEMENTED;
+    }
+  }
+
+  void GenContext::emitBinaryWithTempInstruction(const SelectionInstruction &insn) {
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister src0 = ra->genReg(insn.src(0));
+    GenRegister src1 = ra->genReg(insn.src(1));
+    GenRegister tmp = ra->genReg(insn.dst(1));
+    switch (insn.opcode) {
+      case SEL_OP_I64ADD: {
+        tmp = GenRegister::retype(tmp, GEN_TYPE_UL);
+        GenRegister x = tmp.bottom_half();
+        GenRegister y = tmp.top_half(this->simdWidth);
+
+        loadBottomHalf(x, src0);
+        loadBottomHalf(y, src1);
+        addWithCarry(x, x, y);
+        storeBottomHalf(dst, x);
+        loadTopHalf(x, src0);
+        p->ADD(x, x, y);
+        loadTopHalf(y, src1);
+        p->ADD(x, x, y);
+        storeTopHalf(dst, x);
+        break;
+      }
+      case SEL_OP_I64SUB: {
+        tmp = GenRegister::retype(tmp, GEN_TYPE_UL);
+        GenRegister x = tmp.bottom_half();
+        GenRegister y = tmp.top_half(this->simdWidth);
+
+        loadBottomHalf(x, src0);
+        loadBottomHalf(y, src1);
+        subWithBorrow(x, x, y);
+        storeBottomHalf(dst, x);
+        loadTopHalf(x, src0);
+        subWithBorrow(x, x, y);
+        loadTopHalf(y, src1);
+        subWithBorrow(x, x, y);
+        storeTopHalf(dst, x);
+        break;
+      }
+      case SEL_OP_MUL_HI: {
+        int w = p->curr.execWidth;
+        p->push();
+        p->curr.execWidth = 8;
+        for (int i = 0; i < w / 8; i ++) {
+          p->push();
+          p->curr.predicate = GEN_PREDICATE_NONE;
+          p->curr.noMask = 1;
+          p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), src0, src1);
+          p->curr.accWrEnable = 1;
+          p->MACH(tmp, src0, src1);
+          p->pop();
+          p->curr.quarterControl = i;
+          p->MOV(dst, tmp);
+          dst = GenRegister::Qn(dst, 1);
+          src0 = GenRegister::Qn(src0, 1);
+          src1 = GenRegister::Qn(src1, 1);
+        }
+        p->pop();
+        break;
+       }
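+      // HADD/RHADD compute (src0 + src1) >> 1 without losing the 33rd bit: ADDC
+      // leaves the carry in the accumulator, which is shifted back in as the MSB;
+      // RHADD additionally adds 1 before the shift so the halving rounds up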
+     case SEL_OP_HADD: {
+        int w = p->curr.execWidth;
+        p->push();
+        p->curr.execWidth = 8;
+        for (int i = 0; i < w / 8; i ++) {
+          p->curr.quarterControl = i;
+          p->ADDC(dst, src0, src1);
+          p->SHR(dst, dst, GenRegister::immud(1));
+          p->SHL(tmp, GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
+          p->OR(dst, dst, tmp);
+          dst = GenRegister::Qn(dst, 1);
+          src0 = GenRegister::Qn(src0, 1);
+          src1 = GenRegister::Qn(src1, 1);
+        }
+        p->pop();
+        break;
+       }
+      case SEL_OP_RHADD: {
+        int w = p->curr.execWidth;
+        p->push();
+        p->curr.execWidth = 8;
+        for (int i = 0; i < w / 8; i ++) {
+          p->curr.quarterControl = i;
+          p->ADDC(dst, src0, src1);
+          p->ADD(dst, dst, GenRegister::immud(1));
+          p->SHR(dst, dst, GenRegister::immud(1));
+          p->SHL(tmp, GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), GenRegister::immud(31));
+          p->OR(dst, dst, tmp);
+          dst = GenRegister::Qn(dst, 1);
+          src0 = GenRegister::Qn(src0, 1);
+          src1 = GenRegister::Qn(src1, 1);
+        }
+        p->pop();
+        break;
+       }
+      default:
+        NOT_IMPLEMENTED;
+    }
+  }
+
+  void GenContext::emitBinaryInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src0 = ra->genReg(insn.src(0));
+    const GenRegister src1 = ra->genReg(insn.src(1));
+    switch (insn.opcode) {
+      case SEL_OP_SEL:  p->SEL(dst, src0, src1); break;
+      case SEL_OP_SEL_INT64:
+        {
+          p->SEL(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+          p->SEL(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
+        }
+        break;
+      case SEL_OP_AND:  p->AND(dst, src0, src1, insn.extra.function); break;
+      case SEL_OP_OR:   p->OR (dst, src0, src1, insn.extra.function);  break;
+      case SEL_OP_XOR:  p->XOR(dst, src0, src1, insn.extra.function); break;
+      case SEL_OP_I64AND:
+        {
+          p->AND(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+          p->AND(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
+        }
+        break;
+      case SEL_OP_I64OR:
+        {
+          p->OR(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+          p->OR(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
+        }
+        break;
+      case SEL_OP_I64XOR:
+        {
+          p->XOR(dst.bottom_half(), src0.bottom_half(), src1.bottom_half());
+          p->XOR(dst.top_half(this->simdWidth), src0.top_half(this->simdWidth), src1.top_half(this->simdWidth));
+        }
+        break;
+      case SEL_OP_SHR:  p->SHR(dst, src0, src1); break;
+      case SEL_OP_SHL:  p->SHL(dst, src0, src1); break;
+      case SEL_OP_RSR:  p->RSR(dst, src0, src1); break;
+      case SEL_OP_RSL:  p->RSL(dst, src0, src1); break;
+      case SEL_OP_ASR:  p->ASR(dst, src0, src1); break;
+      case SEL_OP_ADD:  p->ADD(dst, src0, src1); break;
+      case SEL_OP_MUL:  p->MUL(dst, src0, src1); break;
+      case SEL_OP_MACH: p->MACH(dst, src0, src1); break;
+      case SEL_OP_UPSAMPLE_SHORT: p->UPSAMPLE_SHORT(dst, src0, src1); break;
+      case SEL_OP_UPSAMPLE_INT: p->UPSAMPLE_INT(dst, src0, src1); break;
+      case SEL_OP_UPSAMPLE_LONG:
+        {
+          GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
+                      xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL),
+                      xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL);
+          p->MOV(xdst.top_half(this->simdWidth), xsrc0.bottom_half());
+          p->MOV(xdst.bottom_half(), xsrc1.bottom_half());
+        }
+        break;
+      default: NOT_IMPLEMENTED;
+    }
+  }
+
+  void GenContext::collectShifter(GenRegister dest, GenRegister src) {
+    p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+    p->AND(dest, src.bottom_half(), GenRegister::immud(63));
+    p->pop();
+  }
+
+  void GenContext::I64FullAdd(GenRegister high1, GenRegister low1, GenRegister high2, GenRegister low2) {
+    addWithCarry(low1, low1, low2);
+    addWithCarry(high1, high1, high2);
+    p->ADD(high1, high1, low2);
+  }
+
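+  // Schoolbook 64x64 -> 128 bit multiply on 32-bit halves (a:b = x, c:d = y):
+  //   (a*2^32 + b) * (c*2^32 + d) = a*c*2^64 + (a*d + b*c)*2^32 + b*d
+  // dst1:dst2:dst3:dst4 (e:f:g:h below) receive the product from the most to the
+  // least significant 32-bit word.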
+  void GenContext::I64FullMult(GenRegister dst1, GenRegister dst2, GenRegister dst3, GenRegister dst4, GenRegister x_high, GenRegister x_low, GenRegister y_high, GenRegister y_low) {
+    GenRegister &e = dst1, &f = dst2, &g = dst3, &h = dst4,
+                &a = x_high, &b = x_low, &c = y_high, &d = y_low;
+    I32FullMult(e, h, b, d);
+    I32FullMult(f, g, a, d);
+    addWithCarry(g, g, e);
+    addWithCarry(f, f, e);
+    I32FullMult(e, d, b, c);
+    I64FullAdd(f, g, e, d);
+    I32FullMult(b, d, a, c);
+    I64FullAdd(e, f, b, d);
+  }
+
+  void GenContext::I64Neg(GenRegister high, GenRegister low, GenRegister tmp) {
+    p->NOT(high, high);
+    p->NOT(low, low);
+    p->MOV(tmp, GenRegister::immud(1));
+    addWithCarry(low, low, tmp);
+    p->ADD(high, high, tmp);
+  }
+
+  void GenContext::I64ABS(GenRegister sign, GenRegister high, GenRegister low, GenRegister tmp, GenRegister flagReg) {
+    p->SHR(sign, high, GenRegister::immud(31));
+    p->push();
+    p->curr.noMask = 1;
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+    p->CMP(GEN_CONDITIONAL_NZ, sign, GenRegister::immud(0));
+    p->curr.predicate = GEN_PREDICATE_NORMAL;
+    I64Neg(high, low, tmp);
+    p->pop();
+  }
+
+  void GenContext::emitI64MULHIInstruction(const SelectionInstruction &insn) {
+    GenRegister dest = ra->genReg(insn.dst(0));
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister a = ra->genReg(insn.dst(1));
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    GenRegister e = ra->genReg(insn.dst(5));
+    GenRegister f = ra->genReg(insn.dst(6));
+    GenRegister g = ra->genReg(insn.dst(7));
+    GenRegister h = ra->genReg(insn.dst(8));
+    GenRegister i = ra->genReg(insn.dst(9));
+    GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+    loadTopHalf(a, x);
+    loadBottomHalf(b, x);
+    loadTopHalf(c, y);
+    loadBottomHalf(d, y);
+    if(x.type == GEN_TYPE_UL) {
+      I64FullMult(e, f, g, h, a, b, c, d);
+    } else {
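+      // signed case: take |x| * |y|, then negate the 128-bit product when the
+      // operand signs differ (i holds sign(x) XOR sign(y) after the two I64ABS calls)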
+      I64ABS(e, a, b, i, flagReg);
+      I64ABS(f, c, d, i, flagReg);
+      p->XOR(i, e, f);
+      I64FullMult(e, f, g, h, a, b, c, d);
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_NZ, i, GenRegister::immud(0));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->NOT(e, e);
+      p->NOT(f, f);
+      p->NOT(g, g);
+      p->NOT(h, h);
+      p->MOV(i, GenRegister::immud(1));
+      addWithCarry(h, h, i);
+      addWithCarry(g, g, i);
+      addWithCarry(f, f, i);
+      p->ADD(e, e, i);
+      p->pop();
+    }
+    storeTopHalf(dest, e);
+    storeBottomHalf(dest, f);
+  }
+
+  void GenContext::emitI64MADSATInstruction(const SelectionInstruction &insn) {
+    GenRegister dest = ra->genReg(insn.dst(0));
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister z = ra->genReg(insn.src(2));
+    GenRegister a = ra->genReg(insn.dst(1));
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    GenRegister e = ra->genReg(insn.dst(5));
+    GenRegister f = ra->genReg(insn.dst(6));
+    GenRegister g = ra->genReg(insn.dst(7));
+    GenRegister h = ra->genReg(insn.dst(8));
+    GenRegister i = ra->genReg(insn.dst(9));
+    GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+    GenRegister zero = GenRegister::immud(0), one = GenRegister::immud(1);
+    loadTopHalf(a, x);
+    loadBottomHalf(b, x);
+    loadTopHalf(c, y);
+    loadBottomHalf(d, y);
+    if(x.type == GEN_TYPE_UL) {
+      I64FullMult(e, f, g, h, a, b, c, d);
+      loadTopHalf(c, z);
+      loadBottomHalf(d, z);
+      addWithCarry(h, h, d);
+      addWithCarry(g, g, d);
+      addWithCarry(f, f, d);
+      p->ADD(e, e, d);
+      addWithCarry(g, g, c);
+      addWithCarry(f, f, c);
+      p->ADD(e, e, c);
+      p->OR(a, e, f);
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_NZ, a, zero);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(g, GenRegister::immd(-1));
+      p->MOV(h, GenRegister::immd(-1));
+      p->pop();
+    } else {
+      I64ABS(e, a, b, i, flagReg);
+      I64ABS(f, c, d, i, flagReg);
+      p->XOR(i, e, f);
+      I64FullMult(e, f, g, h, a, b, c, d);
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_NZ, i, zero);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->NOT(e, e);
+      p->NOT(f, f);
+      p->NOT(g, g);
+      p->NOT(h, h);
+      p->MOV(i, one);
+      addWithCarry(h, h, i);
+      addWithCarry(g, g, i);
+      addWithCarry(f, f, i);
+      p->ADD(e, e, i);
+      p->pop();
+      loadTopHalf(c, z);
+      loadBottomHalf(d, z);
+      p->ASR(GenRegister::retype(b, GEN_TYPE_D), GenRegister::retype(c, GEN_TYPE_D), GenRegister::immd(31));
+      p->MOV(a, b);
+      addWithCarry(h, h, d);
+      addWithCarry(g, g, d);
+      addWithCarry(f, f, d);
+      p->ADD(e, e, d);
+      addWithCarry(g, g, c);
+      addWithCarry(f, f, c);
+      p->ADD(e, e, c);
+      addWithCarry(f, f, b);
+      p->ADD(e, e, b);
+      p->ADD(e, e, a);
+      p->MOV(b, zero);
+      p->push();
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->CMP(GEN_CONDITIONAL_NZ, e, zero);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(b, one);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_NZ, f, zero);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(b, one);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_G, g, GenRegister::immud(0x7FFFFFFF));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(b, one);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->SHR(a, e, GenRegister::immud(31));
+      p->CMP(GEN_CONDITIONAL_NZ, a, zero);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(b, zero);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_NZ, b, zero);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(g, GenRegister::immud(0x7FFFFFFF));
+      p->MOV(h, GenRegister::immud(0xFFFFFFFFu));
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->MOV(b, zero);
+      p->CMP(GEN_CONDITIONAL_NEQ, e, GenRegister::immud(0xFFFFFFFFu));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(b, one);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_NEQ, f, GenRegister::immud(0xFFFFFFFFu));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(b, one);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_LE, g, GenRegister::immud(0x7FFFFFFF));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(b, one);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_Z, a, zero);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(b, zero);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_NZ, b, zero);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(g, GenRegister::immud(0x80000000u));
+      p->MOV(h, zero);
+      p->pop();
+    }
+    storeTopHalf(dest, g);
+    storeBottomHalf(dest, h);
+  }
+
+  void GenContext::emitI64HADDInstruction(const SelectionInstruction &insn) {
+    GenRegister dest = ra->genReg(insn.dst(0));
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister a = ra->genReg(insn.dst(1));
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    a.type = b.type = c.type = d.type = GEN_TYPE_UD;
+    loadBottomHalf(a, x);
+    loadBottomHalf(b, y);
+    loadTopHalf(c, x);
+    loadTopHalf(d, y);
+    addWithCarry(a, a, b);
+    addWithCarry(c, c, b);
+    addWithCarry(c, c, d);
+    p->ADD(b, b, d);
+    p->SHR(a, a, GenRegister::immud(1));
+    p->SHL(d, c, GenRegister::immud(31));
+    p->OR(a, a, d);
+    p->SHR(c, c, GenRegister::immud(1));
+    p->SHL(d, b, GenRegister::immud(31));
+    p->OR(c, c, d);
+    storeBottomHalf(dest, a);
+    storeTopHalf(dest, c);
+  }
+
+  void GenContext::emitI64RHADDInstruction(const SelectionInstruction &insn) {
+    GenRegister dest = ra->genReg(insn.dst(0));
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister a = ra->genReg(insn.dst(1));
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    a.type = b.type = c.type = d.type = GEN_TYPE_UD;
+    loadBottomHalf(a, x);
+    loadBottomHalf(b, y);
+    addWithCarry(a, a, b);
+    p->MOV(c, GenRegister::immud(1));
+    addWithCarry(a, a, c);
+    p->ADD(b, b, c);
+    loadTopHalf(c, x);
+    loadTopHalf(d, y);
+    addWithCarry(c, c, b);
+    addWithCarry(c, c, d);
+    p->ADD(b, b, d);
+    p->SHR(a, a, GenRegister::immud(1));
+    p->SHL(d, c, GenRegister::immud(31));
+    p->OR(a, a, d);
+    p->SHR(c, c, GenRegister::immud(1));
+    p->SHL(d, b, GenRegister::immud(31));
+    p->OR(c, c, d);
+    storeBottomHalf(dest, a);
+    storeTopHalf(dest, c);
+  }
+
+  void GenContext::emitI64ShiftInstruction(const SelectionInstruction &insn) {
+    GenRegister dest = ra->genReg(insn.dst(0));
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister a = ra->genReg(insn.dst(1));
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    GenRegister e = ra->genReg(insn.dst(5));
+    GenRegister f = ra->genReg(insn.dst(6));
+    a.type = b.type = c.type = d.type = e.type = f.type = GEN_TYPE_UD;
+    GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+    GenRegister zero = GenRegister::immud(0);
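+    // Double-word shifts assembled from 32-bit shifts; sketch for I64SHL with shift
+    // amount s (assuming the hardware takes 32-bit shift counts modulo 32):
+    //   s == 0       : high = x_high,                              low = x_low
+    //   0 < s < 32   : high = (x_high << s) | (x_low >> (32 - s)), low = x_low << s
+    //   32 <= s < 64 : high = x_low << (s - 32),                   low = 0
+    // I64SHR and I64ASR mirror this towards the low half.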
+    switch(insn.opcode) {
+      case SEL_OP_I64SHL:
+        p->push();
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->curr.noMask = 1;
+        collectShifter(a, y);
+        loadBottomHalf(e, x);
+        loadTopHalf(f, x);
+        p->SHR(b, e, GenRegister::negate(a));
+        p->SHL(c, e, a);
+        p->SHL(d, f, a);
+        p->OR(e, d, b);
+        p->MOV(flagReg, GenRegister::immuw(0xFFFF));
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+        p->CMP(GEN_CONDITIONAL_Z, a, zero);
+        p->SEL(d, d, e);
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->AND(a, a, GenRegister::immud(32));
+        p->MOV(flagReg, GenRegister::immuw(0xFFFF));
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+        p->CMP(GEN_CONDITIONAL_Z, a, zero);
+        p->SEL(d, d, c);
+        p->SEL(c, c, zero);
+        p->pop();
+        storeBottomHalf(dest, c);
+        storeTopHalf(dest, d);
+        break;
+      case SEL_OP_I64SHR:
+        p->push();
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->curr.noMask = 1;
+        collectShifter(a, y);
+        loadBottomHalf(e, x);
+        loadTopHalf(f, x);
+        p->SHL(b, f, GenRegister::negate(a));
+        p->SHR(c, f, a);
+        p->SHR(d, e, a);
+        p->OR(e, d, b);
+        p->MOV(flagReg, GenRegister::immuw(0xFFFF));
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+        p->CMP(GEN_CONDITIONAL_Z, a, zero);
+        p->SEL(d, d, e);
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->AND(a, a, GenRegister::immud(32));
+        p->MOV(flagReg, GenRegister::immuw(0xFFFF));
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+        p->CMP(GEN_CONDITIONAL_Z, a, zero);
+        p->SEL(d, d, c);
+        p->SEL(c, c, zero);
+        p->pop();
+        storeBottomHalf(dest, d);
+        storeTopHalf(dest, c);
+        break;
+      case SEL_OP_I64ASR:
+        f.type = GEN_TYPE_D;
+        p->push();
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->curr.noMask = 1;
+        collectShifter(a, y);
+        loadBottomHalf(e, x);
+        loadTopHalf(f, x);
+        p->SHL(b, f, GenRegister::negate(a));
+        p->ASR(c, f, a);
+        p->SHR(d, e, a);
+        p->OR(e, d, b);
+        p->MOV(flagReg, GenRegister::immuw(0xFFFF));
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+        p->CMP(GEN_CONDITIONAL_Z, a, zero);
+        p->SEL(d, d, e);
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->AND(a, a, GenRegister::immud(32));
+        p->ASR(f, f, GenRegister::immd(31));
+        p->MOV(flagReg, GenRegister::immuw(0xFFFF));
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+        p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+        p->CMP(GEN_CONDITIONAL_Z, a, zero);
+        p->SEL(d, d, c);
+        p->SEL(c, c, f);
+        p->pop();
+        storeBottomHalf(dest, d);
+        storeTopHalf(dest, c);
+        break;
+      default:
+        NOT_IMPLEMENTED;
+    }
+  }
+
+  void GenContext::saveFlag(GenRegister dest, int flag, int subFlag) {
+    p->push();
+    p->curr.execWidth = 1;
+    p->MOV(dest, GenRegister::flag(flag, subFlag));
+    p->pop();
+  }
+
+  void GenContext::UnsignedI64ToFloat(GenRegister dst, GenRegister high, GenRegister low, GenRegister exp,
+                                            GenRegister mantissa, GenRegister tmp, GenRegister flag) {
+    uint32_t jip0, jip1;
+    GenRegister dst_ud = GenRegister::retype(dst, GEN_TYPE_UD);
+    p->push();
+      p->curr.noMask = 1;
+      p->MOV(exp, GenRegister::immud(32)); // make sure the inactive lanes compare true when the ALL8H/ALL16H condition is checked later
+    p->pop();
+    p->FBH(exp, high);
+    p->ADD(exp, GenRegister::negate(exp), GenRegister::immud(31));  //exp = 32 when high == 0
+    p->push();
+      p->curr.useFlag(flag.flag_nr(), flag.flag_subnr());
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(32));   //high == 0
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->curr.noMask = 0;
+      p->MOV(dst, low);
+      p->push();
+        if (simdWidth == 8)
+          p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+        else if (simdWidth == 16)
+          p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+        else
+          NOT_IMPLEMENTED;
+        p->curr.execWidth = 1;
+        p->curr.noMask = 1;
+        jip0 = p->n_instruction();
+        p->JMPI(GenRegister::immud(0));
+      p->pop();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->CMP(GEN_CONDITIONAL_G, exp, GenRegister::immud(23));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->CMP(GEN_CONDITIONAL_L, exp, GenRegister::immud(32));  //exp>23 && high!=0
+      p->ADD(tmp, exp, GenRegister::immud(-23));
+      p->SHR(mantissa, high, tmp);
+      p->AND(mantissa, mantissa, GenRegister::immud(0x7fffff));
+      p->SHR(dst_ud, low, tmp);   // dst is used as a temporary register here
+      p->ADD(tmp, GenRegister::negate(tmp), GenRegister::immud(32));
+      p->SHL(high, high, tmp);
+      p->OR(high, high, dst_ud);
+      p->SHL(low, low, tmp);
+      p->push();
+        if (simdWidth == 8)
+          p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+        else if (simdWidth == 16)
+          p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+        else
+          NOT_IMPLEMENTED;
+        p->curr.execWidth = 1;
+        p->curr.noMask = 1;
+        jip1 = p->n_instruction();
+        p->JMPI(GenRegister::immud(0));
+      p->pop();
+
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->CMP(GEN_CONDITIONAL_EQ, exp, GenRegister::immud(23));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(dst_ud, GenRegister::immud(0));   // exp == 23: the shift amount exp + 9 below would be 32, so use 0
+
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_L, exp, GenRegister::immud(23));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->ADD(tmp, exp, GenRegister::immud(9));
+      p->SHR(dst_ud, low, tmp);   // dst is used as a temporary register here
+
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_LE, exp, GenRegister::immud(23));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->ADD(tmp, GenRegister::negate(exp), GenRegister::immud(23));
+      p->SHL(mantissa, high, tmp);
+      p->OR(mantissa, mantissa, dst_ud);
+      p->AND(mantissa, mantissa, GenRegister::immud(0x7fffff));
+      p->SHL(high, low, tmp);
+      p->MOV(low, GenRegister::immud(0));
+
+      p->patchJMPI(jip1, (p->n_instruction() - jip1) );
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_LE, exp, GenRegister::immud(31));  //update dst where high != 0
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
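+      // the MSB of the 64-bit value sits at bit (exp + 32), so the biased IEEE-754
+      // exponent field is (exp + 32) + 127 == exp + 159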
+      p->ADD(exp, exp, GenRegister::immud(159));
+      p->SHL(exp, exp, GenRegister::immud(23));
+      p->OR(dst_ud, exp, mantissa);
+
+      p->CMP(GEN_CONDITIONAL_GE, high, GenRegister::immud(0x80000000));
+      p->ADD(dst_ud, dst_ud, GenRegister::immud(1));
+
+      p->CMP(GEN_CONDITIONAL_EQ, high, GenRegister::immud(0x80000000));
+      p->CMP(GEN_CONDITIONAL_EQ, low, GenRegister::immud(0x0));
+      p->AND(dst_ud, dst_ud, GenRegister::immud(0xfffffffe));
+      p->patchJMPI(jip0, (p->n_instruction() - jip0));
+
+    p->pop();
+
+  }
+
+  void GenContext::emitI64ToFloatInstruction(const SelectionInstruction &insn) {
+    GenRegister src = ra->genReg(insn.src(0));
+    GenRegister dest = ra->genReg(insn.dst(0));
+    GenRegister high = ra->genReg(insn.dst(1));
+    GenRegister low = ra->genReg(insn.dst(2));
+    GenRegister exp = ra->genReg(insn.dst(3));
+    GenRegister mantissa = ra->genReg(insn.dst(4));
+    GenRegister tmp = ra->genReg(insn.dst(5));
+    GenRegister tmp_high = ra->genReg(insn.dst(6));
+    GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+    loadTopHalf(high, src);
+    loadBottomHalf(low, src);
+    if(!src.is_signed_int()) {
+      UnsignedI64ToFloat(dest, high, low, exp, mantissa, tmp, flagReg);
+    } else {
+      p->MOV(tmp_high, high);
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_GE, tmp_high, GenRegister::immud(0x80000000));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->NOT(high, high);
+      p->NOT(low, low);
+      p->MOV(tmp, GenRegister::immud(1));
+      addWithCarry(low, low, tmp);
+      p->ADD(high, high, tmp);
+      p->pop();
+      UnsignedI64ToFloat(dest, high, low, exp, mantissa, tmp, flagReg);
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_GE, tmp_high, GenRegister::immud(0x80000000));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      dest.type = GEN_TYPE_UD;
+      p->OR(dest, dest, GenRegister::immud(0x80000000));
+      p->pop();
+    }
+  }
+
+
+  void GenContext::emitFloatToI64Instruction(const SelectionInstruction &insn) {
+    GenRegister src = ra->genReg(insn.src(0));
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister high = ra->genReg(insn.dst(1));
+    GenRegister tmp = ra->genReg(insn.dst(2));
+    GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+
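+    // Split as high = trunc(src * 2^-32) and low = |src - high * 2^32|; for signed
+    // destinations the predicated block below folds the pair back into two's complement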
+    if(dst.is_signed_int())
+      high = GenRegister::retype(high, GEN_TYPE_D);
+    GenRegister low = GenRegister::retype(tmp, GEN_TYPE_UD);
+    float c = (1.f / 65536.f) * (1.f / 65536.f);
+    p->MUL(tmp, src, GenRegister::immf(c));
+    p->RNDZ(tmp, tmp);
+    p->MOV(high, tmp);
+    c = 65536.f * 65536.f;
+    p->MOV(tmp, high);  // the saturated result in high may not equal the old tmp
+    // MOV from float to int/uint saturates, so we must subtract high * 2^32 here
+    p->MUL(tmp, tmp, GenRegister::immf(c));
+    p->ADD(tmp, src, GenRegister::negate(tmp));
+    p->MOV(low, GenRegister::abs(tmp));
+    if(dst.is_signed_int()) {
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_L, src, GenRegister::immf(0x0));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->CMP(GEN_CONDITIONAL_NEQ, low, GenRegister::immud(0x0));
+      p->ADD(high, high, GenRegister::immd(-1));
+      p->NOT(low, low);
+      p->ADD(low, low, GenRegister::immud(1));
+      p->pop();
+    }
+    storeTopHalf(dst, high);
+    storeBottomHalf(dst, low);
+  }
+
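+  // A scalar sketch of the split above, ignoring the saturation handling:
+  // the float is cut into a truncated high dword and an absolute-value low
+  // dword, then folded into two's complement when the input is negative.
+  // Reference only; the helper name is illustrative.
+  static inline void float_to_i64_ref(float f, int32_t &high, uint32_t &low) {
+    const float two32 = 65536.f * 65536.f;
+    high = (int32_t)(f * (1.f / two32));      // the cast truncates, like RNDZ
+    float rem = f - (float)high * two32;
+    low = (uint32_t)(rem < 0.f ? -rem : rem);
+    if (f < 0.f && low != 0u) {               // borrow from high, negate low
+      high -= 1;
+      low = ~low + 1u;
+    }
+  }
+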
+  void GenContext::emitI64CompareInstruction(const SelectionInstruction &insn) {
+    GenRegister src0 = ra->genReg(insn.src(0));
+    GenRegister src1 = ra->genReg(insn.src(1));
+    GenRegister tmp0 = ra->genReg(insn.dst(0));
+    GenRegister tmp1 = ra->genReg(insn.dst(1));
+    GenRegister tmp2 = ra->genReg(insn.dst(2));
+    tmp0.type = (src0.type == GEN_TYPE_L) ? GEN_TYPE_D : GEN_TYPE_UD;
+    tmp1.type = (src1.type == GEN_TYPE_L) ? GEN_TYPE_D : GEN_TYPE_UD;
+    int flag = p->curr.flag, subFlag = p->curr.subFlag;
+    GenRegister f1 = GenRegister::retype(tmp2, GEN_TYPE_UW);
+                f1.width = GEN_WIDTH_1;
+    GenRegister f2 = GenRegister::suboffset(f1, 1);
+    GenRegister f3 = GenRegister::suboffset(f1, 2);
+
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    loadTopHalf(tmp0, src0);
+    loadTopHalf(tmp1, src1);
+    switch(insn.extra.function) {
+      case GEN_CONDITIONAL_L:
+      case GEN_CONDITIONAL_LE:
+      case GEN_CONDITIONAL_G:
+      case GEN_CONDITIONAL_GE:
+        {
+          int cmpTopHalf = insn.extra.function;
+          if(insn.extra.function == GEN_CONDITIONAL_LE)
+            cmpTopHalf = GEN_CONDITIONAL_L;
+          if(insn.extra.function == GEN_CONDITIONAL_GE)
+            cmpTopHalf = GEN_CONDITIONAL_G;
+          p->CMP(cmpTopHalf, tmp0, tmp1);
+        }
+        saveFlag(f1, flag, subFlag);
+        p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
+        saveFlag(f2, flag, subFlag);
+        tmp0.type = tmp1.type = GEN_TYPE_UD;
+        loadBottomHalf(tmp0, src0);
+        loadBottomHalf(tmp1, src1);
+        p->CMP(insn.extra.function, tmp0, tmp1);
+        saveFlag(f3, flag, subFlag);
+        p->push();
+        p->curr.execWidth = 1;
+        p->AND(f2, f2, f3);
+        p->OR(f1, f1, f2);
+        p->pop();
+        break;
+      case GEN_CONDITIONAL_EQ:
+        p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
+        saveFlag(f1, flag, subFlag);
+        tmp0.type = tmp1.type = GEN_TYPE_UD;
+        loadBottomHalf(tmp0, src0);
+        loadBottomHalf(tmp1, src1);
+        p->CMP(GEN_CONDITIONAL_EQ, tmp0, tmp1);
+        saveFlag(f2, flag, subFlag);
+        p->push();
+        p->curr.execWidth = 1;
+        p->AND(f1, f1, f2);
+        p->pop();
+        break;
+      case GEN_CONDITIONAL_NEQ:
+        p->CMP(GEN_CONDITIONAL_NEQ, tmp0, tmp1);
+        saveFlag(f1, flag, subFlag);
+        tmp0.type = tmp1.type = GEN_TYPE_UD;
+        loadBottomHalf(tmp0, src0);
+        loadBottomHalf(tmp1, src1);
+        p->CMP(GEN_CONDITIONAL_NEQ, tmp0, tmp1);
+        saveFlag(f2, flag, subFlag);
+        p->push();
+        p->curr.execWidth = 1;
+        p->OR(f1, f1, f2);
+        p->pop();
+        break;
+      default:
+        NOT_IMPLEMENTED;
+    }
+    p->curr.execWidth = 1;
+    p->MOV(GenRegister::flag(flag, subFlag), f1);
+    p->pop();
+  }
+
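+  // The flag arithmetic above computes the usual two-word compare per lane:
+  // a strict compare on the high dwords, OR-ed with (high dwords equal AND
+  // the requested compare on the low dwords, always unsigned). Scalar sketch
+  // for the signed less-than case, reference only:
+  static inline bool i64_less_ref(int64_t x, int64_t y) {
+    int32_t  xh = (int32_t)(x >> 32),  yh = (int32_t)(y >> 32);
+    uint32_t xl = (uint32_t)x,         yl = (uint32_t)y;
+    return (xh < yh) || (xh == yh && xl < yl);
+  }
+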
+  void GenContext::emitI64SATADDInstruction(const SelectionInstruction &insn) {
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister a = ra->genReg(insn.dst(1));
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    GenRegister e = ra->genReg(insn.dst(5));
+    GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+    loadTopHalf(a, x);
+    loadBottomHalf(b, x);
+    loadTopHalf(c, y);
+    loadBottomHalf(d, y);
+    if(dst.is_signed_int())
+      p->SHR(e, a, GenRegister::immud(31));
+    addWithCarry(b, b, d);
+    addWithCarry(a, a, d);
+    addWithCarry(a, a, c);
+    p->ADD(c, c, d);
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+    if(! dst.is_signed_int()) {
+      p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(a, GenRegister::immud(0xFFFFFFFFu));
+      p->MOV(b, GenRegister::immud(0xFFFFFFFFu));
+    } else {
+      p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(1));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->CMP(GEN_CONDITIONAL_L, a, GenRegister::immud(0x80000000u));
+      p->MOV(a, GenRegister::immud(0x80000000u));
+      p->MOV(b, GenRegister::immud(0));
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->CMP(GEN_CONDITIONAL_GE, a, GenRegister::immud(0x80000000u));
+      p->MOV(a, GenRegister::immud(0x7FFFFFFFu));
+      p->MOV(b, GenRegister::immud(0xFFFFFFFFu));
+    }
+    p->pop();
+    storeTopHalf(dst, a);
+    storeBottomHalf(dst, b);
+  }
+
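+  // The semantics emulated above are those of a 64-bit saturating add:
+  // unsigned overflow clamps to all-ones, signed overflow clamps to
+  // INT64_MIN/INT64_MAX. Scalar sketch, reference only:
+  static inline uint64_t add_sat_u64_ref(uint64_t x, uint64_t y) {
+    uint64_t r = x + y;
+    return (r < x) ? ~0ull : r;               // carry out of the top saturates
+  }
+  static inline int64_t add_sat_s64_ref(int64_t x, int64_t y) {
+    uint64_t r = (uint64_t)x + (uint64_t)y;
+    // overflow iff x and y share a sign and the result's sign differs
+    if (((x ^ y) >= 0) && ((x ^ (int64_t)r) < 0))
+      return (x < 0) ? (int64_t)0x8000000000000000ull : 0x7fffffffffffffffll;
+    return (int64_t)r;
+  }
+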
+  void GenContext::emitI64SATSUBInstruction(const SelectionInstruction &insn) {
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister dst = ra->genReg(insn.dst(0));
+    GenRegister a = ra->genReg(insn.dst(1));
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    GenRegister e = ra->genReg(insn.dst(5));
+    GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+    loadTopHalf(a, x);
+    loadBottomHalf(b, x);
+    loadTopHalf(c, y);
+    loadBottomHalf(d, y);
+    if(dst.is_signed_int())
+      p->SHR(e, a, GenRegister::immud(31));
+    subWithBorrow(b, b, d);
+    subWithBorrow(a, a, d);
+    subWithBorrow(a, a, c);
+    p->ADD(c, c, d);
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+    if(! dst.is_signed_int()) {
+      p->CMP(GEN_CONDITIONAL_NZ, c, GenRegister::immud(0));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(a, GenRegister::immud(0));
+      p->MOV(b, GenRegister::immud(0));
+    } else {
+      p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(1));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->CMP(GEN_CONDITIONAL_L, a, GenRegister::immud(0x80000000u));
+      p->MOV(a, GenRegister::immud(0x80000000u));
+      p->MOV(b, GenRegister::immud(0));
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_EQ, e, GenRegister::immud(0));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->CMP(GEN_CONDITIONAL_GE, a, GenRegister::immud(0x80000000u));
+      p->MOV(a, GenRegister::immud(0x7FFFFFFFu));
+      p->MOV(b, GenRegister::immud(0xFFFFFFFFu));
+    }
+    p->pop();
+    storeTopHalf(dst, a);
+    storeBottomHalf(dst, b);
+  }
+
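+  // Same idea for saturating subtraction: unsigned underflow clamps to zero,
+  // signed overflow clamps to INT64_MIN/INT64_MAX. Scalar sketch, reference only:
+  static inline uint64_t sub_sat_u64_ref(uint64_t x, uint64_t y) {
+    return (x < y) ? 0ull : x - y;            // borrow out of the top clamps to 0
+  }
+  static inline int64_t sub_sat_s64_ref(int64_t x, int64_t y) {
+    uint64_t r = (uint64_t)x - (uint64_t)y;
+    // overflow iff x and y differ in sign and the result's sign differs from x
+    if (((x ^ y) < 0) && ((x ^ (int64_t)r) < 0))
+      return (x < 0) ? (int64_t)0x8000000000000000ull : 0x7fffffffffffffffll;
+    return (int64_t)r;
+  }
+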
+  void GenContext::loadTopHalf(GenRegister dest, GenRegister src) {
+    p->MOV(dest, src.top_half(this->simdWidth));
+  }
+
+  void GenContext::storeTopHalf(GenRegister dest, GenRegister src) {
+    p->MOV(dest.top_half(this->simdWidth), src);
+  }
+
+  void GenContext::loadBottomHalf(GenRegister dest, GenRegister src) {
+    p->MOV(dest, src.bottom_half());
+  }
+
+  void GenContext::storeBottomHalf(GenRegister dest, GenRegister src) {
+    p->MOV(dest.bottom_half(), src);
+  }
+
+  void GenContext::addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1) {
+    int execWidth = p->curr.execWidth;
+    GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
+    p->push();
+    p->curr.execWidth = 8;
+    p->ADDC(dest, src0, src1);
+    p->MOV(src1, acc0);
+    if (execWidth == 16) {
+      p->curr.quarterControl = 1;
+      p->ADDC(GenRegister::suboffset(dest, 8),
+              GenRegister::suboffset(src0, 8),
+              GenRegister::suboffset(src1, 8));
+      p->MOV(GenRegister::suboffset(src1, 8), acc0);
+    }
+    p->pop();
+  }
+
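+  // Per lane, addWithCarry behaves like this scalar helper: dest receives the
+  // 32-bit sum and src1 is overwritten with the carry-out (taken from acc0),
+  // which is how the 64-bit emulation chains additions. Reference only:
+  static inline void add_with_carry_ref(uint32_t &dest, uint32_t src0, uint32_t &src1) {
+    uint64_t sum = (uint64_t)src0 + src1;
+    dest = (uint32_t)sum;
+    src1 = (uint32_t)(sum >> 32);             // the carry out
+  }
+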
+  void GenContext::subWithBorrow(GenRegister dest, GenRegister src0, GenRegister src1) {
+    int execWidth = p->curr.execWidth;
+    GenRegister acc0 = GenRegister::retype(GenRegister::acc(), GEN_TYPE_D);
+    p->push();
+    p->curr.execWidth = 8;
+    p->SUBB(dest, src0, src1);
+    p->MOV(src1, acc0);
+    if (execWidth == 16) {
+      p->curr.quarterControl = 1;
+      p->SUBB(GenRegister::suboffset(dest, 8),
+              GenRegister::suboffset(src0, 8),
+              GenRegister::suboffset(src1, 8));
+      p->MOV(GenRegister::suboffset(src1, 8), acc0);
+    }
+    p->pop();
+  }
+
+  void GenContext::I32FullMult(GenRegister high, GenRegister low, GenRegister src0, GenRegister src1) {
+    GenRegister acc = GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD);
+    int execWidth = p->curr.execWidth;
+    p->push();
+    p->curr.execWidth = 8;
+    for(int i = 0; i < execWidth; i += 8) {
+      p->MUL(acc, src0, src1);
+      p->curr.accWrEnable = 1;
+      p->MACH(high, src0, src1);
+      p->curr.accWrEnable = 0;
+      p->MOV(low, acc);
+      src0 = GenRegister::suboffset(src0, 8);
+      src1 = GenRegister::suboffset(src1, 8);
+      high = GenRegister::suboffset(high, 8);
+      low = GenRegister::suboffset(low, 8);
+    }
+    p->pop();
+  }
+
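+  // I32FullMult yields the full 64-bit product of two 32-bit operands as a
+  // (high, low) pair; per lane it is simply the following (reference only):
+  static inline void full_mult_u32_ref(uint32_t &high, uint32_t &low,
+                                       uint32_t src0, uint32_t src1) {
+    uint64_t prod = (uint64_t)src0 * src1;
+    low  = (uint32_t)prod;                    // what MUL leaves in the accumulator
+    high = (uint32_t)(prod >> 32);            // what MACH computes
+  }
+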
+  void GenContext::emitI64MULInstruction(const SelectionInstruction &insn) {
+    GenRegister dest = ra->genReg(insn.dst(0));
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister a = ra->genReg(insn.dst(1));
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    GenRegister e = ra->genReg(insn.dst(5));
+    GenRegister f = ra->genReg(insn.dst(6));
+    a.type = b.type = c.type = d.type = e.type = f.type = GEN_TYPE_UD;
+    loadTopHalf(a, x);
+    loadBottomHalf(b, x);
+    loadTopHalf(c, y);
+    loadBottomHalf(d, y);
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), e, b, c);
+    I32FullMult(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), f, a, d);
+    p->ADD(e, e, f);
+    I32FullMult(f, a, b, d);
+    p->ADD(e, e, f);
+    p->pop();
+    storeTopHalf(dest, e);
+    storeBottomHalf(dest, a);
+  }
+
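+  // The three partial products above compute the low 64 bits of a 64x64
+  // multiply from the 32-bit halves x = (a:b), y = (c:d):
+  //   low  = lo32(b*d),  high = hi32(b*d) + lo32(b*c) + lo32(a*d)
+  // Scalar sketch, reference only:
+  static inline uint64_t mul64_ref(uint64_t x, uint64_t y) {
+    uint32_t a = (uint32_t)(x >> 32), b = (uint32_t)x;
+    uint32_t c = (uint32_t)(y >> 32), d = (uint32_t)y;
+    uint64_t bd = (uint64_t)b * d;
+    uint32_t high = (uint32_t)(bd >> 32) + b * c + a * d;  // wraps mod 2^32
+    return ((uint64_t)high << 32) | (uint32_t)bd;
+  }
+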
+  void GenContext::emitI64DIVREMInstruction(const SelectionInstruction &insn) {
+    GenRegister dest = ra->genReg(insn.dst(0));
+    GenRegister x = ra->genReg(insn.src(0));
+    GenRegister y = ra->genReg(insn.src(1));
+    GenRegister a = ra->genReg(insn.dst(1));
+    GenRegister b = ra->genReg(insn.dst(2));
+    GenRegister c = ra->genReg(insn.dst(3));
+    GenRegister d = ra->genReg(insn.dst(4));
+    GenRegister e = ra->genReg(insn.dst(5));
+    GenRegister f = ra->genReg(insn.dst(6));
+    GenRegister g = ra->genReg(insn.dst(7));
+    GenRegister h = ra->genReg(insn.dst(8));
+    GenRegister i = ra->genReg(insn.dst(9));
+    GenRegister j = ra->genReg(insn.dst(10));
+    GenRegister k = ra->genReg(insn.dst(11));
+    GenRegister l = ra->genReg(insn.dst(12));
+    GenRegister m = ra->genReg(insn.dst(13));
+    GBE_ASSERT(insn.state.flag == 0 && insn.state.subFlag == 1);
+    GenRegister flagReg = GenRegister::flag(insn.state.flag, insn.state.subFlag);
+    GenRegister zero = GenRegister::immud(0),
+                one = GenRegister::immud(1),
+                imm31 = GenRegister::immud(31);
+    uint32_t jip0;
+    // (a,b) <- x
+    loadTopHalf(a, x);
+    loadBottomHalf(b, x);
+    // (c,d) <- y
+    loadTopHalf(c, y);
+    loadBottomHalf(d, y);
+    // k <- sign_of_result
+    if(x.is_signed_int()) {
+      GBE_ASSERT(y.is_signed_int());
+      GBE_ASSERT(dest.is_signed_int());
+      I64ABS(k, a, b, e, flagReg);
+      I64ABS(l, c, d, e, flagReg);
+      if(insn.opcode == SEL_OP_I64DIV)
+        p->XOR(k, k, l);
+    }
+    // (e,f) <- 0
+    p->MOV(e, zero);
+    p->MOV(f, zero);
+    // (g,h) <- 2**63
+    p->MOV(g, GenRegister::immud(0x80000000));
+    p->MOV(h, zero);
+    // (i,j) <- 0
+    p->MOV(i, zero);
+    p->MOV(j, zero);
+    // m <- 0
+    p->MOV(m, zero);
+    {
+      uint32_t loop_start = p->n_instruction();
+      // (c,d,e,f) <- (c,d,e,f) / 2
+      p->SHR(f, f, one);
+      p->SHL(l, e, imm31);
+      p->OR(f, f, l);
+      p->SHR(e, e, one);
+      p->SHL(l, d, imm31);
+      p->OR(e, e, l);
+      p->SHR(d, d, one);
+      p->SHL(l, c, imm31);
+      p->OR(d, d, l);
+      p->SHR(c, c, one);
+      // condition <- (c,d)==0 && (a,b)>=(e,f)
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->MOV(l, zero);
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_EQ, a, e);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->CMP(GEN_CONDITIONAL_GE, b, f);
+      p->MOV(l, one);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_G, a, e);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->MOV(l, one);
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->CMP(GEN_CONDITIONAL_NEQ, l, zero);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      p->CMP(GEN_CONDITIONAL_EQ, c, zero);
+      p->CMP(GEN_CONDITIONAL_EQ, d, zero);
+      // under condition, (a,b) <- (a,b) - (e,f)
+      p->MOV(l, f);
+      subWithBorrow(b, b, l);
+      subWithBorrow(a, a, l);
+      p->MOV(l, e);
+      subWithBorrow(a, a, l);
+      // under condition, (i,j) <- (i,j) | (g,h)
+      p->OR(i, i, g);
+      p->OR(j, j, h);
+      p->pop();
+      // (g,h) /= 2
+      p->SHR(h, h, one);
+      p->SHL(l, g, imm31);
+      p->OR(h, h, l);
+      p->SHR(g, g, one);
+      // condition: m < 64
+      p->ADD(m, m, one);
+
+      p->push();
+      p->curr.noMask = 1;
+      p->curr.execWidth = 1;
+      p->MOV(flagReg, zero);
+      p->pop();
+
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 0;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_L, m, GenRegister::immud(64));
+
+      p->curr.execWidth = 1;
+      p->curr.noMask = 1;
+      // under condition, jump back to start point
+      if (simdWidth == 8)
+        p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+      else if (simdWidth == 16)
+        p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+      else
+        NOT_IMPLEMENTED;
+      int distance = -(int)(p->n_instruction() - loop_start );
+      p->curr.noMask = 1;
+      jip0 = p->n_instruction();
+      p->JMPI(zero);
+      p->patchJMPI(jip0, distance);
+      p->pop();
+      // end of loop
+    }
+    // adjust sign of result
+    if(x.is_signed_int()) {
+      p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
+      p->CMP(GEN_CONDITIONAL_NEQ, k, zero);
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      if(insn.opcode == SEL_OP_I64DIV)
+        I64Neg(i, j, l);
+      else
+        I64Neg(a, b, l);
+      p->pop();
+    }
+    // write dest
+    if(insn.opcode == SEL_OP_I64DIV) {
+      storeTopHalf(dest, i);
+      storeBottomHalf(dest, j);
+    } else {
+      GBE_ASSERT(insn.opcode == SEL_OP_I64REM);
+      storeTopHalf(dest, a);
+      storeBottomHalf(dest, b);
+    }
+  }
+
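+  // The loop above is a restoring division on the magnitudes: the divisor
+  // starts shifted left by 64 bits and moves right once per iteration, and
+  // whenever it fits under the remainder the matching quotient bit is set.
+  // An equivalent scalar form (shifting the remainder left instead of the
+  // divisor right), reference only and assuming y != 0:
+  static inline void divrem_u64_ref(uint64_t x, uint64_t y,
+                                    uint64_t &quot, uint64_t &rem) {
+    quot = 0;
+    rem  = 0;
+    for (int bit = 63; bit >= 0; --bit) {
+      rem = (rem << 1) | ((x >> bit) & 1);    // bring down the next dividend bit
+      if (rem >= y) {                         // divisor fits: set this quotient bit
+        rem -= y;
+        quot |= 1ull << bit;
+      }
+    }
+  }
+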
+  void GenContext::emitTernaryInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src0 = ra->genReg(insn.src(0));
+    const GenRegister src1 = ra->genReg(insn.src(1));
+    const GenRegister src2 = ra->genReg(insn.src(2));
+    switch (insn.opcode) {
+      case SEL_OP_MAD:  p->MAD(dst, src0, src1, src2); break;
+      default: NOT_IMPLEMENTED;
+    }
+  }
+
+  void GenContext::emitNoOpInstruction(const SelectionInstruction &insn) {
+   p->NOP();
+  }
+
+  void GenContext::emitWaitInstruction(const SelectionInstruction &insn) {
+    p->WAIT();
+  }
+
+  void GenContext::emitBarrierInstruction(const SelectionInstruction &insn) {
+    const GenRegister src = ra->genReg(insn.src(0));
+    const GenRegister fenceDst = ra->genReg(insn.dst(0));
+    uint32_t barrierType = insn.extra.barrierType;
+    const GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+
+    if (barrierType == ir::syncGlobalBarrier) {
+      p->FENCE(fenceDst);
+      p->MOV(fenceDst, fenceDst);
+    }
+    p->push();
+      // As only payload.2 is used and all the other regions are ignored,
+      // SIMD8 mode is safe here.
+      p->curr.execWidth = 8;
+      p->curr.physicalFlag = 0;
+      p->curr.noMask = 1;
+      // Copy barrier id from r0.
+      p->AND(src, barrierId, GenRegister::immud(0x0f000000));
+      // A single barrier message is enough to start both the thread synchronization *and* the SLM fence
+      p->BARRIER(src);
+      p->curr.execWidth = 1;
+      // Now we wait for the other threads
+      p->WAIT();
+    p->pop();
+  }
+
+  void GenContext::emitFenceInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    p->FENCE(dst);
+    p->MOV(dst, dst);
+  }
+
+  void GenContext::emitMathInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src0 = ra->genReg(insn.src(0));
+    const uint32_t function = insn.extra.function;
+    if (insn.srcNum == 2) {
+      const GenRegister src1 = ra->genReg(insn.src(1));
+      p->MATH(dst, function, src0, src1);
+    } else
+      p->MATH(dst, function, src0);
+  }
+
+  void GenContext::emitCompareInstruction(const SelectionInstruction &insn) {
+    const GenRegister src0 = ra->genReg(insn.src(0));
+    const GenRegister src1 = ra->genReg(insn.src(1));
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    if (insn.opcode == SEL_OP_CMP)
+      p->CMP(insn.extra.function, src0, src1, dst);
+    else {
+      GBE_ASSERT(insn.opcode == SEL_OP_SEL_CMP);
+      const GenRegister dst = ra->genReg(insn.dst(0));
+      p->SEL_CMP(insn.extra.function, dst, src0, src1);
+    }
+  }
+
+  void GenContext::emitAtomicInstruction(const SelectionInstruction &insn) {
+    const GenRegister src = ra->genReg(insn.src(0));
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const uint32_t function = insn.extra.function;
+    const uint32_t bti = insn.getbti();
+
+    p->ATOMIC(dst, function, src, bti, insn.srcNum);
+  }
+
+  void GenContext::emitIndirectMoveInstruction(const SelectionInstruction &insn) {
+    GenRegister src = ra->genReg(insn.src(0));
+    if(sel->isScalarReg(src.reg()))
+      src = GenRegister::retype(src, GEN_TYPE_UW);
+    else
+      src = GenRegister::unpacked_uw(src.nr, src.subnr / typeSize(GEN_TYPE_UW));
+
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister a0 = GenRegister::addr8(0);
+    uint32_t simdWidth = p->curr.execWidth;
+
+    p->push();
+      p->curr.execWidth = 8;
+      p->curr.quarterControl = GEN_COMPRESSION_Q1;
+      p->MOV(a0, src);
+      p->MOV(dst, GenRegister::indirect(dst.type, 0, GEN_WIDTH_8));
+    p->pop();
+
+    if (simdWidth == 16) {
+      p->push();
+        p->curr.execWidth = 8;
+        p->curr.quarterControl = GEN_COMPRESSION_Q2;
+
+        const GenRegister nextDst = GenRegister::Qn(dst, 1);
+        const GenRegister nextSrc = GenRegister::Qn(src, 1);
+        p->MOV(a0, nextSrc);
+        p->MOV(nextDst, GenRegister::indirect(dst.type, 0, GEN_WIDTH_8));
+      p->pop();
+    }
+  }
+
+ void GenContext::insertJumpPos(const SelectionInstruction &insn) {
+    const ir::LabelIndex label(insn.index);
+    this->branchPos2.push_back(std::make_pair(label, p->store.size()));
+ }
+
+  void GenContext::emitJumpInstruction(const SelectionInstruction &insn) {
+    insertJumpPos(insn);
+    const GenRegister src = ra->genReg(insn.src(0));
+    p->JMPI(src, insn.extra.longjmp);
+  }
+
+  void GenContext::emitEotInstruction(const SelectionInstruction &insn) {
+    p->push();
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->curr.noMask = 1;
+      p->MOV(GenRegister::ud8grf(112, 0), GenRegister::ud8grf(0, 0));
+      p->curr.execWidth = 8;
+      p->EOT(112);
+    p->pop();
+  }
+
+  void GenContext::emitSpillRegInstruction(const SelectionInstruction &insn) {
+    uint32_t simdWidth = p->curr.execWidth;
+    uint32_t scratchOffset = insn.extra.scratchOffset;
+    const uint32_t header = insn.extra.scratchMsgHeader;
+    p->push();
+
+    const GenRegister msg = GenRegister::ud8grf(header, 0);
+    const GenRegister src = ra->genReg(insn.src(0));
+    GenRegister payload = src;
+    payload.nr = header + 1;
+    payload.subnr = 0;
+
+    GBE_ASSERT(src.subnr == 0);
+    uint32_t regType = insn.src(0).type;
+    uint32_t size = typeSize(regType);
+    uint32_t regSize = stride(src.hstride)*size;
+
+    GBE_ASSERT(regSize == 4 || regSize == 8);
+    if(regSize == 4) {
+      if (payload.nr != src.nr)
+        p->MOV(payload, src);
+      uint32_t regNum = (regSize*simdWidth) > 32 ? 2 : 1;
+      this->scratchWrite(msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+    }
+    else { //size == 8
+      payload.type = GEN_TYPE_UD;
+      GBE_ASSERT(payload.hstride == GEN_HORIZONTAL_STRIDE_1);
+      loadBottomHalf(payload, src);
+      uint32_t regNum = (regSize/2*simdWidth) > 32 ? 2 : 1;
+      this->scratchWrite(msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+      loadTopHalf(payload, src);
+      this->scratchWrite(msg, scratchOffset + 4*simdWidth, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+    }
+    p->pop();
+  }
+
+  void GenContext::emitUnSpillRegInstruction(const SelectionInstruction &insn) {
+    uint32_t scratchOffset = insn.extra.scratchOffset;
+    const GenRegister dst = insn.dst(0);
+    uint32_t regType = dst.type;
+    uint32_t simdWidth = p->curr.execWidth;
+    const uint32_t header = insn.extra.scratchMsgHeader;
+    uint32_t size = typeSize(regType);
+    uint32_t regSize = stride(dst.hstride)*size;
+
+    const GenRegister msg = GenRegister::ud8grf(header, 0);
+    GenRegister payload = msg;
+    payload.nr = header + 1;
+
+    p->push();
+    assert(regSize == 4 || regSize == 8);
+    if(regSize == 4) {
+      uint32_t regNum = (regSize*simdWidth) > 32 ? 2 : 1;
+      this->scratchRead(GenRegister::ud8grf(dst.nr, dst.subnr), msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+    } else {
+      uint32_t regNum = (regSize/2*simdWidth) > 32 ? 2 : 1;
+      this->scratchRead(payload, msg, scratchOffset, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+      storeBottomHalf(dst, payload);
+      this->scratchRead(payload, msg, scratchOffset + 4*simdWidth, regNum, GEN_TYPE_UD, GEN_SCRATCH_CHANNEL_MODE_DWORD);
+      storeTopHalf(dst, payload);
+    }
+    p->pop();
+  }
+
+  void GenContext::emitRead64Instruction(const SelectionInstruction &insn) {
+    const uint32_t elemNum = insn.extra.elem;
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src = ra->genReg(insn.src(0));
+    const uint32_t bti = insn.getbti();
+    p->UNTYPED_READ(dst, src, bti, elemNum*2);
+  }
+
+  void GenContext::emitUntypedReadInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src = ra->genReg(insn.src(0));
+    const uint32_t bti = insn.getbti();
+    const uint32_t elemNum = insn.extra.elem;
+    p->UNTYPED_READ(dst, src, bti, elemNum);
+  }
+
+  void GenContext::emitWrite64Instruction(const SelectionInstruction &insn) {
+    const GenRegister src = ra->genReg(insn.dst(0));
+    const uint32_t elemNum = insn.extra.elem;
+    const uint32_t bti = insn.getbti();
+    p->UNTYPED_WRITE(src, bti, elemNum*2);
+  }
+
+  void GenContext::emitUntypedWriteInstruction(const SelectionInstruction &insn) {
+    const GenRegister src = ra->genReg(insn.src(0));
+    const uint32_t bti = insn.getbti();
+    const uint32_t elemNum = insn.extra.elem;
+    p->UNTYPED_WRITE(src, bti, elemNum);
+  }
+
+  void GenContext::emitByteGatherInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src = ra->genReg(insn.src(0));
+    const uint32_t bti = insn.getbti();
+    const uint32_t elemSize = insn.extra.elem;
+    p->BYTE_GATHER(dst, src, bti, elemSize);
+  }
+
+  void GenContext::emitByteScatterInstruction(const SelectionInstruction &insn) {
+    const GenRegister src = ra->genReg(insn.src(0));
+    const uint32_t bti = insn.getbti();
+    const uint32_t elemSize = insn.extra.elem;
+    p->BYTE_SCATTER(src, bti, elemSize);
+  }
+
+  void GenContext::emitUnpackByteInstruction(const SelectionInstruction &insn) {
+    const GenRegister src = ra->genReg(insn.src(0));
+    for(uint32_t i = 0; i < insn.dstNum; i++) {
+      p->MOV(ra->genReg(insn.dst(i)), GenRegister::splitReg(src, insn.dstNum, i));
+    }
+  }
+
+  void GenContext::emitPackByteInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    p->push();
+    if(simdWidth == 8) {
+      for(uint32_t i = 0; i < insn.srcNum; i++)
+        p->MOV(GenRegister::splitReg(dst, insn.srcNum, i), ra->genReg(insn.src(i)));
+    } else {
+      // when the destination expands to two registers, the source must span two registers as well.
+      p->curr.execWidth = 8;
+      for(uint32_t i = 0; i < insn.srcNum; i++) {
+        GenRegister dsti = GenRegister::splitReg(dst, insn.srcNum, i);
+        GenRegister src = ra->genReg(insn.src(i));
+
+        p->curr.quarterControl = 0;
+        p->MOV(dsti, src);
+        p->curr.quarterControl = 1;
+        p->MOV(GenRegister::Qn(dsti,1), GenRegister::Qn(src, 1));
+      }
+    }
+    p->pop();
+  }
+
+  void GenContext::emitDWordGatherInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister src = ra->genReg(insn.src(0));
+    const uint32_t bti = insn.getbti();
+    p->DWORD_GATHER(dst, src, bti);
+  }
+
+  void GenContext::emitSampleInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister msgPayload = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_F);
+    const unsigned char bti = insn.getbti();
+    const unsigned char sampler = insn.extra.sampler;
+    const unsigned int msgLen = insn.extra.rdmsglen;
+    uint32_t simdWidth = p->curr.execWidth;
+    p->SAMPLE(dst, msgPayload, msgLen, false, bti, sampler, simdWidth, -1, 0, insn.extra.isLD, insn.extra.isUniform);
+  }
+
+  void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
+    p->push();
+    uint32_t simdWidth = p->curr.execWidth;
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+
+    p->curr.execWidth = 8;
+    p->MOV(header, GenRegister::ud8grf(0,0));
+    p->pop();
+
+    int size = typeSize(reg_type)*simdWidth;
+    p->push();
+    p->SCRATCH_WRITE(header, offset/32, size, reg_num, channel_mode);
+    p->pop();
+  }
+
+  void GenContext::scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
+    p->push();
+    uint32_t simdWidth = p->curr.execWidth;
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    p->curr.execWidth = 8;
+    p->MOV(header, GenRegister::ud8grf(0,0));
+    p->pop();
+
+    int size = typeSize(reg_type)*simdWidth;
+    p->push();
+    p->SCRATCH_READ(dst, header, offset/32, size, reg_num, channel_mode);
+    p->pop();
+  }
+
+  void GenContext::emitTypedWriteInstruction(const SelectionInstruction &insn) {
+    const GenRegister header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD);
+    const uint32_t bti = insn.getbti();
+    p->TYPED_WRITE(header, true, bti);
+  }
+
+  BVAR(OCL_OUTPUT_REG_ALLOC, false);
+  BVAR(OCL_OUTPUT_ASM, false);
+
+  void GenContext::allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue) {
+    uint32_t regSize;
+    regSize = this->ra->getRegSize(reg);
+    insertCurbeReg(reg, newCurbeEntry(value, subValue, regSize));
+  }
+
+  void GenContext::buildPatchList(void) {
+    const uint32_t ptrSize = unit.getPointerSize() == ir::POINTER_32_BITS ? 4u : 8u;
+    kernel->curbeSize = 0u;
+    auto &stackUse = dag->getUse(ir::ocl::stackptr);
+
+    // We insert the block IP mask first
+    using namespace ir::ocl;
+    allocCurbeReg(blockip, GBE_CURBE_BLOCK_IP);
+    allocCurbeReg(lid0, GBE_CURBE_LOCAL_ID_X);
+    allocCurbeReg(lid1, GBE_CURBE_LOCAL_ID_Y);
+    allocCurbeReg(lid2, GBE_CURBE_LOCAL_ID_Z);
+    allocCurbeReg(zero, GBE_CURBE_ZERO);
+    allocCurbeReg(one, GBE_CURBE_ONE);
+    if (stackUse.size() != 0)
+      allocCurbeReg(stackbuffer, GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER);
+    allocSLMOffsetCurbe();
+    // Go over the arguments and find the related patch locations
+    const uint32_t argNum = fn.argNum();
+    for (uint32_t argID = 0u; argID < argNum; ++argID) {
+      const ir::FunctionArgument &arg = fn.getArg(argID);
+      // For pointers and values, we have nothing to do. We just push the values
+      if (arg.type == ir::FunctionArgument::GLOBAL_POINTER ||
+          arg.type == ir::FunctionArgument::LOCAL_POINTER ||
+          arg.type == ir::FunctionArgument::CONSTANT_POINTER ||
+          arg.type == ir::FunctionArgument::VALUE ||
+          arg.type == ir::FunctionArgument::STRUCTURE ||
+          arg.type == ir::FunctionArgument::IMAGE ||
+          arg.type == ir::FunctionArgument::SAMPLER)
+        this->insertCurbeReg(arg.reg, this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, arg.size, ptrSize));
+    }
+
+    // Go over all the instructions and find the special registers we need
+    // to push
+    #define INSERT_REG(SPECIAL_REG, PATCH) \
+    if (reg == ir::ocl::SPECIAL_REG) { \
+      if (curbeRegs.find(reg) != curbeRegs.end()) continue; \
+      allocCurbeReg(reg, GBE_CURBE_##PATCH); \
+    } else
+  
+    fn.foreachInstruction([&](ir::Instruction &insn) {
+      const uint32_t srcNum = insn.getSrcNum();
+      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+        const ir::Register reg = insn.getSrc(srcID);
+        if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) {
+          if (srcID != 0) continue;
+          const unsigned char bti = ir::cast<ir::GetImageInfoInstruction>(insn).getImageIndex();
+          const unsigned char type = ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();
+          ir::ImageInfoKey key(bti, type);
+          const ir::Register imageInfo = insn.getSrc(0);
+          if (curbeRegs.find(imageInfo) == curbeRegs.end()) {
+            uint32_t offset = this->getImageInfoCurbeOffset(key, 4);
+            insertCurbeReg(imageInfo, offset);
+          }
+          continue;
+        }
+        if (fn.isSpecialReg(reg) == false) continue;
+        if (curbeRegs.find(reg) != curbeRegs.end()) continue;
+        if (reg == ir::ocl::stackptr) GBE_ASSERT(stackUse.size() > 0);
+        INSERT_REG(lsize0, LOCAL_SIZE_X)
+        INSERT_REG(lsize1, LOCAL_SIZE_Y)
+        INSERT_REG(lsize2, LOCAL_SIZE_Z)
+        INSERT_REG(gsize0, GLOBAL_SIZE_X)
+        INSERT_REG(gsize1, GLOBAL_SIZE_Y)
+        INSERT_REG(gsize2, GLOBAL_SIZE_Z)
+        INSERT_REG(goffset0, GLOBAL_OFFSET_X)
+        INSERT_REG(goffset1, GLOBAL_OFFSET_Y)
+        INSERT_REG(goffset2, GLOBAL_OFFSET_Z)
+        INSERT_REG(workdim, WORK_DIM)
+        INSERT_REG(numgroup0, GROUP_NUM_X)
+        INSERT_REG(numgroup1, GROUP_NUM_Y)
+        INSERT_REG(numgroup2, GROUP_NUM_Z)
+        INSERT_REG(stackptr, STACK_POINTER)
+        INSERT_REG(printfbptr, PRINTF_BUF_POINTER)
+        INSERT_REG(printfiptr, PRINTF_INDEX_POINTER)
+        do {} while(0);
+      }
+    });
+#undef INSERT_REG
+
+
+    // After this point the vector is immutable. Sorting it will make
+    // searching it faster.
+    std::sort(kernel->patches.begin(), kernel->patches.end());
+
+    kernel->curbeSize = ALIGN(kernel->curbeSize, GEN_REG_SIZE);
+  }
+
+  bool GenContext::emitCode(void) {
+    GenKernel *genKernel = static_cast<GenKernel*>(this->kernel);
+    buildPatchList();
+    sel->select();
+    schedulePreRegAllocation(*this, *this->sel);
+    if (UNLIKELY(ra->allocate(*this->sel) == false))
+      return false;
+    schedulePostRegAllocation(*this, *this->sel);
+    if (OCL_OUTPUT_REG_ALLOC)
+      ra->outputAllocation();
+    this->clearFlagRegister();
+    this->emitStackPointer();
+    this->emitSLMOffset();
+    this->emitInstructionStream();
+    if (this->patchBranches() == false)
+      return false;
+    genKernel->insnNum = p->store.size();
+    genKernel->insns = GBE_NEW_ARRAY_NO_ARG(GenInstruction, genKernel->insnNum);
+    std::memcpy(genKernel->insns, &p->store[0], genKernel->insnNum * sizeof(GenInstruction));
+    if (OCL_OUTPUT_ASM) {
+      std::cout << genKernel->getName() << "'s disassembly begins:" << std::endl;
+      ir::LabelIndex curLabel = (ir::LabelIndex)0;
+      GenCompactInstruction * pCom = NULL;
+      GenNativeInstruction insn;
+      std::cout << "  L0:" << std::endl;
+      for (uint32_t insnID = 0; insnID < genKernel->insnNum; ) {
+        if (labelPos.find((ir::LabelIndex)(curLabel + 1))->second == insnID &&
+            curLabel < this->getFunction().labelNum()) {
+          std::cout << "  L" << curLabel + 1 << ":" << std::endl;
+          curLabel = (ir::LabelIndex)(curLabel + 1);
+        }
+        std::cout << "    (" << std::setw(8) << insnID << ")  ";
+        pCom = (GenCompactInstruction*)&p->store[insnID];
+        if(pCom->bits1.cmpt_control == 1) {
+          decompactInstruction(pCom, &insn);
+          gen_disasm(stdout, &insn, deviceID, 1);
+          insnID++;
+        } else {
+          gen_disasm(stdout, &p->store[insnID], deviceID, 0);
+          insnID = insnID + 2;
+        }
+      }
+      std::cout << genKernel->getName() << "'s disassembly ends." << std::endl;
+    }
+    return true;
+  }
+
+  Kernel *GenContext::allocateKernel(void) {
+    return GBE_NEW(GenKernel, name, deviceID);
+  }
+
+} /* namespace gbe */
+
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
new file mode 100644
index 0000000..02c83d0
--- /dev/null
+++ b/backend/src/backend/gen_context.hpp
@@ -0,0 +1,224 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file gen_context.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_GEN_CONTEXT_HPP__
+#define __GBE_GEN_CONTEXT_HPP__
+
+#include "backend/context.hpp"
+#include "backend/gen_encoder.hpp"
+#include "backend/program.h"
+#include "backend/gen_register.hpp"
+#include "ir/function.hpp"
+#include "ir/liveness.hpp"
+#include "sys/map.hpp"
+#include <string>
+
+namespace gbe
+{
+  class Kernel;               // We build this structure
+  class GenEncoder;           // Helps emitting Gen ISA
+  class GenRegAllocator;      // Handle the register allocation
+  class Selection;            // Performs instruction selection
+  class SelectionInstruction; // Pre-RA Gen instruction
+  class SelectionReg;         // Pre-RA Gen register
+  class GenRegister;
+  typedef enum {
+    NO_ERROR,
+    REGISTER_ALLOCATION_FAIL,
+    REGISTER_SPILL_EXCEED_THRESHOLD,
+    REGISTER_SPILL_FAIL,
+    OUT_OF_RANGE_IF_ENDIF,
+  } CompileErrorCode;
+
+  /*! Context is the helper structure to build the Gen ISA or simulation code
+   *  from GenIR
+   */
+  class GenContext : public Context
+  {
+  public:
+    /*! Create a new context. name is the name of the function we want to
+     *  compile
+     */
+    GenContext(const ir::Unit &unit, const std::string &name, uint32_t deviceID,
+               bool relaxMath = false);
+    /*! Release all allocated resources */
+    virtual ~GenContext(void);
+    /*! Device's max scratch buffer size */
+    #define GEN7_SCRATCH_SIZE  (12 * KB)
+    /*! Start new code generation with specific parameters */
+    void startNewCG(uint32_t simdWidth, uint32_t reservedSpillRegs, bool limitRegisterPressure);
+    /*! Target device ID*/
+    uint32_t deviceID;
+    /*! Implements base class */
+    virtual bool emitCode(void);
+    /*! Align the scratch size to the device's scratch unit size */
+    virtual uint32_t alignScratchSize(uint32_t size);
+    /*! Get the device's max scratch size */
+    virtual uint32_t getScratchSize(void) { return GEN7_SCRATCH_SIZE; }
+    /*! Function we emit code for */
+    INLINE const ir::Function &getFunction(void) const { return fn; }
+    /*! Simd width chosen for the current function */
+    INLINE uint32_t getSimdWidth(void) const { return simdWidth; }
+    void clearFlagRegister(void);
+    /*! Check the flag reg; if it is a GRF, use f0.1 instead */
+    GenRegister checkFlagRegister(GenRegister flagReg);
+    /*! Emit the per-lane stack pointer computation */
+    virtual void emitStackPointer(void);
+    /*! Emit the instructions */
+    void emitInstructionStream(void);
+    /*! Set the correct target values for the branches */
+    bool patchBranches(void);
+    /*! Forward ir::Function isSpecialReg method */
+    INLINE bool isSpecialReg(ir::Register reg) const {
+      return fn.isSpecialReg(reg);
+    }
+    /*! Get the liveOut information for the given block */
+    INLINE const ir::Liveness::LiveOut &getLiveOut(const ir::BasicBlock *bb) const {
+      return this->liveness->getLiveOut(bb);
+    }
+    /*! Get the LiveIn information for the given block */
+    INLINE const ir::Liveness::UEVar &getLiveIn(const ir::BasicBlock *bb) const {
+      return this->liveness->getLiveIn(bb);
+    }
+
+    void collectShifter(GenRegister dest, GenRegister src);
+    void loadTopHalf(GenRegister dest, GenRegister src);
+    void storeTopHalf(GenRegister dest, GenRegister src);
+
+    void loadBottomHalf(GenRegister dest, GenRegister src);
+    void storeBottomHalf(GenRegister dest, GenRegister src);
+
+    void addWithCarry(GenRegister dest, GenRegister src0, GenRegister src1);
+    void subWithBorrow(GenRegister dest, GenRegister src0, GenRegister src1);
+    void I64Neg(GenRegister high, GenRegister low, GenRegister tmp);
+    void I64ABS(GenRegister sign, GenRegister high, GenRegister low, GenRegister tmp, GenRegister flagReg);
+    void I64FullAdd(GenRegister high1, GenRegister low1, GenRegister high2, GenRegister low2);
+    void I32FullMult(GenRegister high, GenRegister low, GenRegister src0, GenRegister src1);
+    void I64FullMult(GenRegister dst1, GenRegister dst2, GenRegister dst3, GenRegister dst4, GenRegister x_high, GenRegister x_low, GenRegister y_high, GenRegister y_low);
+    void saveFlag(GenRegister dest, int flag, int subFlag);
+    void UnsignedI64ToFloat(GenRegister dst, GenRegister high, GenRegister low, GenRegister exp, GenRegister mantissa, GenRegister tmp, GenRegister flag);
+
+    /*! Final Gen ISA emission helper functions */
+    void emitLabelInstruction(const SelectionInstruction &insn);
+    void emitUnaryInstruction(const SelectionInstruction &insn);
+    void emitUnaryWithTempInstruction(const SelectionInstruction &insn);
+    void emitBinaryInstruction(const SelectionInstruction &insn);
+    void emitBinaryWithTempInstruction(const SelectionInstruction &insn);
+    void emitTernaryInstruction(const SelectionInstruction &insn);
+    void emitI64MULHIInstruction(const SelectionInstruction &insn);
+    void emitI64MADSATInstruction(const SelectionInstruction &insn);
+    void emitI64HADDInstruction(const SelectionInstruction &insn);
+    void emitI64RHADDInstruction(const SelectionInstruction &insn);
+    void emitI64ShiftInstruction(const SelectionInstruction &insn);
+    void emitI64CompareInstruction(const SelectionInstruction &insn);
+    void emitI64SATADDInstruction(const SelectionInstruction &insn);
+    void emitI64SATSUBInstruction(const SelectionInstruction &insn);
+    void emitI64ToFloatInstruction(const SelectionInstruction &insn);
+    void emitFloatToI64Instruction(const SelectionInstruction &insn);
+    void emitCompareInstruction(const SelectionInstruction &insn);
+    void emitJumpInstruction(const SelectionInstruction &insn);
+    void emitIndirectMoveInstruction(const SelectionInstruction &insn);
+    void emitEotInstruction(const SelectionInstruction &insn);
+    void emitNoOpInstruction(const SelectionInstruction &insn);
+    void emitWaitInstruction(const SelectionInstruction &insn);
+    void emitBarrierInstruction(const SelectionInstruction &insn);
+    void emitFenceInstruction(const SelectionInstruction &insn);
+    void emitMathInstruction(const SelectionInstruction &insn);
+    void emitRead64Instruction(const SelectionInstruction &insn);
+    void emitWrite64Instruction(const SelectionInstruction &insn);
+    void emitUntypedReadInstruction(const SelectionInstruction &insn);
+    void emitUntypedWriteInstruction(const SelectionInstruction &insn);
+    void emitAtomicInstruction(const SelectionInstruction &insn);
+    void emitByteGatherInstruction(const SelectionInstruction &insn);
+    void emitByteScatterInstruction(const SelectionInstruction &insn);
+    void emitPackByteInstruction(const SelectionInstruction &insn);
+    void emitUnpackByteInstruction(const SelectionInstruction &insn);
+    void emitDWordGatherInstruction(const SelectionInstruction &insn);
+    void emitSampleInstruction(const SelectionInstruction &insn);
+    void emitTypedWriteInstruction(const SelectionInstruction &insn);
+    void emitSpillRegInstruction(const SelectionInstruction &insn);
+    void emitUnSpillRegInstruction(const SelectionInstruction &insn);
+    void emitGetImageInfoInstruction(const SelectionInstruction &insn);
+    void emitI64MULInstruction(const SelectionInstruction &insn);
+    void emitI64DIVREMInstruction(const SelectionInstruction &insn);
+    void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
+    void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
+
+    /*! Implements base class */
+    virtual Kernel *allocateKernel(void);
+    /*! Store the position of each label instruction in the Gen ISA stream */
+    map<ir::LabelIndex, uint32_t> labelPos;
+    typedef struct LabelPair {
+      LabelPair(ir::LabelIndex l0, ir::LabelIndex l1) :
+                l0(l0), l1(l1){};
+      ir::LabelIndex l0;
+      ir::LabelIndex l1;
+    } LabelPair;
+    /*! Store the Gen instructions to patch */
+    vector<std::pair<LabelPair, uint32_t>> branchPos3;
+    vector<std::pair<ir::LabelIndex, uint32_t>> branchPos2;
+
+    void insertJumpPos(const SelectionInstruction &insn);
+    /*! Encode Gen ISA */
+    GenEncoder *p;
+    /*! Instruction selection on Gen ISA (pre-register allocation) */
+    Selection *sel;
+    /*! Perform the register allocation */
+    GenRegAllocator *ra;
+    /*! Indicate if we need to tackle a register pressure issue when
+     * regenerating the code
+     */
+    uint32_t reservedSpillRegs;
+    bool limitRegisterPressure;
+    bool relaxMath;
+    const bool getIFENDIFFix(void) const { return ifEndifFix; }
+    void setIFENDIFFix(bool fix) { ifEndifFix = fix; }
+    const CompileErrorCode getErrCode() { return errCode; }
+
+  protected:
+    virtual GenEncoder* generateEncoder(void) {
+      return GBE_NEW(GenEncoder, this->simdWidth, 7, deviceID);
+    }
+    /*! Allocate a new curbe register and insert it into the curbe pool. */
+    void allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue = 0);
+
+  private:
+    CompileErrorCode errCode;
+    bool ifEndifFix;
+    /*! Build the curbe patch list for the given kernel */
+    void buildPatchList(void);
+    /*! Compute the group's SLM offset from R0.0, to work around the HSW SLM bug */
+    virtual void emitSLMOffset(void) { };
+    /*! Allocate the group's SLM offset in the curbe (HSW only) */
+    virtual void allocSLMOffsetCurbe(void) { };
+    /*! Create the device-specific instruction selection */
+    virtual void newSelection(void);
+    friend class GenRegAllocator;               //!< need to access errCode directly.
+
+  };
+
+} /* namespace gbe */
+
+#endif /* __GBE_GEN_CONTEXT_HPP__ */
+
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
new file mode 100644
index 0000000..f0da50a
--- /dev/null
+++ b/backend/src/backend/gen_defs.hpp
@@ -0,0 +1,974 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith at tungstengraphics.com>
+  */
+
+#ifndef __GEN_DEFS_HPP__
+#define __GEN_DEFS_HPP__
+
+#include <stdint.h>
+
+/////////////////////////////////////////////////////////////////////////////
+// Gen EU defines
+/////////////////////////////////////////////////////////////////////////////
+
+/* Execution Unit (EU) defines */
+#define GEN_ALIGN_1   0
+#define GEN_ALIGN_16  1
+
+#define GEN_REG_SIZE 32
+
+#define GEN_ADDRESS_DIRECT                        0
+#define GEN_ADDRESS_REGISTER_INDIRECT_REGISTER    1
+
+#define GEN_CHANNEL_X     0
+#define GEN_CHANNEL_Y     1
+#define GEN_CHANNEL_Z     2
+#define GEN_CHANNEL_W     3
+
+#define GEN_COMPRESSION_Q1  0
+#define GEN_COMPRESSION_Q2  1
+#define GEN_COMPRESSION_Q3  2
+#define GEN_COMPRESSION_Q4  3
+#define GEN_COMPRESSION_H1  0
+#define GEN_COMPRESSION_H2  2
+
+#define GEN_CONDITIONAL_NONE  0
+#define GEN_CONDITIONAL_Z     1
+#define GEN_CONDITIONAL_NZ    2
+#define GEN_CONDITIONAL_EQ    1 /* Z */
+#define GEN_CONDITIONAL_NEQ   2 /* NZ */
+#define GEN_CONDITIONAL_G     3
+#define GEN_CONDITIONAL_GE    4
+#define GEN_CONDITIONAL_L     5
+#define GEN_CONDITIONAL_LE    6
+#define GEN_CONDITIONAL_R     7
+#define GEN_CONDITIONAL_O     8
+#define GEN_CONDITIONAL_U     9
+
+#define GEN_DEBUG_NONE        0
+#define GEN_DEBUG_BREAKPOINT  1
+
+#define GEN_DEPENDENCY_NORMAL         0
+#define GEN_DEPENDENCY_NOTCLEARED     1
+#define GEN_DEPENDENCY_NOTCHECKED     2
+#define GEN_DEPENDENCY_DISABLE        3
+
+#define GEN_HORIZONTAL_STRIDE_0   0
+#define GEN_HORIZONTAL_STRIDE_1   1
+#define GEN_HORIZONTAL_STRIDE_2   2
+#define GEN_HORIZONTAL_STRIDE_4   3
+
+#define GEN_INSTRUCTION_NORMAL    0
+#define GEN_INSTRUCTION_SATURATE  1
+
+#define GEN_MASK_ENABLE   0
+#define GEN_MASK_DISABLE  1
+
+/*! Gen opcode */
+enum opcode {
+  GEN_OPCODE_MOV = 1,
+  GEN_OPCODE_SEL = 2,
+  GEN_OPCODE_NOT = 4,
+  GEN_OPCODE_AND = 5,
+  GEN_OPCODE_OR = 6,
+  GEN_OPCODE_XOR = 7,
+  GEN_OPCODE_SHR = 8,
+  GEN_OPCODE_SHL = 9,
+  GEN_OPCODE_RSR = 10,
+  GEN_OPCODE_RSL = 11,
+  GEN_OPCODE_ASR = 12,
+  GEN_OPCODE_CMP = 16,
+  GEN_OPCODE_CMPN = 17,
+  GEN_OPCODE_F32TO16 = 19,
+  GEN_OPCODE_F16TO32 = 20,
+  GEN_OPCODE_JMPI = 32,
+  GEN_OPCODE_BRD = 33,
+  GEN_OPCODE_IF = 34,
+  GEN_OPCODE_BRC = 35,
+  GEN_OPCODE_ELSE = 36,
+  GEN_OPCODE_ENDIF = 37,
+  GEN_OPCODE_DO = 38,
+  GEN_OPCODE_WHILE = 39,
+  GEN_OPCODE_BREAK = 40,
+  GEN_OPCODE_CONTINUE = 41,
+  GEN_OPCODE_HALT = 42,
+  GEN_OPCODE_MSAVE = 44,
+  GEN_OPCODE_MRESTORE = 45,
+  GEN_OPCODE_PUSH = 46,
+  GEN_OPCODE_POP = 47,
+  GEN_OPCODE_WAIT = 48,
+  GEN_OPCODE_SEND = 49,
+  GEN_OPCODE_SENDC = 50,
+  GEN_OPCODE_MATH = 56,
+  GEN_OPCODE_ADD = 64,
+  GEN_OPCODE_MUL = 65,
+  GEN_OPCODE_AVG = 66,
+  GEN_OPCODE_FRC = 67,
+  GEN_OPCODE_RNDU = 68,
+  GEN_OPCODE_RNDD = 69,
+  GEN_OPCODE_RNDE = 70,
+  GEN_OPCODE_RNDZ = 71,
+  GEN_OPCODE_MAC = 72,
+  GEN_OPCODE_MACH = 73,
+  GEN_OPCODE_LZD = 74,
+  GEN_OPCODE_FBH = 75,
+  GEN_OPCODE_FBL = 76,
+  GEN_OPCODE_ADDC = 78,
+  GEN_OPCODE_SUBB = 79,
+  GEN_OPCODE_SAD2 = 80,
+  GEN_OPCODE_SADA2 = 81,
+  GEN_OPCODE_DP4 = 84,
+  GEN_OPCODE_DPH = 85,
+  GEN_OPCODE_DP3 = 86,
+  GEN_OPCODE_DP2 = 87,
+  GEN_OPCODE_DPA2 = 88,
+  GEN_OPCODE_LINE = 89,
+  GEN_OPCODE_PLN = 90,
+  GEN_OPCODE_MAD = 91,
+  GEN_OPCODE_NOP = 126,
+};
+
+#define GEN_ATOMIC_SIMD16   0
+#define GEN_ATOMIC_SIMD8    1
+
+enum GenAtomicOpCode {
+  GEN_ATOMIC_OP_CMPWR8B   = 0,
+  GEN_ATOMIC_OP_AND       = 1,
+  GEN_ATOMIC_OP_OR        = 2,
+  GEN_ATOMIC_OP_XOR       = 3,
+  GEN_ATOMIC_OP_MOV       = 4,
+  GEN_ATOMIC_OP_INC       = 5,
+  GEN_ATOMIC_OP_DEC       = 6,
+  GEN_ATOMIC_OP_ADD       = 7,
+  GEN_ATOMIC_OP_SUB       = 8,
+  GEN_ATOMIC_OP_REVSUB    = 9,
+  GEN_ATOMIC_OP_IMAX      = 10,
+  GEN_ATOMIC_OP_IMIN      = 11,
+  GEN_ATOMIC_OP_UMAX      = 12,
+  GEN_ATOMIC_OP_UMIN      = 13,
+  GEN_ATOMIC_OP_CMPWR     = 14,
+  GEN_ATOMIC_OP_PREDEC    = 15
+};
+
+/*! Gen SFID */
+enum GenMessageTarget {
+  GEN_SFID_NULL                     = 0,
+  GEN_SFID_MATH                     = 1,
+  GEN_SFID_SAMPLER                  = 2,
+  GEN_SFID_MESSAGE_GATEWAY          = 3,
+  GEN_SFID_DATAPORT_READ            = 4,
+  GEN_SFID_DATAPORT_WRITE           = 5,
+  GEN_SFID_URB                      = 6,
+  GEN_SFID_THREAD_SPAWNER           = 7,
+  GEN6_SFID_DATAPORT_SAMPLER_CACHE  = 4,
+  GEN6_SFID_DATAPORT_RENDER_CACHE   = 5,
+  GEN6_SFID_DATAPORT_CONSTANT_CACHE = 9,
+  GEN_SFID_DATAPORT_DATA_CACHE      = 10,
+  GEN_SFID_DATAPORT1_DATA_CACHE     = 12,
+};
+
+#define GEN_PREDICATE_NONE                    0
+#define GEN_PREDICATE_NORMAL                  1
+#define GEN_PREDICATE_ALIGN1_ANYV             2
+#define GEN_PREDICATE_ALIGN1_ALLV             3
+#define GEN_PREDICATE_ALIGN1_ANY2H            4
+#define GEN_PREDICATE_ALIGN1_ALL2H            5
+#define GEN_PREDICATE_ALIGN1_ANY4H            6
+#define GEN_PREDICATE_ALIGN1_ALL4H            7
+#define GEN_PREDICATE_ALIGN1_ANY8H            8
+#define GEN_PREDICATE_ALIGN1_ALL8H            9
+#define GEN_PREDICATE_ALIGN1_ANY16H           10
+#define GEN_PREDICATE_ALIGN1_ALL16H           11
+#define GEN_PREDICATE_ALIGN16_REPLICATE_X     2
+#define GEN_PREDICATE_ALIGN16_REPLICATE_Y     3
+#define GEN_PREDICATE_ALIGN16_REPLICATE_Z     4
+#define GEN_PREDICATE_ALIGN16_REPLICATE_W     5
+#define GEN_PREDICATE_ALIGN16_ANY4H           6
+#define GEN_PREDICATE_ALIGN16_ALL4H           7
+
+#define GEN_ARCHITECTURE_REGISTER_FILE        0
+#define GEN_GENERAL_REGISTER_FILE             1
+#define GEN_IMMEDIATE_VALUE                   3
+
+#define GEN_TYPE_UD  0
+#define GEN_TYPE_D   1
+#define GEN_TYPE_UW  2
+#define GEN_TYPE_W   3
+#define GEN_TYPE_UB  4
+#define GEN_TYPE_B   5
+#define GEN_TYPE_VF  5 /* packed float vector, immediates only? */
+#define GEN_TYPE_HF  6
+#define GEN_TYPE_V   6 /* packed int vector, immediates only, uword dest only */
+#define GEN_TYPE_DF  6
+#define GEN_TYPE_F   7
+#define GEN_TYPE_UL  8
+#define GEN_TYPE_L   9
+
+#define GEN_ARF_NULL                  0x00
+#define GEN_ARF_ADDRESS               0x10
+#define GEN_ARF_ACCUMULATOR           0x20
+#define GEN_ARF_FLAG                  0x30
+#define GEN_ARF_MASK                  0x40
+#define GEN_ARF_MASK_STACK            0x50
+#define GEN_ARF_MASK_STACK_DEPTH      0x60
+#define GEN_ARF_STATE                 0x70
+#define GEN_ARF_CONTROL               0x80
+#define GEN_ARF_NOTIFICATION_COUNT    0x90
+#define GEN_ARF_IP                    0xA0
+
+#define GEN_MRF_COMPR4   (1 << 7)
+
+#define GEN_AMASK   0
+#define GEN_IMASK   1
+#define GEN_LMASK   2
+#define GEN_CMASK   3
+
+#define GEN_THREAD_NORMAL     0
+#define GEN_THREAD_ATOMIC     1
+#define GEN_THREAD_SWITCH     2
+
+#define GEN_VERTICAL_STRIDE_0                 0
+#define GEN_VERTICAL_STRIDE_1                 1
+#define GEN_VERTICAL_STRIDE_2                 2
+#define GEN_VERTICAL_STRIDE_4                 3
+#define GEN_VERTICAL_STRIDE_8                 4
+#define GEN_VERTICAL_STRIDE_16                5
+#define GEN_VERTICAL_STRIDE_32                6
+#define GEN_VERTICAL_STRIDE_64                7
+#define GEN_VERTICAL_STRIDE_128               8
+#define GEN_VERTICAL_STRIDE_256               9
+#define GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL   0xF
+
+/* Execution width */
+#define GEN_WIDTH_1       0
+#define GEN_WIDTH_2       1
+#define GEN_WIDTH_4       2
+#define GEN_WIDTH_8       3
+#define GEN_WIDTH_16      4
+#define GEN_WIDTH_32      5
+
+/* Channels to enable for the untyped reads and writes */
+#define GEN_UNTYPED_RED   (1 << 0)
+#define GEN_UNTYPED_GREEN (1 << 1)
+#define GEN_UNTYPED_BLUE  (1 << 2)
+#define GEN_UNTYPED_ALPHA (1 << 3)
+
+/* SIMD mode for untyped reads and writes */
+#define GEN_UNTYPED_SIMD4x2 0
+#define GEN_UNTYPED_SIMD16  1
+#define GEN_UNTYPED_SIMD8   2
+
+/* SIMD mode for byte scatters / gathers */
+#define GEN_BYTE_SCATTER_SIMD8    0
+#define GEN_BYTE_SCATTER_SIMD16   1
+
+/* Data port message type for gen7*/
+#define GEN7_OBLOCK_READ           0 //0000: OWord Block Read
+#define GEN7_UNALIGNED_OBLOCK_READ 1 //0001: Unaligned OWord Block Read
+#define GEN7_ODBLOCK_READ          2 //0010: OWord Dual Block Read
+#define GEN7_DWORD_GATHER          3 //0011: DWord Scattered Read
+#define GEN7_BYTE_GATHER           4 //0100: Byte Scattered Read
+#define GEN7_UNTYPED_READ          5 //0101: Untyped Surface Read
+#define GEN7_UNTYPED_ATOMIC_READ   6 //0110: Untyped Atomic Operation
+#define GEN7_MEMORY_FENCE          7 //0111: Memory Fence
+#define GEN7_OBLOCK_WRITE          8 //1000: OWord Block Write
+#define GEN7_ODBLOCK_WRITE         10//1010: OWord Dual Block Write
+#define GEN7_DWORD_SCATTER         11//1011: DWord Scattered Write
+#define GEN7_BYTE_SCATTER          12//1100: Byte Scattered Write
+#define GEN7_UNTYPED_WRITE         13//1101: Untyped Surface Write
+
+/* Data port0 message type for Gen75 */
+#define GEN75_P0_OBLOCK_READ            0 //0000: OWord Block Read
+#define GEN75_P0_UNALIGNED_OBLOCK_READ  1 //0001: Unaligned OWord Block Read
+#define GEN75_P0_ODBLOCK_READ           2 //0010: OWord Dual Block Read
+#define GEN75_P0_DWORD_GATHER           3 //0011: DWord Scattered Read
+#define GEN75_P0_BYTE_GATHER            4 //0100: Byte Scattered Read
+#define GEN75_P0_MEMORY_FENCE           7 //0111: Memory Fence
+#define GEN75_P0_OBLOCK_WRITE           8 //1000: OWord Block Write
+#define GEN75_P0_ODBLOCK_WRITE         10 //1010: OWord Dual Block Write
+#define GEN75_P0_DWORD_SCATTER         11 //1011: DWord Scattered Write
+#define GEN75_P0_BYTE_SCATTER          12 //1100: Byte Scattered Write
+
+/* Data port1 message type for Gen75 */
+#define GEN75_P1_UNTYPED_READ           1 //0001: Untyped Surface Read
+#define GEN75_P1_UNTYPED_ATOMIC_OP      2 //0010: Untyped Atomic Operation
+#define GEN75_P1_UNTYPED_ATOMIC_OP_4X2  3 //0011: Untyped Atomic Operation SIMD4x2
+#define GEN75_P1_MEDIA_BREAD            4 //0100: Media Block Read
+#define GEN75_P1_TYPED_SURFACE_READ     5 //0101: Typed Surface Read
+#define GEN75_P1_TYPED_ATOMIC_OP        6 //0110: Typed Atomic Operation
+#define GEN75_P1_TYPED_ATOMIC_OP_4X2    7 //0111: Typed Atomic Operation SIMD4x2
+#define GEN75_P1_UNTYPED_SURFACE_WRITE  9 //1001: Untyped Surface Write
+#define GEN75_P1_MEDIA_TYPED_BWRITE    10 //1010: Media Block Write
+#define GEN75_P1_ATOMIC_COUNTER        11 //1011: Atomic Counter Operation
+#define GEN75_P1_ATOMIC_COUNTER_4X2    12 //1100: Atomic Counter Operation 4X2
+#define GEN75_P1_TYPED_SURFACE_WRITE   13 //1101: Typed Surface Write
+
+/* Data port data cache scratch messages */
+#define GEN_SCRATCH_READ                  0
+#define GEN_SCRATCH_WRITE                 1
+#define GEN_SCRATCH_CHANNEL_MODE_OWORD    0
+#define GEN_SCRATCH_CHANNEL_MODE_DWORD    1
+#define GEN_SCRATCH_BLOCK_SIZE_1          0
+#define GEN_SCRATCH_BLOCK_SIZE_2          1
+#define GEN_SCRATCH_BLOCK_SIZE_4          3
+
+/* Data port render cache message type */
+#define GEN_MBLOCK_READ           4  //0100: Media Block Read
+#define GEN_TYPED_READ            5  //0101: Typed Surface Read
+#define GEN_TYPED_ATOMIC          6  //0110: Typed Atomic Operation
+#define GEN_MEM_FENCE             7  //0111: Memory Fence
+#define GEN_MBLOCK_WRITE          10 //1010: Media Block Write
+#define GEN_RENDER_WRITE          12 //1100: Render Target Write
+#define GEN_TYPED_WRITE           13 //1101: Typed Surface Write
+
+/* For byte scatters and gathers, the element to write */
+#define GEN_BYTE_SCATTER_BYTE   0
+#define GEN_BYTE_SCATTER_WORD   1
+#define GEN_BYTE_SCATTER_DWORD  2
+#define GEN_BYTE_SCATTER_QWORD  3
+
+/* dword scattered rw */
+#define GEN_DWORD_SCATTER_8_DWORDS   2
+#define GEN_DWORD_SCATTER_16_DWORDS  3
+
+#define GEN_SAMPLER_RETURN_FORMAT_FLOAT32     0
+#define GEN_SAMPLER_RETURN_FORMAT_UINT32      2
+#define GEN_SAMPLER_RETURN_FORMAT_SINT32      3
+
+#define GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE              0
+#define GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE             0
+#define GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS        0
+#define GEN_SAMPLER_MESSAGE_SIMD8_KILLPIX             1
+#define GEN_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD        1
+#define GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD         1
+#define GEN_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS  2
+#define GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS    2
+#define GEN_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE    0
+#define GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE     2
+#define GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE 0
+#define GEN_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE 1
+#define GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE  1
+#define GEN_SAMPLER_MESSAGE_SIMD4X2_RESINFO           2
+#define GEN_SAMPLER_MESSAGE_SIMD16_RESINFO            2
+#define GEN_SAMPLER_MESSAGE_SIMD4X2_LD                7
+#define GEN_SAMPLER_MESSAGE_SIMD8_LD                  7
+#define GEN_SAMPLER_MESSAGE_SIMD16_LD                 7
+
+#define GEN5_SAMPLER_MESSAGE_SAMPLE              0
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS         1
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD          2
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE      3
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS       4
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE 5
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE  6
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_LD           7
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO      10
+
+/* for GEN5 only */
+#define GEN_SAMPLER_SIMD_MODE_SIMD4X2                   0
+#define GEN_SAMPLER_SIMD_MODE_SIMD8                     1
+#define GEN_SAMPLER_SIMD_MODE_SIMD16                    2
+#define GEN_SAMPLER_SIMD_MODE_SIMD32_64                 3
+
+#define GEN_MATH_FUNCTION_INV                              1
+#define GEN_MATH_FUNCTION_LOG                              2
+#define GEN_MATH_FUNCTION_EXP                              3
+#define GEN_MATH_FUNCTION_SQRT                             4
+#define GEN_MATH_FUNCTION_RSQ                              5
+#define GEN_MATH_FUNCTION_SIN                              6 /* was 7 */
+#define GEN_MATH_FUNCTION_COS                              7 /* was 8 */
+#define GEN_MATH_FUNCTION_FDIV                             9 /* gen6+ */
+#define GEN_MATH_FUNCTION_POW                              10
+#define GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER   11
+#define GEN_MATH_FUNCTION_INT_DIV_QUOTIENT                 12
+#define GEN_MATH_FUNCTION_INT_DIV_REMAINDER                13
+
+#define GEN_MATH_INTEGER_UNSIGNED     0
+#define GEN_MATH_INTEGER_SIGNED       1
+
+#define GEN_MATH_PRECISION_FULL        0
+#define GEN_MATH_PRECISION_PARTIAL     1
+
+#define GEN_MATH_SATURATE_NONE         0
+#define GEN_MATH_SATURATE_SATURATE     1
+
+#define GEN_MATH_DATA_VECTOR  0
+#define GEN_MATH_DATA_SCALAR  1
+
+#define GEN_DEREFERENCE_URB 0
+#define GEN_DO_NOT_DEREFERENCE_URB 1
+
+#define GEN_MAX_NUM_BUFFER_ENTRIES (1 << 27)
+
+/* Message gateway */
+#define GEN_OPEN_GATEWAY          0b000
+#define GEN_CLOSE_GATEWAY         0b001
+#define GEN_FORWARD_MSG           0b010
+#define GEN_GET_TIME_STAMP        0b011
+#define GEN_BARRIER_MSG           0b100
+#define GEN_UPDATE_GATEWAY_STATE  0b101
+#define GEN_MMIO_READ_WRITE       0b110
+
+/////////////////////////////////////////////////////////////////////////////
+// Gen EU structures
+/////////////////////////////////////////////////////////////////////////////
+
+/** Number of general purpose registers (VS, WM, etc) */
+#define GEN_MAX_GRF 128
+
+/* Instruction format for the execution units */
+
+struct GenInstruction {
+  uint32_t low;
+  uint32_t high;
+};
+
+union GenCompactInstruction {
+  struct GenInstruction low;
+  struct {
+    struct {
+      uint32_t opcode:7;
+      uint32_t debug_control:1;
+      uint32_t control_index:5;
+      uint32_t data_type_index:5;
+      uint32_t sub_reg_index:5;
+      uint32_t acc_wr_control:1;
+      uint32_t destreg_or_condmod:4;
+      uint32_t pad:1;
+      uint32_t cmpt_control:1;
+      uint32_t src0_index_lo:2;
+    } bits1;
+    struct {
+      uint32_t src0_index_hi:3;
+      uint32_t src1_index:5;
+      uint32_t dest_reg_nr:8;
+      uint32_t src0_reg_nr:8;
+      uint32_t src1_reg_nr:8;
+    } bits2;
+  };
+};
+
+union GenNativeInstruction
+{
+  struct {
+    struct GenInstruction low;
+    struct GenInstruction high;
+  };
+  struct {
+    struct {
+      uint32_t opcode:7;
+      uint32_t pad:1;
+      uint32_t access_mode:1;
+      uint32_t mask_control:1;
+      uint32_t dependency_control:2;
+      uint32_t quarter_control:2;
+      uint32_t thread_control:2;
+      uint32_t predicate_control:4;
+      uint32_t predicate_inverse:1;
+      uint32_t execution_size:3;
+      uint32_t destreg_or_condmod:4;
+      uint32_t acc_wr_control:1;
+      uint32_t cmpt_control:1;
+      uint32_t debug_control:1;
+      uint32_t saturate:1;
+    } header;
+
+    union {
+      struct {
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:3;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:3;
+        uint32_t src1_reg_file:2;
+        uint32_t src1_reg_type:3;
+        uint32_t nib_ctrl:1;
+        uint32_t dest_subreg_nr:5;
+        uint32_t dest_reg_nr:8;
+        uint32_t dest_horiz_stride:2;
+        uint32_t dest_address_mode:1;
+      } da1;
+
+      struct {
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:3;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:3;
+        uint32_t src1_reg_file:2;        /* 0x00000c00 */
+        uint32_t src1_reg_type:3;        /* 0x00007000 */
+        uint32_t nib_ctrl:1;
+        int dest_indirect_offset:10;        /* offset against the deref'd address reg */
+        uint32_t dest_subreg_nr:3; /* subnr for the address reg a0.x */
+        uint32_t dest_horiz_stride:2;
+        uint32_t dest_address_mode:1;
+      } ia1;
+
+      struct {
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:3;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:3;
+        uint32_t src1_reg_file:2;
+        uint32_t src1_reg_type:3;
+        uint32_t nib_ctrl:1;
+        uint32_t dest_writemask:4;
+        uint32_t dest_subreg_nr:1;
+        uint32_t dest_reg_nr:8;
+        uint32_t dest_horiz_stride:2;
+        uint32_t dest_address_mode:1;
+      } da16;
+
+      struct {
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:3;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:3;
+        uint32_t nib_ctrl:1;
+        uint32_t dest_writemask:4;
+        int dest_indirect_offset:6;
+        uint32_t dest_subreg_nr:3;
+        uint32_t dest_horiz_stride:2;
+        uint32_t dest_address_mode:1;
+      } ia16;
+
+      struct {
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:3;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:3;
+        uint32_t src1_reg_file:2;
+        uint32_t src1_reg_type:3;
+        uint32_t pad:1;
+        int jump_count:16;
+      } branch_gen6;
+
+      struct {
+        uint32_t dest_reg_file:1;
+        uint32_t flag_subreg_num:1;
+        uint32_t pad0:2;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t src2_abs:1;
+        uint32_t src2_negate:1;
+        uint32_t pad1:7;
+        uint32_t dest_writemask:4;
+        uint32_t dest_subreg_nr:3;
+        uint32_t dest_reg_nr:8;
+      } da3src;
+    } bits1;
+
+    union {
+      struct {
+        uint32_t src0_subreg_nr:5;
+        uint32_t src0_reg_nr:8;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src0_address_mode:1;
+        uint32_t src0_horiz_stride:2;
+        uint32_t src0_width:3;
+        uint32_t src0_vert_stride:4;
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t pad:5;
+      } da1;
+
+      struct {
+        int src0_indirect_offset:10;
+        uint32_t src0_subreg_nr:3;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src0_address_mode:1;
+        uint32_t src0_horiz_stride:2;
+        uint32_t src0_width:3;
+        uint32_t src0_vert_stride:4;
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t pad:5;
+      } ia1;
+
+      struct {
+        uint32_t src0_swz_x:2;
+        uint32_t src0_swz_y:2;
+        uint32_t src0_subreg_nr:1;
+        uint32_t src0_reg_nr:8;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src0_address_mode:1;
+        uint32_t src0_swz_z:2;
+        uint32_t src0_swz_w:2;
+        uint32_t pad0:1;
+        uint32_t src0_vert_stride:4;
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t pad:5;
+      } da16;
+
+      struct {
+        uint32_t src0_swz_x:2;
+        uint32_t src0_swz_y:2;
+        int src0_indirect_offset:6;
+        uint32_t src0_subreg_nr:3;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src0_address_mode:1;
+        uint32_t src0_swz_z:2;
+        uint32_t src0_swz_w:2;
+        uint32_t pad0:1;
+        uint32_t src0_vert_stride:4;
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t pad:5;
+      } ia16;
+
+      struct {
+        uint32_t src0_rep_ctrl:1;
+        uint32_t src0_swizzle:8;
+        uint32_t src0_subreg_nr:3;
+        uint32_t src0_reg_nr:8;
+        uint32_t pad0:1;
+        uint32_t src1_rep_ctrl:1;
+        uint32_t src1_swizzle:8;
+        uint32_t src1_subreg_nr_low:2;
+      } da3src;
+    } bits2;
+
+    union {
+      struct {
+        uint32_t src1_subreg_nr:5;
+        uint32_t src1_reg_nr:8;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t src1_address_mode:1;
+        uint32_t src1_horiz_stride:2;
+        uint32_t src1_width:3;
+        uint32_t src1_vert_stride:4;
+        uint32_t pad0:7;
+      } da1;
+
+      struct {
+        uint32_t src1_swz_x:2;
+        uint32_t src1_swz_y:2;
+        uint32_t src1_subreg_nr:1;
+        uint32_t src1_reg_nr:8;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t src1_address_mode:1;
+        uint32_t src1_swz_z:2;
+        uint32_t src1_swz_w:2;
+        uint32_t pad1:1;
+        uint32_t src1_vert_stride:4;
+        uint32_t pad2:7;
+      } da16;
+
+      struct {
+        int  src1_indirect_offset:10;
+        uint32_t src1_subreg_nr:3;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t src1_address_mode:1;
+        uint32_t src1_horiz_stride:2;
+        uint32_t src1_width:3;
+        uint32_t src1_vert_stride:4;
+        uint32_t pad1:7;
+      } ia1;
+
+      struct {
+        uint32_t src1_swz_x:2;
+        uint32_t src1_swz_y:2;
+        int  src1_indirect_offset:6;
+        uint32_t src1_subreg_nr:3;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t pad0:1;
+        uint32_t src1_swz_z:2;
+        uint32_t src1_swz_w:2;
+        uint32_t pad1:1;
+        uint32_t src1_vert_stride:4;
+        uint32_t pad2:7;
+      } ia16;
+
+      struct {
+        uint32_t function_control:19;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } generic_gen5;
+
+      struct {
+        uint32_t sub_function_id:3;
+        uint32_t pad0:11;
+        uint32_t ack_req:1;
+        uint32_t notify:2;
+        uint32_t pad1:2;
+        uint32_t header:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } msg_gateway;
+
+      struct {
+        uint32_t opcode:1;
+        uint32_t request:1;
+        uint32_t pad0:2;
+        uint32_t resource:1;
+        uint32_t pad1:14;
+        uint32_t header:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } spawner_gen5;
+
+      /** Ironlake PRM, Volume 4 Part 1, Section 6.1.1.1 */
+      struct {
+        uint32_t function:4;
+        uint32_t int_type:1;
+        uint32_t precision:1;
+        uint32_t saturate:1;
+        uint32_t data_type:1;
+        uint32_t snapshot:1;
+        uint32_t pad0:10;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } math_gen5;
+
+      struct {
+        uint32_t bti:8;
+        uint32_t sampler:4;
+        uint32_t msg_type:5;
+        uint32_t simd_mode:2;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } sampler_gen7;
+
+      /**
+       * Message for the Sandybridge Sampler Cache or Constant Cache Data Port.
+       *
+       * See the Sandybridge PRM, Volume 4 Part 1, Section 3.9.2.1.1.
+       **/
+      struct {
+        uint32_t bti:8;
+        uint32_t msg_control:5;
+        uint32_t msg_type:3;
+        uint32_t pad0:3;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } gen6_dp_sampler_const_cache;
+
+      /*! Data port untyped read / write messages */
+      struct {
+        uint32_t bti:8;
+        uint32_t rgba:4;
+        uint32_t simd_mode:2;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_untyped_rw;
+
+      /*! Data port byte scatter / gather */
+      struct {
+        uint32_t bti:8;
+        uint32_t simd_mode:1;
+        uint32_t ignored0:1;
+        uint32_t data_size:2;
+        uint32_t ignored1:2;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_byte_rw;
+
+      /*! Data port scratch read / write */
+      struct {
+        uint32_t offset:12;
+        uint32_t block_size:2;
+        uint32_t ignored0:1;
+        uint32_t invalidate_after_read:1;
+        uint32_t channel_mode:1;
+        uint32_t msg_type:1;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_scratch_rw;
+
+      /*! Data port OBlock read / write */
+      struct {
+        uint32_t bti:8;
+        uint32_t block_size:3;
+        uint32_t ignored:2;
+        uint32_t invalidate_after_read:1;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_oblock_rw;
+
+      /*! Data port dword scatter / gather */
+      struct {
+        uint32_t bti:8;
+        uint32_t block_size:2;
+        uint32_t ignored0:3;
+        uint32_t invalidate_after_read:1;
+        uint32_t msg_type:4;
+        uint32_t ignored1:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_dword_rw;
+
+      /*! Data port typed read / write messages */
+      struct {
+        uint32_t bti:8;
+        uint32_t chan_mask:4;
+        uint32_t slot:2;
+        uint32_t msg_type:4;
+        uint32_t pad2:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad3:2;
+        uint32_t end_of_thread:1;
+      } gen7_typed_rw;
+
+      /*! Memory fence */
+      struct {
+        uint32_t bti:8;
+        uint32_t pad:5;
+        uint32_t commit_enable:1;
+        uint32_t msg_type:4;
+        uint32_t pad2:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad3:2;
+        uint32_t end_of_thread:1;
+      } gen7_memory_fence;
+
+      /*! atomic messages */
+      struct {
+        uint32_t bti:8;
+        uint32_t aop_type:4;
+        uint32_t simd_mode:1;
+        uint32_t return_data:1;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad3:2;
+        uint32_t end_of_thread:1;
+      } gen7_atomic_op;
+
+      struct {
+        uint32_t src1_subreg_nr_high:1;
+        uint32_t src1_reg_nr:8;
+        uint32_t pad0:1;
+        uint32_t src2_rep_ctrl:1;
+        uint32_t src2_swizzle:8;
+        uint32_t src2_subreg_nr:3;
+        uint32_t src2_reg_nr:8;
+        uint32_t pad1:2;
+      } da3src;
+
+      /*! Message gateway */
+      struct {
+        uint32_t subfunc:3;
+        uint32_t pad:11;
+        uint32_t ackreq:1;
+        uint32_t notify:2;
+        uint32_t pad2:2;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad3:2;
+        uint32_t end_of_thread:1;
+      } gen7_msg_gw;
+
+      struct {
+        uint32_t jip:16;
+        uint32_t uip:16;
+      } gen7_branch;
+
+      int d;
+      uint32_t ud;
+      float f;
+    } bits3;
+  };
+};
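+
+/* Layout sanity sketch: the bitfields above are meant to overlay the raw
+ * dwords exactly, so a GenInstruction is two dwords (8 bytes), a
+ * GenCompactInstruction is one 64-bit encoding and a GenNativeInstruction is
+ * one 128-bit encoding.  Assuming a C++11 compiler that packs these bitfields
+ * without padding, hypothetical checks along these lines would hold:
+ *
+ *   static_assert(sizeof(GenInstruction)        == 8,  "two dwords");
+ *   static_assert(sizeof(GenCompactInstruction) == 8,  "64-bit compact encoding");
+ *   static_assert(sizeof(GenNativeInstruction)  == 16, "128-bit native encoding");
+ */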
+
+#endif /* __GEN_DEFS_HPP__ */
+
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
new file mode 100644
index 0000000..182752a
--- /dev/null
+++ b/backend/src/backend/gen_encoder.cpp
@@ -0,0 +1,1311 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith at tungstengraphics.com>
+  */
+
+#include "backend/gen_encoder.hpp"
+#include <cstring>
+
+
+namespace gbe
+{
+  extern bool compactAlu2(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1, uint32_t condition, bool split);
+  extern bool compactAlu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src, uint32_t condition, bool split);
+  //////////////////////////////////////////////////////////////////////////
+  // Some helper functions to encode
+  //////////////////////////////////////////////////////////////////////////
+  INLINE bool isVectorOfBytes(GenRegister reg) {
+    if (reg.hstride != GEN_HORIZONTAL_STRIDE_0 &&
+        (reg.type == GEN_TYPE_UB || reg.type == GEN_TYPE_B))
+      return true;
+    else
+      return false;
+  }
+
+  INLINE bool needToSplitAlu1(GenEncoder *p, GenRegister dst, GenRegister src) {
+    if (p->curr.execWidth != 16 || src.hstride == GEN_HORIZONTAL_STRIDE_0) return false;
+    if (isVectorOfBytes(dst) == true) return true;
+    if (isVectorOfBytes(src) == true) return true;
+    return false;
+  }
+
+  INLINE bool needToSplitAlu2(GenEncoder *p, GenRegister dst, GenRegister src0, GenRegister src1) {
+    if (p->curr.execWidth != 16 ||
+         (src0.hstride == GEN_HORIZONTAL_STRIDE_0 &&
+          src1.hstride == GEN_HORIZONTAL_STRIDE_0))
+      return false;
+    if (isVectorOfBytes(dst) == true) return true;
+    if (isVectorOfBytes(src0) == true) return true;
+    if (isVectorOfBytes(src1) == true) return true;
+    return false;
+  }
+
+  INLINE bool needToSplitCmp(GenEncoder *p, GenRegister src0, GenRegister src1) {
+    if (p->curr.execWidth != 16 ||
+         (src0.hstride == GEN_HORIZONTAL_STRIDE_0 &&
+          src1.hstride == GEN_HORIZONTAL_STRIDE_0))
+      return false;
+    if (isVectorOfBytes(src0) == true) return true;
+    if (isVectorOfBytes(src1) == true) return true;
+    if (src0.type == GEN_TYPE_D || src0.type == GEN_TYPE_UD || src0.type == GEN_TYPE_F)
+      return true;
+    if (src1.type == GEN_TYPE_D || src1.type == GEN_TYPE_UD || src1.type == GEN_TYPE_F)
+      return true;
+    return false;
+  }
+
+  void GenEncoder::setMessageDescriptor(GenNativeInstruction *inst, enum GenMessageTarget sfid,
+                                        unsigned msg_length, unsigned response_length,
+                                        bool header_present, bool end_of_thread)
+  {
+     setSrc1(inst, GenRegister::immd(0));
+     inst->bits3.generic_gen5.header_present = header_present;
+     inst->bits3.generic_gen5.response_length = response_length;
+     inst->bits3.generic_gen5.msg_length = msg_length;
+     inst->bits3.generic_gen5.end_of_thread = end_of_thread;
+     inst->header.destreg_or_condmod = sfid;
+  }
+
+  void GenEncoder::setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
+                                        unsigned char msg_type, uint32_t msg_length,
+                                        bool header_present)
+  {
+    const GenMessageTarget sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
+    setMessageDescriptor(insn, sfid, msg_length, 0, header_present);
+    insn->bits3.gen7_typed_rw.bti = bti;
+    insn->bits3.gen7_typed_rw.msg_type = msg_type;
+  }
+
+  void GenEncoder::setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti,
+                                  uint32_t rgba, uint32_t msg_type,
+                                  uint32_t msg_length, uint32_t response_length)
+  {
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+    setMessageDescriptor(insn, sfid, msg_length, response_length);
+    insn->bits3.gen7_untyped_rw.msg_type = msg_type;
+    insn->bits3.gen7_untyped_rw.bti = bti;
+    insn->bits3.gen7_untyped_rw.rgba = rgba;
+    if (curr.execWidth == 8)
+      insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD8;
+    else if (curr.execWidth == 16)
+      insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD16;
+    else
+      NOT_SUPPORTED;
+  }
+
+  static void setDPByteScatterGather(GenEncoder *p,
+                                     GenNativeInstruction *insn,
+                                     uint32_t bti,
+                                     uint32_t elem_size,
+                                     uint32_t msg_type,
+                                     uint32_t msg_length,
+                                     uint32_t response_length)
+  {
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+    insn->bits3.gen7_byte_rw.msg_type = msg_type;
+    insn->bits3.gen7_byte_rw.bti = bti;
+    insn->bits3.gen7_byte_rw.data_size = elem_size;
+    if (p->curr.execWidth == 8)
+      insn->bits3.gen7_byte_rw.simd_mode = GEN_BYTE_SCATTER_SIMD8;
+    else if (p->curr.execWidth == 16)
+      insn->bits3.gen7_byte_rw.simd_mode = GEN_BYTE_SCATTER_SIMD16;
+    else
+      NOT_SUPPORTED;
+  }
+#if 0
+  static void setOBlockRW(GenEncoder *p,
+                          GenNativeInstruction *insn,
+                          uint32_t bti,
+                          uint32_t size,
+                          uint32_t msg_type,
+                          uint32_t msg_length,
+                          uint32_t response_length)
+  {
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+    assert(size == 2 || size == 4);
+    insn->bits3.gen7_oblock_rw.msg_type = msg_type;
+    insn->bits3.gen7_oblock_rw.bti = bti;
+    insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : 3;
+    insn->bits3.gen7_oblock_rw.header_present = 1;
+  }
+#endif
+
+  static void setSamplerMessage(GenEncoder *p,
+                                GenNativeInstruction *insn,
+                                unsigned char bti,
+                                unsigned char sampler,
+                                uint32_t msg_type,
+                                uint32_t response_length,
+                                uint32_t msg_length,
+                                bool header_present,
+                                uint32_t simd_mode,
+                                uint32_t return_format)
+  {
+     const GenMessageTarget sfid = GEN_SFID_SAMPLER;
+     p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+     insn->bits3.sampler_gen7.bti = bti;
+     insn->bits3.sampler_gen7.sampler = sampler;
+     insn->bits3.sampler_gen7.msg_type = msg_type;
+     insn->bits3.sampler_gen7.simd_mode = simd_mode;
+  }
+
+  static void setDWordScatterMessage(GenEncoder *p,
+                                     GenNativeInstruction *insn,
+                                     uint32_t bti,
+                                     uint32_t block_size,
+                                     uint32_t msg_type,
+                                     uint32_t msg_length,
+                                     uint32_t response_length)
+  {
+    // FIXME there is an unknown issue with the baytrail-t platform: the DWORD scatter
+    // message causes a hang in the unit test case compiler_global_constant.
+    // We work around it by using the data cache instead.
+    const GenMessageTarget sfid = (p->deviceID == PCI_CHIP_BAYTRAIL_T) ?
+                                 GEN_SFID_DATAPORT_DATA_CACHE : GEN6_SFID_DATAPORT_CONSTANT_CACHE;
+    p->setMessageDescriptor(insn, sfid, msg_length, response_length);
+    insn->bits3.gen7_dword_rw.msg_type = msg_type;
+    insn->bits3.gen7_dword_rw.bti = bti;
+    insn->bits3.gen7_dword_rw.block_size = block_size;
+    insn->bits3.gen7_dword_rw.invalidate_after_read = 0;
+  }
+  //////////////////////////////////////////////////////////////////////////
+  // Gen Emitter encoding class
+  //////////////////////////////////////////////////////////////////////////
+  GenEncoder::GenEncoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID) :
+    stateNum(0), gen(gen), deviceID(deviceID)
+  {
+    this->simdWidth = simdWidth;
+    this->curr.execWidth = simdWidth;
+    this->curr.quarterControl = GEN_COMPRESSION_Q1;
+    this->curr.noMask = 0;
+    this->curr.flag = 0;
+    this->curr.subFlag = 0;
+    this->curr.predicate = GEN_PREDICATE_NORMAL;
+    this->curr.inversePredicate = 0;
+  }
+
+  void GenEncoder::push(void) {
+    assert(stateNum < MAX_STATE_NUM);
+    stack[stateNum++] = curr;
+  }
+
+  void GenEncoder::pop(void) {
+    assert(stateNum > 0);
+    curr = stack[--stateNum];
+  }
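+
+  // push()/pop() and the public 'curr' field form a small state stack: callers
+  // save the current encoding state, tweak a few fields for the next
+  // instructions and then restore it.  A minimal usage sketch ('p' and 'tmp'
+  // are hypothetical; the pattern mirrors LOAD_DF_IMM further below):
+  //
+  //   p.push();
+  //   p.curr.execWidth = 1;                  // emit a scalar move
+  //   p.curr.noMask    = 1;
+  //   p.curr.predicate = GEN_PREDICATE_NONE;
+  //   p.MOV(GenRegister::retype(tmp, GEN_TYPE_UD), GenRegister::immud(0));
+  //   p.pop();                               // back to the previous state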
+
+  void GenEncoder::setHeader(GenNativeInstruction *insn) {
+    if (this->curr.execWidth == 8)
+      insn->header.execution_size = GEN_WIDTH_8;
+    else if (this->curr.execWidth == 16)
+      insn->header.execution_size = GEN_WIDTH_16;
+    else if (this->curr.execWidth == 4)
+      insn->header.execution_size = GEN_WIDTH_4;
+    else if (this->curr.execWidth == 1)
+      insn->header.execution_size = GEN_WIDTH_1;
+    else
+      NOT_IMPLEMENTED;
+    insn->header.acc_wr_control = this->curr.accWrEnable;
+    insn->header.quarter_control = this->curr.quarterControl;
+    insn->bits1.ia1.nib_ctrl = this->curr.nibControl;
+    insn->header.mask_control = this->curr.noMask;
+    insn->bits2.ia1.flag_reg_nr = this->curr.flag;
+    insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
+    if (this->curr.predicate != GEN_PREDICATE_NONE) {
+      insn->header.predicate_control = this->curr.predicate;
+      insn->header.predicate_inverse = this->curr.inversePredicate;
+    }
+    insn->header.saturate = this->curr.saturate;
+  }
+
+  void GenEncoder::setDst(GenNativeInstruction *insn, GenRegister dest) {
+     if (dest.file != GEN_ARCHITECTURE_REGISTER_FILE)
+        assert(dest.nr < 128);
+
+     insn->bits1.da1.dest_reg_file = dest.file;
+     insn->bits1.da1.dest_reg_type = dest.type;
+     insn->bits1.da1.dest_address_mode = dest.address_mode;
+     insn->bits1.da1.dest_reg_nr = dest.nr;
+     insn->bits1.da1.dest_subreg_nr = dest.subnr;
+     if (dest.hstride == GEN_HORIZONTAL_STRIDE_0) {
+       if (dest.type == GEN_TYPE_UB || dest.type == GEN_TYPE_B)
+         dest.hstride = GEN_HORIZONTAL_STRIDE_4;
+       else if (dest.type == GEN_TYPE_UW || dest.type == GEN_TYPE_W)
+         dest.hstride = GEN_HORIZONTAL_STRIDE_2;
+       else
+         dest.hstride = GEN_HORIZONTAL_STRIDE_1;
+     }
+     insn->bits1.da1.dest_horiz_stride = dest.hstride;
+  }
+
+  void GenEncoder::setSrc0(GenNativeInstruction *insn, GenRegister reg) {
+     if (reg.file != GEN_ARCHITECTURE_REGISTER_FILE)
+        assert(reg.nr < 128);
+
+     if (reg.address_mode == GEN_ADDRESS_DIRECT) {
+       insn->bits1.da1.src0_reg_file = reg.file;
+       insn->bits1.da1.src0_reg_type = reg.type;
+       insn->bits2.da1.src0_abs = reg.absolute;
+       insn->bits2.da1.src0_negate = reg.negation;
+       insn->bits2.da1.src0_address_mode = reg.address_mode;
+
+       if (reg.file == GEN_IMMEDIATE_VALUE) {
+          insn->bits3.ud = reg.value.ud;
+
+          /* Required to set some fields in src1 as well: */
+          insn->bits1.da1.src1_reg_file = 0; /* arf */
+          insn->bits1.da1.src1_reg_type = reg.type;
+       }
+       else {
+         if (insn->header.access_mode == GEN_ALIGN_1) {
+           insn->bits2.da1.src0_subreg_nr = reg.subnr;
+           insn->bits2.da1.src0_reg_nr = reg.nr;
+         } else {
+           insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
+           insn->bits2.da16.src0_reg_nr = reg.nr;
+         }
+
+         if (reg.width == GEN_WIDTH_1 &&
+             insn->header.execution_size == GEN_WIDTH_1) {
+           insn->bits2.da1.src0_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
+           insn->bits2.da1.src0_width = GEN_WIDTH_1;
+           insn->bits2.da1.src0_vert_stride = GEN_VERTICAL_STRIDE_0;
+         }
+         else {
+           insn->bits2.da1.src0_horiz_stride = reg.hstride;
+           insn->bits2.da1.src0_width = reg.width;
+           insn->bits2.da1.src0_vert_stride = reg.vstride;
+         }
+       }
+    } else {
+       insn->bits1.ia1.src0_reg_file = GEN_GENERAL_REGISTER_FILE;
+       insn->bits1.ia1.src0_reg_type = reg.type;
+       insn->bits2.ia1.src0_subreg_nr = 0;
+       insn->bits2.ia1.src0_indirect_offset = 0;
+       insn->bits2.ia1.src0_abs = 0;
+       insn->bits2.ia1.src0_negate = 0;
+       insn->bits2.ia1.src0_address_mode = reg.address_mode;
+       insn->bits2.ia1.src0_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
+       insn->bits2.ia1.src0_width = GEN_WIDTH_1;
+       insn->bits2.ia1.src0_vert_stride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+    }
+  }
+
+  void GenEncoder::setSrc1(GenNativeInstruction *insn, GenRegister reg) {
+     assert(reg.nr < 128);
+     assert(reg.file != GEN_ARCHITECTURE_REGISTER_FILE || reg.nr == 0);
+
+     insn->bits1.da1.src1_reg_file = reg.file;
+     insn->bits1.da1.src1_reg_type = reg.type;
+     insn->bits3.da1.src1_abs = reg.absolute;
+     insn->bits3.da1.src1_negate = reg.negation;
+
+     assert(insn->bits1.da1.src0_reg_file != GEN_IMMEDIATE_VALUE);
+
+     if (reg.file == GEN_IMMEDIATE_VALUE)
+       insn->bits3.ud = reg.value.ud;
+     else {
+       assert (reg.address_mode == GEN_ADDRESS_DIRECT);
+       if (insn->header.access_mode == GEN_ALIGN_1) {
+         insn->bits3.da1.src1_subreg_nr = reg.subnr;
+         insn->bits3.da1.src1_reg_nr = reg.nr;
+       } else {
+         insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
+         insn->bits3.da16.src1_reg_nr = reg.nr;
+       }
+
+       if (reg.width == GEN_WIDTH_1 &&
+           insn->header.execution_size == GEN_WIDTH_1) {
+         insn->bits3.da1.src1_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
+         insn->bits3.da1.src1_width = GEN_WIDTH_1;
+         insn->bits3.da1.src1_vert_stride = GEN_VERTICAL_STRIDE_0;
+       } else {
+         insn->bits3.da1.src1_horiz_stride = reg.hstride;
+         insn->bits3.da1.src1_width = reg.width;
+         insn->bits3.da1.src1_vert_stride = reg.vstride;
+       }
+     }
+  }
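+
+  // Note on the scalar special case above: when both the source width and the
+  // execution size are GEN_WIDTH_1, the operand is encoded with vertical
+  // stride 0, width 1 and horizontal stride 0 -- a <0;1,0> region in the usual
+  // Gen notation, i.e. a single replicated element.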
+
+  static const uint32_t untypedRWMask[] = {
+    GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE|GEN_UNTYPED_GREEN|GEN_UNTYPED_RED,
+    GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE|GEN_UNTYPED_GREEN,
+    GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE,
+    GEN_UNTYPED_ALPHA,
+    0
+  };
+
+  void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    assert(elemNum >= 1 && elemNum <= 4);
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+    if (this->curr.execWidth == 8) {
+      msg_length = 1;
+      response_length = elemNum;
+    } else if (this->curr.execWidth == 16) {
+      msg_length = 2;
+      response_length = 2*elemNum;
+    } else
+      NOT_IMPLEMENTED;
+
+    this->setHeader(insn);
+    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    setDPUntypedRW(insn,
+                   bti,
+                   untypedRWMask[elemNum],
+                   GEN7_UNTYPED_READ,
+                   msg_length,
+                   response_length);
+  }
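+
+  // Worked example of the sizing above (a sketch, not an exhaustive rule): an
+  // untyped read of elemNum = 4 dwords sends 1 GRF of offsets and reads back
+  // 4 GRFs in SIMD8, and sends 2 GRFs and reads back 2 * 4 = 8 GRFs in SIMD16.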
+
+  void GenEncoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    assert(elemNum >= 1 && elemNum <= 4);
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+    this->setHeader(insn);
+    if (this->curr.execWidth == 8) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+      msg_length = 1+elemNum;
+    } else if (this->curr.execWidth == 16) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+      msg_length = 2*(1+elemNum);
+    }
+    else
+      NOT_IMPLEMENTED;
+    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    setDPUntypedRW(insn,
+                   bti,
+                   untypedRWMask[elemNum],
+                   GEN7_UNTYPED_WRITE,
+                   msg_length,
+                   response_length);
+  }
+
+  void GenEncoder::BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+    if (this->curr.execWidth == 8) {
+      msg_length = 1;
+      response_length = 1;
+    } else if (this->curr.execWidth == 16) {
+      msg_length = 2;
+      response_length = 2;
+    } else
+      NOT_IMPLEMENTED;
+
+    this->setHeader(insn);
+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    setDPByteScatterGather(this,
+                           insn,
+                           bti,
+                           elemSize,
+                           GEN7_BYTE_GATHER,
+                           msg_length,
+                           response_length);
+  }
+
+  void GenEncoder::BYTE_SCATTER(GenRegister msg, uint32_t bti, uint32_t elemSize) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+    this->setHeader(insn);
+    if (this->curr.execWidth == 8) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+      msg_length = 2;
+    } else if (this->curr.execWidth == 16) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+      msg_length = 4;
+    } else
+      NOT_IMPLEMENTED;
+    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    setDPByteScatterGather(this,
+                           insn,
+                           bti,
+                           elemSize,
+                           GEN7_BYTE_SCATTER,
+                           msg_length,
+                           response_length);
+  }
+
+  void GenEncoder::DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+    uint32_t block_size = 0;
+    if (this->curr.execWidth == 8) {
+      msg_length = 1;
+      response_length = 1;
+      block_size = GEN_DWORD_SCATTER_8_DWORDS;
+    } else if (this->curr.execWidth == 16) {
+      msg_length = 2;
+      response_length = 2;
+      block_size = GEN_DWORD_SCATTER_16_DWORDS;
+    } else
+      NOT_IMPLEMENTED;
+
+    this->setHeader(insn);
+    this->setDst(insn, dst);
+    this->setSrc0(insn, src);
+    this->setSrc1(insn, GenRegister::immud(0));
+    setDWordScatterMessage(this,
+                           insn,
+                           bti,
+                           block_size,
+                           GEN7_DWORD_GATHER,
+                           msg_length,
+                           response_length);
+
+  }
+
+  void GenEncoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+
+    if (this->curr.execWidth == 8) {
+      msg_length = srcNum;
+      response_length = 1;
+    } else if (this->curr.execWidth == 16) {
+      msg_length = 2*srcNum;
+      response_length = 2;
+    } else
+      NOT_IMPLEMENTED;
+
+    this->setHeader(insn);
+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+    setMessageDescriptor(insn, sfid, msg_length, response_length);
+    insn->bits3.gen7_atomic_op.msg_type = GEN7_UNTYPED_ATOMIC_READ;
+    insn->bits3.gen7_atomic_op.bti = bti;
+    insn->bits3.gen7_atomic_op.return_data = 1;
+    insn->bits3.gen7_atomic_op.aop_type = function;
+
+    if (this->curr.execWidth == 8)
+      insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD8;
+    else if (this->curr.execWidth == 16)
+      insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
+    else
+      NOT_SUPPORTED;
+
+  }
+  GenCompactInstruction *GenEncoder::nextCompact(uint32_t opcode) {
+    GenCompactInstruction insn;
+    std::memset(&insn, 0, sizeof(GenCompactInstruction));
+    insn.bits1.opcode = opcode;
+    this->store.push_back(insn.low);
+    return (GenCompactInstruction *)&this->store.back();
+  }
+
+  GenNativeInstruction *GenEncoder::next(uint32_t opcode) {
+     GenNativeInstruction insn;
+     std::memset(&insn, 0, sizeof(GenNativeInstruction));
+     insn.header.opcode = opcode;
+     this->store.push_back(insn.low);
+     this->store.push_back(insn.high);
+     return (GenNativeInstruction *)(&this->store.back()-1);
+  }
+
+  INLINE void _handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
+                            GenRegister src0, GenRegister src1 = GenRegister::null()) {
+       int w = p->curr.execWidth;
+       p->push();
+       p->curr.execWidth = p->getDoubleExecWidth();
+       p->curr.nibControl = 0;
+       GenNativeInstruction *insn = p->next(opcode);
+       p->setHeader(insn);
+       p->setDst(insn, dst);
+       p->setSrc0(insn, src0);
+       if (!GenRegister::isNull(src1))
+         p->setSrc1(insn, src1);
+       if (w == 8)
+         p->curr.nibControl = 1; // second 1/8 mask
+       insn = p->next(opcode);
+       p->setHeader(insn);
+       p->setDst(insn, GenRegister::suboffset(dst, w / 2));
+       p->setSrc0(insn, GenRegister::suboffset(src0, w / 2));
+       if (!GenRegister::isNull(src1))
+         p->setSrc1(insn, GenRegister::suboffset(src1, w / 2));
+       p->pop();
+  }
+
+  // Double register access is a little special. Per the Gen spec, the only
+  // supported mode is SIMD8, and each instruction only handles four doubles
+  // at a time. We therefore lower SIMD16 down to two SIMD8 operations and
+  // SIMD8 down to two SIMD1x4 operations.
+  INLINE void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister dst,
+                           GenRegister src0, GenRegister src1 = GenRegister::null()) {
+      if (p->curr.execWidth == 8)
+        _handleDouble(p, opcode, dst, src0, src1);
+      else if (p->curr.execWidth == 16) {
+        p->push();
+        p->curr.execWidth = 8;
+        p->curr.quarterControl = GEN_COMPRESSION_Q1;
+        _handleDouble(p, opcode, dst, src0, src1);
+        p->curr.quarterControl = GEN_COMPRESSION_Q2;
+        if (!GenRegister::isNull(src1))
+          src1 = GenRegister::offset(src1, 2);
+        _handleDouble(p, opcode, GenRegister::offset(dst, 2), GenRegister::offset(src0, 2), src1);
+        p->pop();
+      }
+  }
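+
+  // Put differently: with the lowering above a SIMD8 double operation is
+  // emitted as two native instructions (nibControl 0 and 1), and a SIMD16 one
+  // as four (Q1/Q2, each split in two), assuming the caller entered with
+  // execWidth 8 or 16 as in the paths above.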
+
+  void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst,
+            GenRegister src, uint32_t condition) {
+     if (dst.isdf() && src.isdf()) {
+       handleDouble(p, opcode, dst, src);
+     } else if (dst.isint64() && src.isint64()) { // handle int64
+       p->MOV(dst.bottom_half(), src.bottom_half());
+       p->MOV(dst.top_half(p->simdWidth), src.top_half(p->simdWidth));
+     } else if (needToSplitAlu1(p, dst, src) == false) {
+      if(compactAlu1(p, opcode, dst, src, condition, false))
+        return;
+       GenNativeInstruction *insn = p->next(opcode);
+       if (condition != 0) {
+         GBE_ASSERT(opcode == GEN_OPCODE_MOV ||
+                    opcode == GEN_OPCODE_NOT);
+         insn->header.destreg_or_condmod = condition;
+       }
+       p->setHeader(insn);
+       p->setDst(insn, dst);
+       p->setSrc0(insn, src);
+     } else {
+       GenNativeInstruction *insnQ1, *insnQ2;
+
+       // Instruction for the first quarter
+       insnQ1 = p->next(opcode);
+       p->setHeader(insnQ1);
+       insnQ1->header.quarter_control = GEN_COMPRESSION_Q1;
+       insnQ1->header.execution_size = GEN_WIDTH_8;
+       p->setDst(insnQ1, dst);
+       p->setSrc0(insnQ1, src);
+
+       // Instruction for the second quarter
+       insnQ2 = p->next(opcode);
+       p->setHeader(insnQ2);
+       insnQ2->header.quarter_control = GEN_COMPRESSION_Q2;
+       insnQ2->header.execution_size = GEN_WIDTH_8;
+       p->setDst(insnQ2, GenRegister::Qn(dst, 1));
+       p->setSrc0(insnQ2, GenRegister::Qn(src, 1));
+     }
+  }
+
+  void alu2(GenEncoder *p,
+            uint32_t opcode,
+            GenRegister dst,
+            GenRegister src0,
+            GenRegister src1,
+            uint32_t condition)
+  {
+    if (dst.isdf() && src0.isdf() && src1.isdf()) {
+       handleDouble(p, opcode, dst, src0, src1);
+    } else if (needToSplitAlu2(p, dst, src0, src1) == false) {
+       if(compactAlu2(p, opcode, dst, src0, src1, condition, false))
+         return;
+       GenNativeInstruction *insn = p->next(opcode);
+       if (condition != 0) {
+         GBE_ASSERT(opcode == GEN_OPCODE_OR ||
+                    opcode == GEN_OPCODE_XOR ||
+                    opcode == GEN_OPCODE_AND);
+         insn->header.destreg_or_condmod = condition;
+       }
+       p->setHeader(insn);
+       p->setDst(insn, dst);
+       p->setSrc0(insn, src0);
+       p->setSrc1(insn, src1);
+    } else {
+       GenNativeInstruction *insnQ1, *insnQ2;
+
+       // Instruction for the first quarter
+       insnQ1 = p->next(opcode);
+       p->setHeader(insnQ1);
+       insnQ1->header.quarter_control = GEN_COMPRESSION_Q1;
+       insnQ1->header.execution_size = GEN_WIDTH_8;
+       p->setDst(insnQ1, dst);
+       p->setSrc0(insnQ1, src0);
+       p->setSrc1(insnQ1, src1);
+
+       // Instruction for the second quarter
+       insnQ2 = p->next(opcode);
+       p->setHeader(insnQ2);
+       insnQ2->header.quarter_control = GEN_COMPRESSION_Q2;
+       insnQ2->header.execution_size = GEN_WIDTH_8;
+       p->setDst(insnQ2, GenRegister::Qn(dst, 1));
+       p->setSrc0(insnQ2, GenRegister::Qn(src0, 1));
+       p->setSrc1(insnQ2, GenRegister::Qn(src1, 1));
+    }
+  }
+
+#define NO_SWIZZLE ((0<<0) | (1<<2) | (2<<4) | (3<<6))
+
+  static GenNativeInstruction *alu3(GenEncoder *p,
+                              uint32_t opcode,
+                              GenRegister dest,
+                              GenRegister src0,
+                              GenRegister src1,
+                              GenRegister src2)
+  {
+     GenNativeInstruction *insn = p->next(opcode);
+
+     assert(dest.file == GEN_GENERAL_REGISTER_FILE);
+     assert(dest.nr < 128);
+     assert(dest.address_mode == GEN_ADDRESS_DIRECT);
+     assert(dest.type == GEN_TYPE_F);
+     insn->bits1.da3src.dest_reg_file = 0;
+     insn->bits1.da3src.dest_reg_nr = dest.nr;
+     insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
+     insn->bits1.da3src.dest_writemask = 0xf;
+     p->setHeader(insn);
+     insn->header.access_mode = GEN_ALIGN_16;
+     insn->header.execution_size = GEN_WIDTH_8;
+
+     assert(src0.file == GEN_GENERAL_REGISTER_FILE);
+     assert(src0.address_mode == GEN_ADDRESS_DIRECT);
+     assert(src0.nr < 128);
+     assert(src0.type == GEN_TYPE_F);
+     insn->bits2.da3src.src0_swizzle = NO_SWIZZLE;
+     insn->bits2.da3src.src0_subreg_nr = src0.subnr / 4 ;
+     insn->bits2.da3src.src0_reg_nr = src0.nr;
+     insn->bits1.da3src.src0_abs = src0.absolute;
+     insn->bits1.da3src.src0_negate = src0.negation;
+     insn->bits2.da3src.src0_rep_ctrl = src0.vstride == GEN_VERTICAL_STRIDE_0;
+
+     assert(src1.file == GEN_GENERAL_REGISTER_FILE);
+     assert(src1.address_mode == GEN_ADDRESS_DIRECT);
+     assert(src1.nr < 128);
+     assert(src1.type == GEN_TYPE_F);
+     insn->bits2.da3src.src1_swizzle = NO_SWIZZLE;
+     insn->bits2.da3src.src1_subreg_nr_low = (src1.subnr / 4) & 0x3;
+     insn->bits3.da3src.src1_subreg_nr_high = (src1.subnr / 4) >> 2;
+     insn->bits2.da3src.src1_rep_ctrl = src1.vstride == GEN_VERTICAL_STRIDE_0;
+     insn->bits3.da3src.src1_reg_nr = src1.nr;
+     insn->bits1.da3src.src1_abs = src1.absolute;
+     insn->bits1.da3src.src1_negate = src1.negation;
+
+     assert(src2.file == GEN_GENERAL_REGISTER_FILE);
+     assert(src2.address_mode == GEN_ADDRESS_DIRECT);
+     assert(src2.nr < 128);
+     assert(src2.type == GEN_TYPE_F);
+     insn->bits3.da3src.src2_swizzle = NO_SWIZZLE;
+     insn->bits3.da3src.src2_subreg_nr = src2.subnr / 4;
+     insn->bits3.da3src.src2_rep_ctrl = src2.vstride == GEN_VERTICAL_STRIDE_0;
+     insn->bits3.da3src.src2_reg_nr = src2.nr;
+     insn->bits1.da3src.src2_abs = src2.absolute;
+     insn->bits1.da3src.src2_negate = src2.negation;
+
+     // Emit second half of the instruction
+     if (p->curr.execWidth == 16) {
+      GenNativeInstruction q1Insn = *insn;
+      insn = p->next(opcode);
+      *insn = q1Insn;
+      insn->header.quarter_control = GEN_COMPRESSION_Q2;
+      insn->bits1.da3src.dest_reg_nr++;
+      if (insn->bits2.da3src.src0_rep_ctrl == 0)
+        insn->bits2.da3src.src0_reg_nr++;
+      if (insn->bits2.da3src.src1_rep_ctrl == 0)
+        insn->bits3.da3src.src1_reg_nr++;
+      if (insn->bits3.da3src.src2_rep_ctrl == 0)
+        insn->bits3.da3src.src2_reg_nr++;
+     }
+
+     return insn;
+  }
+
+#undef NO_SWIZZLE
+
+#define ALU1(OP) \
+  void GenEncoder::OP(GenRegister dest, GenRegister src0, uint32_t condition) { \
+    alu1(this, GEN_OPCODE_##OP, dest, src0, condition); \
+  }
+
+#define ALU2(OP) \
+  void GenEncoder::OP(GenRegister dest, GenRegister src0, GenRegister src1) { \
+    alu2(this, GEN_OPCODE_##OP, dest, src0, src1, 0); \
+  }
+
+#define ALU2_MOD(OP) \
+  void GenEncoder::OP(GenRegister dest, GenRegister src0, GenRegister src1, uint32_t condition) { \
+    alu2(this, GEN_OPCODE_##OP, dest, src0, src1, condition); \
+  }
+
+
+#define ALU3(OP) \
+  void GenEncoder::OP(GenRegister dest, GenRegister src0, GenRegister src1, GenRegister src2) { \
+    alu3(this, GEN_OPCODE_##OP, dest, src0, src1, src2); \
+  }
+
+  void GenEncoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
+    union { double d; unsigned u[2]; } u;
+    u.d = value;
+    GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
+    push();
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.noMask = 1;
+    curr.execWidth = 1;
+    MOV(r, GenRegister::immud(u.u[1]));
+    MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[0]));
+    pop();
+    r.type = GEN_TYPE_DF;
+    r.vstride = GEN_VERTICAL_STRIDE_0;
+    r.width = GEN_WIDTH_1;
+    r.hstride = GEN_HORIZONTAL_STRIDE_0;
+    push();
+    uint32_t width = curr.execWidth;
+    curr.execWidth = 8;
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.noMask = 1;
+    curr.quarterControl = GEN_COMPRESSION_Q1;
+    MOV(dest, r);
+    if (width == 16) {
+      curr.quarterControl = GEN_COMPRESSION_Q2;
+      MOV(GenRegister::offset(dest, 2), r);
+    }
+    pop();
+  }
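+
+  // In short, LOAD_DF_IMM stages the 64-bit pattern of 'value' into 'tmp' with
+  // two scalar dword moves, re-reads it as a broadcast double (vstride 0,
+  // width 1, hstride 0), and copies it to 'dest' one SIMD8 quarter at a time.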
+
+  void GenEncoder::UPSAMPLE_SHORT(GenRegister dest, GenRegister src0, GenRegister src1) {
+    dest.type = GEN_TYPE_B;
+    dest.hstride = GEN_HORIZONTAL_STRIDE_2;
+    src0.type = GEN_TYPE_B;
+    src0.hstride = GEN_HORIZONTAL_STRIDE_2;
+    src1.type = GEN_TYPE_B;
+    src1.hstride = GEN_HORIZONTAL_STRIDE_2;
+    MOV(dest, src1);
+    dest.subnr ++;
+    MOV(dest, src0);
+  }
+
+  void GenEncoder::UPSAMPLE_INT(GenRegister dest, GenRegister src0, GenRegister src1) {
+    dest.type = GEN_TYPE_W;
+    dest.hstride = GEN_HORIZONTAL_STRIDE_2;
+    src0.type = GEN_TYPE_W;
+    src0.hstride = GEN_HORIZONTAL_STRIDE_2;
+    src1.type = GEN_TYPE_W;
+    src1.hstride = GEN_HORIZONTAL_STRIDE_2;
+    MOV(dest, src1);
+    dest.subnr += 2;
+    MOV(dest, src0);
+  }
+
+  void GenEncoder::LOAD_INT64_IMM(GenRegister dest, int64_t value) {
+    GenRegister u0 = GenRegister::immd((int)value), u1 = GenRegister::immd(value >> 32);
+    MOV(dest.bottom_half(), u0);
+    MOV(dest.top_half(this->simdWidth), u1);
+  }
+
+  void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
+    GBE_ASSERT((src0.type == GEN_TYPE_F && dest.isdf()) || (src0.isdf() && dest.type == GEN_TYPE_F));
+    int w = curr.execWidth;
+    GenRegister r0;
+    int factor = 1;
+    if (dest.type == GEN_TYPE_F) {
+      r0 = r;
+      r = GenRegister::h2(r);
+      factor = 2;
+    } else {
+      r0 = GenRegister::h2(r);
+    }
+    push();
+    curr.execWidth = 8;
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.noMask = 1;
+    MOV(r0, src0);
+    MOV(GenRegister::suboffset(r0, 4 * factor), GenRegister::suboffset(src0, 4));
+    curr.noMask = 0;
+    curr.quarterControl = 0;
+    curr.nibControl = 0;
+    MOV(dest, r);
+    curr.nibControl = 1;
+    MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r, 8 / factor));
+    pop();
+    if (w == 16) {
+      push();
+      curr.execWidth = 8;
+      curr.predicate = GEN_PREDICATE_NONE;
+      curr.noMask = 1;
+      MOV(r0, GenRegister::suboffset(src0, 8));
+      MOV(GenRegister::suboffset(r0, 4 * factor), GenRegister::suboffset(src0, 12));
+      curr.noMask = 0;
+      curr.quarterControl = 1;
+      curr.nibControl = 0;
+      MOV(GenRegister::suboffset(dest, 8), r);
+      curr.nibControl = 1;
+      MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r, 8 / factor));
+      pop();
+    }
+  }
+
+  ALU1(MOV)
+  ALU1(RNDZ)
+  ALU1(RNDE)
+  ALU1(RNDD)
+  ALU1(RNDU)
+  ALU1(FBH)
+  ALU1(FBL)
+  ALU1(F16TO32)
+  ALU1(F32TO16)
+  ALU2(SEL)
+  ALU1(NOT)
+  ALU2_MOD(AND)
+  ALU2_MOD(OR)
+  ALU2_MOD(XOR)
+  ALU2(SHR)
+  ALU2(SHL)
+  ALU2(RSR)
+  ALU2(RSL)
+  ALU2(ASR)
+  ALU1(FRC)
+  ALU2(MAC)
+  ALU1(LZD)
+  ALU2(LINE)
+  ALU2(PLN)
+  ALU2(MACH)
+  ALU3(MAD)
+ // ALU2(BRC)
+ // ALU1(ENDIF)
+ //  ALU1(IF)
+
+  void GenEncoder::SUBB(GenRegister dest, GenRegister src0, GenRegister src1) {
+    push();
+    curr.accWrEnable = 1;
+    alu2(this, GEN_OPCODE_SUBB, dest, src0, src1);
+    pop();
+  }
+
+  void GenEncoder::ADDC(GenRegister dest, GenRegister src0, GenRegister src1) {
+    push();
+    curr.accWrEnable = 1;
+    alu2(this, GEN_OPCODE_ADDC, dest, src0, src1);
+    pop();
+  }
+
+  void GenEncoder::ADD(GenRegister dest, GenRegister src0, GenRegister src1) {
+     if (src0.type == GEN_TYPE_F ||
+         (src0.file == GEN_IMMEDIATE_VALUE &&
+          src0.type == GEN_TYPE_VF)) {
+        assert(src1.type != GEN_TYPE_UD);
+        assert(src1.type != GEN_TYPE_D);
+     }
+
+     if (src1.type == GEN_TYPE_F ||
+         (src1.file == GEN_IMMEDIATE_VALUE &&
+          src1.type == GEN_TYPE_VF)) {
+        assert(src0.type != GEN_TYPE_UD);
+        assert(src0.type != GEN_TYPE_D);
+     }
+
+     alu2(this, GEN_OPCODE_ADD, dest, src0, src1);
+  }
+
+  void GenEncoder::MUL(GenRegister dest, GenRegister src0, GenRegister src1) {
+     if (src0.type == GEN_TYPE_D ||
+         src0.type == GEN_TYPE_UD ||
+         src1.type == GEN_TYPE_D ||
+         src1.type == GEN_TYPE_UD)
+        assert(dest.type != GEN_TYPE_F);
+
+     if (src0.type == GEN_TYPE_F ||
+         (src0.file == GEN_IMMEDIATE_VALUE &&
+          src0.type == GEN_TYPE_VF)) {
+        assert(src1.type != GEN_TYPE_UD);
+        assert(src1.type != GEN_TYPE_D);
+     }
+
+     if (src1.type == GEN_TYPE_F ||
+         (src1.file == GEN_IMMEDIATE_VALUE &&
+          src1.type == GEN_TYPE_VF)) {
+        assert(src0.type != GEN_TYPE_UD);
+        assert(src0.type != GEN_TYPE_D);
+     }
+
+     assert(src0.file != GEN_ARCHITECTURE_REGISTER_FILE ||
+            src0.nr != GEN_ARF_ACCUMULATOR);
+     assert(src1.file != GEN_ARCHITECTURE_REGISTER_FILE ||
+            src1.nr != GEN_ARF_ACCUMULATOR);
+
+     alu2(this, GEN_OPCODE_MUL, dest, src0, src1);
+  }
+
+
+  void GenEncoder::NOP(void) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_NOP);
+    this->setDst(insn, GenRegister::retype(GenRegister::f4grf(0,0), GEN_TYPE_UD));
+    this->setSrc0(insn, GenRegister::retype(GenRegister::f4grf(0,0), GEN_TYPE_UD));
+    this->setSrc1(insn, GenRegister::immud(0x0));
+  }
+
+  void GenEncoder::BARRIER(GenRegister src) {
+     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+     this->setHeader(insn);
+     this->setDst(insn, GenRegister::null());
+     this->setSrc0(insn, src);
+     setMessageDescriptor(insn, GEN_SFID_MESSAGE_GATEWAY, 1, 0);
+     insn->bits3.msg_gateway.sub_function_id = GEN_BARRIER_MSG;
+     insn->bits3.msg_gateway.notify = 0x1;
+  }
+  void GenEncoder::FENCE(GenRegister dst) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    this->setHeader(insn);
+    this->setDst(insn, dst);
+    this->setSrc0(insn, dst);
+    setMessageDescriptor(insn, GEN_SFID_DATAPORT_DATA_CACHE, 1, 1, 1);
+    insn->bits3.gen7_memory_fence.msg_type = GEN_MEM_FENCE;
+    insn->bits3.gen7_memory_fence.commit_enable = 0x1;
+  }
+
+  void GenEncoder::JMPI(GenRegister src, bool longjmp) {
+    alu2(this, GEN_OPCODE_JMPI, GenRegister::ip(), GenRegister::ip(), src);
+    if (longjmp)
+      NOP();
+  }
+
+#define ALU2_BRA(OP) \
+  void GenEncoder::OP(GenRegister src) { \
+    alu2(this, GEN_OPCODE_##OP, GenRegister::nullud(), GenRegister::nullud(), src); \
+  }
+
+  ALU2_BRA(IF)
+  ALU2_BRA(ENDIF)
+  ALU2_BRA(BRD)
+  ALU2_BRA(BRC)
+
+  void GenEncoder::patchJMPI(uint32_t insnID, int32_t jumpDistance) {
+    GBE_ASSERT(insnID < this->store.size());
+    GenNativeInstruction &insn = *(GenNativeInstruction *)&this->store[insnID];
+    GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI ||
+               insn.header.opcode == GEN_OPCODE_BRD  ||
+               insn.header.opcode == GEN_OPCODE_ENDIF ||
+               insn.header.opcode == GEN_OPCODE_IF ||
+               insn.header.opcode == GEN_OPCODE_BRC);
+
+    if (insn.header.opcode != GEN_OPCODE_JMPI || (jumpDistance > -32769 && jumpDistance < 32768))  {
+           if (insn.header.opcode == GEN_OPCODE_IF) {
+             this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+             return;
+           }
+           else if (insn.header.opcode == GEN_OPCODE_JMPI) {
+             jumpDistance = jumpDistance - 2;
+           }
+
+           this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+    } else if ( insn.header.predicate_control == GEN_PREDICATE_NONE ) {
+      // For a conditional jump whose distance is out of the S15 range, we need
+      // an inverted jmp followed by an "add ip, ip, distance" to implement it.
+      // This is a little hacky as we have to turn the nop instruction into an
+      // add instruction manually.
+      // If this is an unconditional jump, we just add the distance to the IP
+      // directly (see the illustrative sketch after this function).
+      // FIXME there is an optimization opportunity here: we could insert the
+      // ADD instruction on demand. But that would need some extra analysis of
+      // all the branching instructions, and the distance of every branch whose
+      // span contains this instruction would have to be adjusted.
+      GenNativeInstruction *insn2 = (GenNativeInstruction *)&this->store[insnID+2];
+      GBE_ASSERT(insn2->header.opcode == GEN_OPCODE_NOP);
+      (void) insn2; // only used by the assertion; avoid an unused-variable warning
+      insn.header.opcode = GEN_OPCODE_ADD;
+      this->setDst(&insn, GenRegister::ip());
+      this->setSrc0(&insn, GenRegister::ip());
+      this->setSrc1(&insn, GenRegister::immd(jumpDistance * 8));
+    } else {
+      GenNativeInstruction &insn2 = *(GenNativeInstruction *)&this->store[insnID+2];
+      insn.header.predicate_inverse ^= 1;
+      this->setSrc1(&insn, GenRegister::immd(2));
+      GBE_ASSERT(insn2.header.opcode == GEN_OPCODE_NOP);
+      GBE_ASSERT(insnID < this->store.size());
+      insn2.header.predicate_control = GEN_PREDICATE_NONE;
+      insn2.header.opcode = GEN_OPCODE_ADD;
+      this->setDst(&insn2, GenRegister::ip());
+      this->setSrc0(&insn2, GenRegister::ip());
+      this->setSrc1(&insn2, GenRegister::immd((jumpDistance - 2) * 8));
+    }
+  }
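+
+  // Illustrative sketch only (not part of the encoder): roughly what the long
+  // distance patch above produces for a *conditional* JMPI whose distance does
+  // not fit in the immediate. The selection already emitted "jmpi; nop", and
+  // patchJMPI rewrites the pair as follows (the flag register name is just an
+  // example):
+  //
+  //     before patching                after patching
+  //     (+f0.0) jmpi <placeholder>     (-f0.0) jmpi 2               // skip the add
+  //             nop                            add ip, ip, (d-2)*8  // the real jump
+  //
+  // i.e. the predicate is inverted so that the short jmpi hops over the
+  // unpredicated "add ip, ip, distance" which performs the actual long jump;
+  // the scaling of the distance matches the conversion done in the code above.
+  // For an unconditional JMPI out of range, the jmpi itself is simply turned
+  // into "add ip, ip, d*8".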
+
+  void GenEncoder::CMP(uint32_t conditional, GenRegister src0, GenRegister src1, GenRegister dst) {
+    if (needToSplitCmp(this, src0, src1) == false) {
+      if(!GenRegister::isNull(dst) && compactAlu2(this, GEN_OPCODE_CMP, dst, src0, src1, conditional, false)) {
+        return;
+      }
+      GenNativeInstruction *insn = this->next(GEN_OPCODE_CMP);
+      this->setHeader(insn);
+      insn->header.destreg_or_condmod = conditional;
+      if (GenRegister::isNull(dst))
+        insn->header.thread_control = GEN_THREAD_SWITCH;
+      this->setDst(insn, dst);
+      this->setSrc0(insn, src0);
+      this->setSrc1(insn, src1);
+    } else {
+      GenNativeInstruction *insnQ1, *insnQ2;
+
+      // Instruction for the first quarter
+      insnQ1 = this->next(GEN_OPCODE_CMP);
+      this->setHeader(insnQ1);
+      if (GenRegister::isNull(dst))
+        insnQ1->header.thread_control = GEN_THREAD_SWITCH;
+      insnQ1->header.quarter_control = GEN_COMPRESSION_Q1;
+      insnQ1->header.execution_size = GEN_WIDTH_8;
+      insnQ1->header.destreg_or_condmod = conditional;
+      this->setDst(insnQ1, dst);
+      this->setSrc0(insnQ1, src0);
+      this->setSrc1(insnQ1, src1);
+
+      // Instruction for the second quarter
+      insnQ2 = this->next(GEN_OPCODE_CMP);
+      this->setHeader(insnQ2);
+      if (GenRegister::isNull(dst))
+        insnQ2->header.thread_control = GEN_THREAD_SWITCH;
+      insnQ2->header.quarter_control = GEN_COMPRESSION_Q2;
+      insnQ2->header.execution_size = GEN_WIDTH_8;
+      insnQ2->header.destreg_or_condmod = conditional;
+      this->setDst(insnQ2, GenRegister::Qn(dst, 1));
+      this->setSrc0(insnQ2, GenRegister::Qn(src0, 1));
+      this->setSrc1(insnQ2, GenRegister::Qn(src1, 1));
+    }
+  }
+
+  void GenEncoder::SEL_CMP(uint32_t conditional,
+                           GenRegister dst,
+                           GenRegister src0,
+                           GenRegister src1)
+  {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEL);
+    GBE_ASSERT(curr.predicate == GEN_PREDICATE_NONE);
+    this->setHeader(insn);
+    insn->header.destreg_or_condmod = conditional;
+    this->setDst(insn, dst);
+    this->setSrc0(insn, src0);
+    this->setSrc1(insn, src1);
+  }
+
+  void GenEncoder::WAIT(void) {
+     GenNativeInstruction *insn = this->next(GEN_OPCODE_WAIT);
+     GenRegister src = GenRegister::notification1();
+     this->setDst(insn, GenRegister::null());
+     this->setSrc0(insn, src);
+     this->setSrc1(insn, GenRegister::null());
+     insn->header.execution_size = 0; /* must */
+     insn->header.predicate_control = 0;
+     insn->header.quarter_control = 0;
+  }
+
+  void GenEncoder::MATH(GenRegister dst, uint32_t function, GenRegister src0, GenRegister src1) {
+     GenNativeInstruction *insn = this->next(GEN_OPCODE_MATH);
+     assert(dst.file == GEN_GENERAL_REGISTER_FILE);
+     assert(src0.file == GEN_GENERAL_REGISTER_FILE);
+     assert(src1.file == GEN_GENERAL_REGISTER_FILE);
+     assert(dst.hstride == GEN_HORIZONTAL_STRIDE_1 || dst.hstride == GEN_HORIZONTAL_STRIDE_0);
+
+     if (function == GEN_MATH_FUNCTION_INT_DIV_QUOTIENT ||
+         function == GEN_MATH_FUNCTION_INT_DIV_REMAINDER ||
+         function == GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
+        assert(src0.type != GEN_TYPE_F);
+        assert(src1.type != GEN_TYPE_F);
+     } else {
+        assert(src0.type == GEN_TYPE_F);
+        assert(src1.type == GEN_TYPE_F);
+     }
+
+     insn->header.destreg_or_condmod = function;
+     this->setHeader(insn);
+     this->setDst(insn, dst);
+     this->setSrc0(insn, src0);
+     this->setSrc1(insn, src1);
+
+     if (function == GEN_MATH_FUNCTION_INT_DIV_QUOTIENT ||
+         function == GEN_MATH_FUNCTION_INT_DIV_REMAINDER) {
+        insn->header.execution_size = this->curr.execWidth == 1 ? GEN_WIDTH_1 : GEN_WIDTH_8;
+        insn->header.quarter_control = GEN_COMPRESSION_Q1;
+
+        if(this->curr.execWidth == 16) {
+          GenNativeInstruction *insn2 = this->next(GEN_OPCODE_MATH);
+          GenRegister new_dest, new_src0, new_src1;
+          new_dest = GenRegister::QnPhysical(dst, 1);
+          new_src0 = GenRegister::QnPhysical(src0, 1);
+          new_src1 = GenRegister::QnPhysical(src1, 1);
+          insn2->header.destreg_or_condmod = function;
+          this->setHeader(insn2);
+          insn2->header.execution_size = GEN_WIDTH_8;
+          insn2->header.quarter_control = GEN_COMPRESSION_Q2;
+          this->setDst(insn2, new_dest);
+          this->setSrc0(insn2, new_src0);
+          this->setSrc1(insn2, new_src1);
+        }
+
+     }
+  }
+
+  void GenEncoder::MATH(GenRegister dst, uint32_t function, GenRegister src) {
+     GenNativeInstruction *insn = this->next(GEN_OPCODE_MATH);
+     assert(dst.file == GEN_GENERAL_REGISTER_FILE);
+     assert(src.file == GEN_GENERAL_REGISTER_FILE);
+     assert(dst.hstride == GEN_HORIZONTAL_STRIDE_1 || dst.hstride == GEN_HORIZONTAL_STRIDE_0);
+     assert(src.type == GEN_TYPE_F);
+
+     insn->header.destreg_or_condmod = function;
+     this->setHeader(insn);
+     this->setDst(insn, dst);
+     this->setSrc0(insn, src);
+  }
+
+  void GenEncoder::SAMPLE(GenRegister dest,
+                          GenRegister msg,
+                          unsigned int msg_len,
+                          bool header_present,
+                          unsigned char bti,
+                          unsigned char sampler,
+                          uint32_t simdWidth,
+                          uint32_t writemask,
+                          uint32_t return_format,
+                          bool isLD,
+                          bool isUniform)
+  {
+     if (writemask == 0) return;
+     uint32_t msg_type = isLD ? GEN_SAMPLER_MESSAGE_SIMD8_LD :
+                                GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE;
+     uint32_t response_length = (4 * (simdWidth / 8));
+     uint32_t msg_length = (msg_len * (simdWidth / 8));
+     if (header_present)
+       msg_length++;
+     uint32_t simd_mode = (simdWidth == 16) ?
+                            GEN_SAMPLER_SIMD_MODE_SIMD16 : GEN_SAMPLER_SIMD_MODE_SIMD8;
+    if(isUniform) {
+      response_length = 1;
+      msg_type = GEN_SAMPLER_MESSAGE_SIMD4X2_LD;
+      msg_length = 1;
+      simd_mode = GEN_SAMPLER_SIMD_MODE_SIMD4X2;
+    }
+     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+     this->setHeader(insn);
+     this->setDst(insn, dest);
+     this->setSrc0(insn, msg);
+     setSamplerMessage(this, insn, bti, sampler, msg_type,
+                       response_length, msg_length,
+                       header_present,
+                       simd_mode, return_format);
+  }
+
+  void GenEncoder::TYPED_WRITE(GenRegister msg, bool header_present, unsigned char bti)
+  {
+     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+     uint32_t msg_type = GEN_TYPED_WRITE;
+     uint32_t msg_length = header_present ? 9 : 8;
+     this->setHeader(insn);
+     this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+     this->setSrc0(insn, msg);
+     setTypedWriteMessage(insn, bti, msg_type, msg_length, header_present);
+  }
+  static void setScratchMessage(GenEncoder *p,
+                                   GenNativeInstruction *insn,
+                                   uint32_t offset,
+                                   uint32_t block_size,
+                                   uint32_t channel_mode,
+                                   uint32_t msg_type,
+                                   uint32_t msg_length,
+                                   uint32_t response_length)
+  {
+     const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+     p->setMessageDescriptor(insn, sfid, msg_length, response_length, true);
+     insn->bits3.gen7_scratch_rw.block_size = block_size;
+     insn->bits3.gen7_scratch_rw.msg_type = msg_type;
+     insn->bits3.gen7_scratch_rw.channel_mode = channel_mode;
+     insn->bits3.gen7_scratch_rw.offset = offset;
+     insn->bits3.gen7_scratch_rw.category = 1;
+  }
+
+  void GenEncoder::SCRATCH_WRITE(GenRegister msg, uint32_t offset, uint32_t size, uint32_t src_num, uint32_t channel_mode)
+  {
+     assert(src_num == 1 || src_num ==2);
+     uint32_t block_size = src_num == 1 ? GEN_SCRATCH_BLOCK_SIZE_1 : GEN_SCRATCH_BLOCK_SIZE_2;
+     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+     this->setHeader(insn);
+     this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+     this->setSrc0(insn, msg);
+     this->setSrc1(insn, GenRegister::immud(0));
+     // here src_num is the number of registers to write out, counted in 32-byte registers
+     setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_WRITE, src_num+1, 0);
+  }
+
+  void GenEncoder::SCRATCH_READ(GenRegister dst, GenRegister src, uint32_t offset, uint32_t size, uint32_t dst_num, uint32_t channel_mode)
+  {
+     assert(dst_num == 1 || dst_num ==2);
+     uint32_t block_size = dst_num == 1 ? GEN_SCRATCH_BLOCK_SIZE_1 : GEN_SCRATCH_BLOCK_SIZE_2;
+     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+     this->setHeader(insn);
+     this->setDst(insn, dst);
+     this->setSrc0(insn, src);
+     this->setSrc1(insn, GenRegister::immud(0));
+     // here dst_num is the number of registers to write back, counted in 32-byte registers
+     setScratchMessage(this, insn, offset, block_size, channel_mode, GEN_SCRATCH_READ, 1, dst_num);
+  }
+
+  void GenEncoder::EOT(uint32_t msg) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+    this->setSrc0(insn, GenRegister::ud8grf(msg,0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    insn->header.execution_size = GEN_WIDTH_8;
+    insn->bits3.spawner_gen5.resource = GEN_DO_NOT_DEREFERENCE_URB;
+    insn->bits3.spawner_gen5.msg_length = 1;
+    insn->bits3.spawner_gen5.end_of_thread = 1;
+    insn->header.destreg_or_condmod = GEN_SFID_THREAD_SPAWNER;
+  }
+} /* namespace gbe */
+
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
new file mode 100644
index 0000000..d6e2b97
--- /dev/null
+++ b/backend/src/backend/gen_encoder.hpp
@@ -0,0 +1,241 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith at tungstengraphics.com>
+  */
+
+#ifndef __GBE_GEN_ENCODER_HPP__
+#define __GBE_GEN_ENCODER_HPP__
+
+#include "backend/gen_defs.hpp"
+#include "backend/gen_register.hpp"
+#include "sys/platform.hpp"
+#include "sys/vector.hpp"
+#include <cassert>
+#include "src/cl_device_data.h"
+
+namespace gbe
+{
+  /*! Helper structure to emit Gen instructions */
+  class GenEncoder
+  {
+  public:
+    /*! simdWidth is the default width for the instructions */
+    GenEncoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID);
+
+    virtual ~GenEncoder(void) { }
+    /*! Size of the stack (should be large enough) */
+    enum { MAX_STATE_NUM = 16 };
+    /*! gen7 exec width of the double data type */
+    #define GEN7_DOUBLE_EXEC_WIDTH  8
+    /*! Push the current instruction state */
+    void push(void);
+    /*! Pop the latest pushed state */
+    void pop(void);
+    /*! The instruction stream we are building */
+    vector<GenInstruction> store;
+    /*! Current instruction state to use */
+    GenInstructionState curr;
+    /*! State used to encode the instructions */
+    GenInstructionState stack[MAX_STATE_NUM];
+    /*! Number of states currently pushed */
+    uint32_t stateNum;
+    /*! Gen generation to encode */
+    uint32_t gen;
+    /*! Device ID */
+    uint32_t deviceID;
+    /*! simd width for this codegen */
+    uint32_t simdWidth;
+    ////////////////////////////////////////////////////////////////////////
+    // Encoding functions
+    ////////////////////////////////////////////////////////////////////////
+
+#define ALU1(OP) void OP(GenRegister dest, GenRegister src0, uint32_t condition = 0);
+#define ALU2(OP) void OP(GenRegister dest, GenRegister src0, GenRegister src1);
+#define ALU2_MOD(OP) void OP(GenRegister dest, GenRegister src0, GenRegister src1, uint32_t condition = 0);
+#define ALU3(OP) void OP(GenRegister dest, GenRegister src0, GenRegister src1, GenRegister src2);
+    ALU1(MOV)
+    ALU1(FBH)
+    ALU1(FBL)
+    ALU2(SUBB)
+    ALU2(UPSAMPLE_SHORT)
+    ALU2(UPSAMPLE_INT)
+    ALU1(RNDZ)
+    ALU1(RNDE)
+    ALU1(RNDD)
+    ALU1(RNDU)
+    ALU1(F16TO32)
+    ALU1(F32TO16)
+    ALU2(SEL)
+    ALU1(NOT)
+    ALU2_MOD(AND)
+    ALU2_MOD(OR)
+    ALU2_MOD(XOR)
+    ALU2(SHR)
+    ALU2(SHL)
+    ALU2(RSR)
+    ALU2(RSL)
+    ALU2(ASR)
+    ALU2(ADD)
+    ALU2(ADDC)
+    ALU2(MUL)
+    ALU1(FRC)
+    ALU2(MAC)
+    ALU2(MACH)
+    ALU1(LZD)
+    ALU2(LINE)
+    ALU2(PLN)
+    ALU3(MAD)
+    //ALU2(MOV_DF);
+    ALU2(BRC)
+    ALU1(BRD)
+#undef ALU1
+#undef ALU2
+#undef ALU2_MOD
+#undef ALU3
+    /*! Get double/long exec width */
+    virtual int getDoubleExecWidth(void) { return GEN7_DOUBLE_EXEC_WIDTH; }
+    virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
+    virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
+    void LOAD_INT64_IMM(GenRegister dest, int64_t value);
+    /*! Barrier message (to synchronize threads of a workgroup) */
+    void BARRIER(GenRegister src);
+    /*! Memory fence message (to order loads and stores between threads) */
+    void FENCE(GenRegister dst);
+    /*! Jump indexed instruction */
+    virtual void JMPI(GenRegister src, bool longjmp = false);
+    /*! IF indexed instruction */
+    void IF(GenRegister src);
+    /*! ENDIF indexed instruction */
+    void ENDIF(GenRegister src);
+    /*! BRC indexed instruction */
+    void BRC(GenRegister src);
+    /*! BRD indexed instruction */
+    void BRD(GenRegister src);
+    /*! Compare instructions */
+    void CMP(uint32_t conditional, GenRegister src0, GenRegister src1, GenRegister dst = GenRegister::null());
+    /*! Select with embedded compare (like sel.le ...) */
+    void SEL_CMP(uint32_t conditional, GenRegister dst, GenRegister src0, GenRegister src1);
+    /*! EOT is used to finish GPGPU threads */
+    void EOT(uint32_t msg_nr);
+    /*! No-op */
+    void NOP(void);
+    /*! Wait instruction (used for the barrier) */
+    void WAIT(void);
+    /*! Atomic instructions */
+    virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
+    /*! Untyped read (up to 4 channels) */
+    virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum);
+    /*! Untyped write (up to 4 channels) */
+    virtual void UNTYPED_WRITE(GenRegister src, uint32_t bti, uint32_t elemNum);
+    /*! Byte gather (for unaligned bytes, shorts and ints) */
+    void BYTE_GATHER(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemSize);
+    /*! Byte scatter (for unaligned bytes, shorts and ints) */
+    void BYTE_SCATTER(GenRegister src, uint32_t bti, uint32_t elemSize);
+    /*! DWord gather (for constant cache read) */
+    void DWORD_GATHER(GenRegister dst, GenRegister src, uint32_t bti);
+    /*! for scratch memory read */
+    void SCRATCH_READ(GenRegister dst, GenRegister msg, uint32_t offset, uint32_t size, uint32_t dst_num, uint32_t channel_mode);
+    /*! for scratch memory write */
+    void SCRATCH_WRITE(GenRegister msg, uint32_t offset, uint32_t size, uint32_t src_num, uint32_t channel_mode);
+    /*! Send instruction for the sampler */
+    void SAMPLE(GenRegister dest,
+                GenRegister msg,
+                unsigned int msg_len,
+                bool header_present,
+                unsigned char bti,
+                unsigned char sampler,
+                unsigned int simdWidth,
+                uint32_t writemask,
+                uint32_t return_format,
+                bool isLD,
+                bool isUniform);
+
+    /*! TypedWrite instruction for texture */
+    virtual void TYPED_WRITE(GenRegister header,
+                             bool header_present,
+                             unsigned char bti);
+    /*! Extended math function (2 sources) */
+    void MATH(GenRegister dst, uint32_t function, GenRegister src0, GenRegister src1);
+    /*! Extended math function (1 source) */
+    void MATH(GenRegister dst, uint32_t function, GenRegister src);
+
+    /*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump distance */
+    virtual void patchJMPI(uint32_t insnID, int32_t jumpDistance);
+
+    ////////////////////////////////////////////////////////////////////////
+    // Helper functions to encode
+    ////////////////////////////////////////////////////////////////////////
+    virtual void setHeader(GenNativeInstruction *insn);
+    virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
+                                uint32_t msg_type, uint32_t msg_length,
+                                uint32_t response_length);
+    virtual void setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
+                                      unsigned char msg_type, uint32_t msg_length,
+                                      bool header_present);
+    void setMessageDescriptor(GenNativeInstruction *inst, enum GenMessageTarget sfid,
+                              unsigned msg_length, unsigned response_length,
+                              bool header_present = false, bool end_of_thread = false);
+    void setDst(GenNativeInstruction *insn, GenRegister dest);
+    void setSrc0(GenNativeInstruction *insn, GenRegister reg);
+    void setSrc1(GenNativeInstruction *insn, GenRegister reg);
+    GenCompactInstruction *nextCompact(uint32_t opcode);
+    GenNativeInstruction *next(uint32_t opcode);
+    uint32_t n_instruction(void) const { return store.size(); }
+    GBE_CLASS(GenEncoder); //!< Use custom allocators
+  };
+
+  void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst,
+            GenRegister src, uint32_t condition = 0);
+
+  void alu2(GenEncoder *p, uint32_t opcode, GenRegister dst,
+            GenRegister src0, GenRegister src1, uint32_t condition = 0);
+} /* namespace gbe */
+
+#endif /* __GBE_GEN_ENCODER_HPP__ */
+
+
diff --git a/backend/src/backend/gen_insn_compact.cpp b/backend/src/backend/gen_insn_compact.cpp
new file mode 100644
index 0000000..f19c364
--- /dev/null
+++ b/backend/src/backend/gen_insn_compact.cpp
@@ -0,0 +1,523 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Ruiling Song <ruiling.song at intel.com>
+ */
+#include "backend/gen_defs.hpp"
+#include "backend/gen_encoder.hpp"
+#include <cstring>
+
+namespace gbe {
+
+  struct compact_table_entry {
+    uint32_t bit_pattern;
+    uint32_t index;
+  };
+
+  static compact_table_entry control_table[] = {
+    {0b0000000000000000010, 0},
+    {0b0000100000000000000, 1},
+    {0b0000100000000000001, 2},
+    {0b0000100000000000010, 3},
+    {0b0000100000000000011, 4},
+    {0b0000100000000000100, 5},
+    {0b0000100000000000101, 6},
+    {0b0000100000000000111, 7},
+    {0b0000100000000001000, 8},
+    {0b0000100000000001001, 9},
+    {0b0000100000000001101, 10},
+    {0b0000110000000000000, 11},
+    {0b0000110000000000001, 12},
+    {0b0000110000000000010, 13},
+    {0b0000110000000000011, 14},
+    {0b0000110000000000100, 15},
+    {0b0000110000000000101, 16},
+    {0b0000110000000000111, 17},
+    {0b0000110000000001001, 18},
+    {0b0000110000000001101, 19},
+    {0b0000110000000010000, 20},
+    {0b0000110000100000000, 21},
+    {0b0001000000000000000, 22},
+    {0b0001000000000000010, 23},
+    {0b0001000000000000100, 24},
+    {0b0001000000100000000, 25},
+    {0b0010110000000000000, 26},
+    {0b0010110000000010000, 27},
+    {0b0011000000000000000, 28},
+    {0b0011000000100000000, 29},
+    {0b0101000000000000000, 30},
+    {0b0101000000100000000, 31},
+  };
+
+  static compact_table_entry data_type_table[] = {
+    {0b000000001000001100, 20},
+    {0b001000000000000001, 0},
+    {0b001000000000100000, 1},
+    {0b001000000000100001, 2},
+    {0b001000000000111101, 21},
+    {0b001000000001100001, 3},
+    {0b001000000010100101, 22},
+    {0b001000000010111101, 4},
+    {0b001000001011111101, 5},
+    {0b001000001110100001, 6},
+    {0b001000001110100101, 7},
+    {0b001000001110111101, 8},
+    {0b001000010000100000, 23},
+    {0b001000010000100001, 9},
+    {0b001000110000100000, 10},
+    {0b001000110000100001, 11},
+    {0b001001010010100100, 24},
+    {0b001001010010100101, 12},
+    {0b001001110010000100, 25},
+    {0b001001110010100100, 13},
+    {0b001001110010100101, 14},
+    {0b001010010100001001, 26},
+    {0b001010010100101000, 30},
+    {0b001010110100101000, 31},
+    {0b001011110110101100, 29},
+    {0b001101111110111101, 27},
+    {0b001111001110111101, 15},
+    {0b001111011110011101, 16},
+    {0b001111011110111100, 17},
+    {0b001111011110111101, 18},
+    {0b001111111110111100, 19},
+    {0b001111111110111101, 28},
+  };
+
+  static compact_table_entry data_type_decompact[] = {
+    {0b001000000000000001, 0},
+    {0b001000000000100000, 1},
+    {0b001000000000100001, 2},
+    {0b001000000001100001, 3},
+    {0b001000000010111101, 4},
+    {0b001000001011111101, 5},
+    {0b001000001110100001, 6},
+    {0b001000001110100101, 7},
+    {0b001000001110111101, 8},
+    {0b001000010000100001, 9},
+    {0b001000110000100000, 10},
+    {0b001000110000100001, 11},
+    {0b001001010010100101, 12},
+    {0b001001110010100100, 13},
+    {0b001001110010100101, 14},
+    {0b001111001110111101, 15},
+    {0b001111011110011101, 16},
+    {0b001111011110111100, 17},
+    {0b001111011110111101, 18},
+    {0b001111111110111100, 19},
+    {0b000000001000001100, 20},
+    {0b001000000000111101, 21},
+    {0b001000000010100101, 22},
+    {0b001000010000100000, 23},
+    {0b001001010010100100, 24},
+    {0b001001110010000100, 25},
+    {0b001010010100001001, 26},
+    {0b001101111110111101, 27},
+    {0b001111111110111101, 28},
+    {0b001011110110101100, 29},
+    {0b001010010100101000, 30},
+    {0b001010110100101000, 31},
+  };
+
+  static compact_table_entry subreg_table[] = {
+    {0b000000000000000, 0},
+    {0b000000000000001, 1},
+    {0b000000000001000, 2},
+    {0b000000000001111, 3},
+    {0b000000000010000, 4},
+    {0b000000010000000, 5},
+    {0b000000100000000, 6},
+    {0b000000110000000, 7},
+    {0b000001000000000, 8},
+    {0b000001000010000, 9},
+    {0b000001010000000, 10},
+    {0b001000000000000, 11},
+    {0b001000000000001, 12},
+    {0b001000010000001, 13},
+    {0b001000010000010, 14},
+    {0b001000010000011, 15},
+    {0b001000010000100, 16},
+    {0b001000010000111, 17},
+    {0b001000010001000, 18},
+    {0b001000010001110, 19},
+    {0b001000010001111, 20},
+    {0b001000110000000, 21},
+    {0b001000111101000, 22},
+    {0b010000000000000, 23},
+    {0b010000110000000, 24},
+    {0b011000000000000, 25},
+    {0b011110010000111, 26},
+    {0b100000000000000, 27},
+    {0b101000000000000, 28},
+    {0b110000000000000, 29},
+    {0b111000000000000, 30},
+    {0b111000000011100, 31},
+  };
+
+  static compact_table_entry srcreg_table[] = {
+    {0b000000000000, 0},
+    {0b000000000010, 1},
+    {0b000000010000, 2},
+    {0b000000010010, 3},
+    {0b000000011000, 4},
+    {0b000000100000, 5},
+    {0b000000101000, 6},
+    {0b000001001000, 7},
+    {0b000001010000, 8},
+    {0b000001110000, 9},
+    {0b000001111000, 10},
+    {0b001100000000, 11},
+    {0b001100000010, 12},
+    {0b001100001000, 13},
+    {0b001100010000, 14},
+    {0b001100010010, 15},
+    {0b001100100000, 16},
+    {0b001100101000, 17},
+    {0b001100111000, 18},
+    {0b001101000000, 19},
+    {0b001101000010, 20},
+    {0b001101001000, 21},
+    {0b001101010000, 22},
+    {0b001101100000, 23},
+    {0b001101101000, 24},
+    {0b001101110000, 25},
+    {0b001101110001, 26},
+    {0b001101111000, 27},
+    {0b010001101000, 28},
+    {0b010001101001, 29},
+    {0b010001101010, 30},
+    {0b010110001000, 31},
+  };
+
+  static int cmp_key(const void *p1, const void*p2) {
+    const compact_table_entry * px = (compact_table_entry *)p1;
+    const compact_table_entry * py = (compact_table_entry *)p2;
+    return (px->bit_pattern) - py->bit_pattern;
+  }
+  union ControlBits{
+    struct {
+      uint32_t access_mode:1;
+      uint32_t mask_control:1;
+      uint32_t dependency_control:2;
+      uint32_t quarter_control:2;
+      uint32_t thread_control:2;
+      uint32_t predicate_control:4;
+      uint32_t predicate_inverse:1;
+      uint32_t execution_size:3;
+      uint32_t saturate:1;
+      uint32_t flag_sub_reg_nr:1;
+      uint32_t flag_reg_nr:1;
+      uint32_t pad:23;
+    };
+    uint32_t data;
+  };
+  union DataTypeBits{
+    struct {
+      uint32_t dest_reg_file:2;
+      uint32_t dest_reg_type:3;
+      uint32_t src0_reg_file:2;
+      uint32_t src0_reg_type:3;
+      uint32_t src1_reg_file:2;
+      uint32_t src1_reg_type:3;
+      uint32_t dest_horiz_stride:2;
+      uint32_t dest_address_mode:1;
+      uint32_t pad:14;
+    };
+    uint32_t data;
+  };
+  union SubRegBits {
+    struct {
+      uint32_t dest_subreg_nr:5;
+      uint32_t src0_subreg_nr:5;
+      uint32_t src1_subreg_nr:5;
+      uint32_t pad:17;
+    };
+    uint32_t data;
+  };
+  union SrcRegBits {
+    struct {
+      uint32_t src_abs:1;
+      uint32_t src_negate:1;
+      uint32_t src_address_mode:1;
+      uint32_t src_horiz_stride:2;
+      uint32_t src_width:3;
+      uint32_t src_vert_stride:4;
+      uint32_t pad:20;
+    };
+    uint32_t data;
+  };
+
+  void decompactInstruction(GenCompactInstruction * p, GenNativeInstruction *pOut) {
+
+    memset(pOut, 0, sizeof(GenNativeInstruction));
+    union ControlBits control_bits;
+    control_bits.data = control_table[(uint32_t)p->bits1.control_index].bit_pattern;
+    pOut->low.low = (uint32_t)p->bits1.opcode | ((control_bits.data & 0xffff) << 8);
+    pOut->header.destreg_or_condmod = p->bits1.destreg_or_condmod;
+    pOut->header.saturate = control_bits.saturate;
+    pOut->header.acc_wr_control = p->bits1.acc_wr_control;
+    pOut->header.cmpt_control = p->bits1.cmpt_control;
+    pOut->header.debug_control = p->bits1.debug_control;
+
+    union DataTypeBits data_type_bits;
+    union SubRegBits subreg_bits;
+    union SrcRegBits src0_bits;
+    data_type_bits.data = data_type_decompact[(uint32_t)p->bits1.data_type_index].bit_pattern;
+    subreg_bits.data = subreg_table[(uint32_t)p->bits1.sub_reg_index].bit_pattern;
+    src0_bits.data = srcreg_table[p->bits1.src0_index_lo | p->bits2.src0_index_hi << 2].bit_pattern;
+
+    pOut->low.high |= data_type_bits.data & 0x7fff;
+    pOut->bits1.da1.dest_horiz_stride = data_type_bits.dest_horiz_stride;
+    pOut->bits1.da1.dest_address_mode = data_type_bits.dest_address_mode;
+    pOut->bits1.da1.dest_reg_nr = p->bits2.dest_reg_nr;
+    pOut->bits1.da1.dest_subreg_nr = subreg_bits.dest_subreg_nr;
+
+    pOut->bits2.da1.src0_subreg_nr = subreg_bits.src0_subreg_nr;
+    pOut->bits2.da1.src0_reg_nr = p->bits2.src0_reg_nr;
+    pOut->high.low |= (src0_bits.data << 13);
+    pOut->bits2.da1.flag_sub_reg_nr = control_bits.flag_sub_reg_nr;
+    pOut->bits2.da1.flag_reg_nr = control_bits.flag_reg_nr;
+
+    if(data_type_bits.src1_reg_file == GEN_IMMEDIATE_VALUE) {
+      uint32_t imm = (uint32_t)p->bits2.src1_reg_nr | (p->bits2.src1_index<<8);
+      pOut->bits3.ud = imm & 0x1000 ? (imm | 0xfffff000) : imm;
+    } else {
+      union SrcRegBits src1_bits;
+      src1_bits.data = srcreg_table[p->bits2.src1_index].bit_pattern;
+      pOut->bits3.da1.src1_subreg_nr = subreg_bits.src1_subreg_nr;
+      pOut->bits3.da1.src1_reg_nr = p->bits2.src1_reg_nr;
+      pOut->high.high |= (src1_bits.data << 13);
+    }
+  }
+
+  int compactControlBits(GenEncoder *p, uint32_t quarter, uint32_t execWidth) {
+
+    const GenInstructionState *s = &p->curr;
+    // some quick checks
+    if(s->nibControl != 0)
+      return -1;
+    if(s->predicate > GEN_PREDICATE_NORMAL)
+      return -1;
+    if(s->flag == 1)
+      return -1;
+
+    ControlBits b;
+    b.data = 0;
+
+    if (execWidth == 8)
+      b.execution_size = GEN_WIDTH_8;
+    else if (execWidth == 16)
+      b.execution_size = GEN_WIDTH_16;
+    else if (execWidth == 4)
+      b.execution_size = GEN_WIDTH_4;
+    else if (execWidth == 1)
+      b.execution_size = GEN_WIDTH_1;
+    else
+      NOT_IMPLEMENTED;
+
+    b.mask_control = s->noMask;
+    b.quarter_control = quarter;
+    b.predicate_control = s->predicate;
+    b.predicate_inverse = s->inversePredicate;
+
+    b.saturate = s->saturate;
+    b.flag_sub_reg_nr = s->subFlag;
+    b.flag_reg_nr = s->flag;
+
+    compact_table_entry key;
+    key.bit_pattern = b.data;
+
+    compact_table_entry *r = (compact_table_entry *)bsearch(&key, control_table,
+      sizeof(control_table)/sizeof(compact_table_entry), sizeof(compact_table_entry), cmp_key);
+    if (r == NULL)
+      return -1;
+    return r->index;
+  }
+
+  int compactDataTypeBits(GenEncoder *p, GenRegister *dst, GenRegister *src0, GenRegister *src1) {
+
+    // compact instructions do not support any indirect access
+    if(dst->address_mode != GEN_ADDRESS_DIRECT)
+      return -1;
+
+    if(src0->file == GEN_IMMEDIATE_VALUE)
+      return -1;
+
+    DataTypeBits b;
+    b.data = 0;
+
+    b.dest_horiz_stride = dst->hstride == GEN_HORIZONTAL_STRIDE_0 ? GEN_HORIZONTAL_STRIDE_1 : dst->hstride;
+    b.dest_address_mode = dst->address_mode;
+    b.dest_reg_file = dst->file;
+    b.dest_reg_type = dst->type;
+
+    b.src0_reg_file = src0->file;
+    b.src0_reg_type = src0->type;
+
+    if(src1) {
+      b.src1_reg_type = src1->type;
+      b.src1_reg_file = src1->file;
+    } else {
+      // default to zero
+      b.src1_reg_type = 0;
+      b.src1_reg_file = 0;
+    }
+
+    compact_table_entry key;
+    key.bit_pattern = b.data;
+
+    compact_table_entry *r = (compact_table_entry *)bsearch(&key, data_type_table,
+                             sizeof(data_type_table)/sizeof(compact_table_entry), sizeof(compact_table_entry), cmp_key);
+    if (r == NULL)
+      return -1;
+    return r->index;
+  }
+  int compactSubRegBits(GenEncoder *p, GenRegister *dst, GenRegister *src0, GenRegister *src1) {
+    SubRegBits b;
+    b.data = 0;
+    b.dest_subreg_nr = dst->subnr;
+    b.src0_subreg_nr = src0->subnr;
+    if(src1)
+      b.src1_subreg_nr = src1->subnr;
+    else
+      b.src1_subreg_nr = 0;
+
+    compact_table_entry key;
+    key.bit_pattern = b.data;
+
+    compact_table_entry *r = (compact_table_entry *)bsearch(&key, subreg_table,
+                sizeof(subreg_table)/sizeof(compact_table_entry), sizeof(compact_table_entry), cmp_key);
+    if (r == NULL)
+      return -1;
+    return r->index;
+  }
+  int compactSrcRegBits(GenEncoder *p, GenRegister *src) {
+    // As we only use GEN_ALIGN_1 and compact instructions only support direct register access,
+    // we only need to verify [hstride, width, vstride]
+    if(src->file == GEN_IMMEDIATE_VALUE)
+      return -1;
+    if(src->address_mode != GEN_ADDRESS_DIRECT)
+      return -1;
+
+    SrcRegBits b;
+    b.data = 0;
+    b.src_abs = src->absolute;
+    b.src_negate = src->negation;
+    b.src_address_mode = src->address_mode;
+    if(p->curr.execWidth == 1 && src->width == GEN_WIDTH_1) {
+      b.src_width = src->width;
+      b.src_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
+      b.src_vert_stride = GEN_VERTICAL_STRIDE_0;
+    }
+    else {
+      b.src_horiz_stride = src->hstride;
+      b.src_width = src->width;
+      b.src_vert_stride = src->vstride;
+    }
+    compact_table_entry key;
+    key.bit_pattern = b.data;
+
+    compact_table_entry *r = (compact_table_entry *)bsearch(&key, srcreg_table,
+                    sizeof(srcreg_table)/sizeof(compact_table_entry), sizeof(compact_table_entry), cmp_key);
+    if (r == NULL)
+      return -1;
+    return r->index;
+  }
+
+  bool compactAlu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src, uint32_t condition, bool split) {
+    if(split) {
+      // TODO support it
+      return false;
+    } else {
+      int control_index = compactControlBits(p, p->curr.quarterControl, p->curr.execWidth);
+      if(control_index == -1) return false;
+
+      int data_type_index = compactDataTypeBits(p, &dst, &src, NULL);
+      if(data_type_index == -1) return false;
+
+      int sub_reg_index = compactSubRegBits(p, &dst, &src, NULL);
+      if(sub_reg_index == -1) return false;
+
+      int src_reg_index = compactSrcRegBits(p, &src);
+      if(src_reg_index == -1) return false;
+
+      GenCompactInstruction * insn = p->nextCompact(opcode);
+      insn->bits1.control_index = control_index;
+      insn->bits1.data_type_index = data_type_index;
+      insn->bits1.sub_reg_index = sub_reg_index;
+      insn->bits1.acc_wr_control = p->curr.accWrEnable;
+      insn->bits1.destreg_or_condmod = condition;
+      insn->bits1.cmpt_control = 1;
+      insn->bits1.src0_index_lo = src_reg_index & 3;
+
+      insn->bits2.src0_index_hi = src_reg_index >> 2;
+      insn->bits2.src1_index = 0;
+      insn->bits2.dest_reg_nr = dst.nr;
+      insn->bits2.src0_reg_nr = src.nr;
+      insn->bits2.src1_reg_nr = 0;
+      return true;
+    }
+  }
+
+  bool compactAlu2(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1, uint32_t condition, bool split) {
+    if(split) {
+      // TODO support it
+      return false;
+    } else {
+      if(opcode == GEN_OPCODE_IF  || opcode == GEN_OPCODE_ENDIF || opcode == GEN_OPCODE_JMPI) return false;
+
+      int control_index = compactControlBits(p, p->curr.quarterControl, p->curr.execWidth);
+      if(control_index == -1) return false;
+
+      int data_type_index = compactDataTypeBits(p, &dst, &src0, &src1);
+      if(data_type_index == -1) return false;
+
+      int sub_reg_index = compactSubRegBits(p, &dst, &src0, &src1);
+      if(sub_reg_index == -1) return false;
+
+      int src0_reg_index = compactSrcRegBits(p, &src0);
+      if(src0_reg_index == -1) return false;
+
+      bool src1_imm = false;
+      int src1_reg_index;
+      if(src1.file == GEN_IMMEDIATE_VALUE) {
+        if(src1.absolute != 0 || src1.negation != 0 || src1.type == GEN_TYPE_F)
+          return false;
+        if(src1.value.d < -4096 || src1.value.d > 4095) // 13bit signed imm
+          return false;
+        src1_imm = true;
+      } else {
+        src1_reg_index = compactSrcRegBits(p, &src1);
+        if(src1_reg_index == -1) return false;
+      }
+      GenCompactInstruction * insn = p->nextCompact(opcode);
+      insn->bits1.control_index = control_index;
+      insn->bits1.data_type_index = data_type_index;
+      insn->bits1.sub_reg_index = sub_reg_index;
+      insn->bits1.acc_wr_control = p->curr.accWrEnable;
+      insn->bits1.destreg_or_condmod = condition;
+      insn->bits1.cmpt_control = 1;
+      insn->bits1.src0_index_lo = src0_reg_index & 3;
+
+      insn->bits2.src0_index_hi = src0_reg_index >> 2;
+      insn->bits2.src1_index = src1_imm ? (src1.value.ud & 8191)>> 8 : src1_reg_index;
+      insn->bits2.dest_reg_nr = dst.nr;
+      insn->bits2.src0_reg_nr = src0.nr;
+      insn->bits2.src1_reg_nr = src1_imm ? (src1.value.ud & 0xff): src1.nr;
+      return true;
+    }
+  }
+};
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
new file mode 100644
index 0000000..8535b4a
--- /dev/null
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -0,0 +1,42 @@
+//                 Family     Latency     SIMD16     SIMD8
+DECL_GEN7_SCHEDULE(Label,           0,         0,        0)
+DECL_GEN7_SCHEDULE(Unary,           20,        4,        2)
+DECL_GEN7_SCHEDULE(UnaryWithTemp,   20,        40,      20)
+DECL_GEN7_SCHEDULE(Binary,          20,        4,        2)
+DECL_GEN7_SCHEDULE(BinaryWithTemp,  20,        40,      20)
+DECL_GEN7_SCHEDULE(Ternary,         20,        4,        2)
+DECL_GEN7_SCHEDULE(I64Shift,        20,        40,      20)
+DECL_GEN7_SCHEDULE(I64HADD,         20,        40,      20)
+DECL_GEN7_SCHEDULE(I64RHADD,        20,        40,      20)
+DECL_GEN7_SCHEDULE(I64ToFloat,      20,        40,      20)
+DECL_GEN7_SCHEDULE(FloatToI64,      20,        40,      20)
+DECL_GEN7_SCHEDULE(I64MULHI,        20,        40,      20)
+DECL_GEN7_SCHEDULE(I64MADSAT,       20,        40,      20)
+DECL_GEN7_SCHEDULE(Compare,         20,        4,        2)
+DECL_GEN7_SCHEDULE(I64Compare,      20,        80,      20)
+DECL_GEN7_SCHEDULE(I64DIVREM,       20,        80,      20)
+DECL_GEN7_SCHEDULE(Jump,            14,        1,        1)
+DECL_GEN7_SCHEDULE(IndirectMove,    20,        2,        2)
+DECL_GEN7_SCHEDULE(Eot,             20,        1,        1)
+DECL_GEN7_SCHEDULE(NoOp,            20,        2,        2)
+DECL_GEN7_SCHEDULE(Wait,            20,        2,        2)
+DECL_GEN7_SCHEDULE(Math,            20,        4,        2)
+DECL_GEN7_SCHEDULE(Barrier,         80,        1,        1)
+DECL_GEN7_SCHEDULE(Fence,           80,        1,        1)
+DECL_GEN7_SCHEDULE(Read64,          80,        1,        1)
+DECL_GEN7_SCHEDULE(Write64,         80,        1,        1)
+DECL_GEN7_SCHEDULE(UntypedRead,     160,       1,        1)
+DECL_GEN7_SCHEDULE(UntypedWrite,    160,       1,        1)
+DECL_GEN7_SCHEDULE(ByteGather,      160,       1,        1)
+DECL_GEN7_SCHEDULE(ByteScatter,     160,       1,        1)
+DECL_GEN7_SCHEDULE(DWordGather,     160,       1,        1)
+DECL_GEN7_SCHEDULE(PackByte,        40,        1,        1)
+DECL_GEN7_SCHEDULE(UnpackByte,      40,        1,        1)
+DECL_GEN7_SCHEDULE(Sample,          160,       1,        1)
+DECL_GEN7_SCHEDULE(TypedWrite,      80,        1,        1)
+DECL_GEN7_SCHEDULE(SpillReg,        20,        1,        1)
+DECL_GEN7_SCHEDULE(UnSpillReg,      160,       1,        1)
+DECL_GEN7_SCHEDULE(Atomic,          80,        1,        1)
+DECL_GEN7_SCHEDULE(I64MUL,          20,        40,      20)
+DECL_GEN7_SCHEDULE(I64SATADD,       20,        40,      20)
+DECL_GEN7_SCHEDULE(I64SATSUB,       20,        40,      20)
diff --git a/backend/src/backend/gen_insn_scheduling.cpp b/backend/src/backend/gen_insn_scheduling.cpp
new file mode 100644
index 0000000..106d608
--- /dev/null
+++ b/backend/src/backend/gen_insn_scheduling.cpp
@@ -0,0 +1,722 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file gen_insn_scheduling.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ * Overall idea:
+ * =============
+ *
+ * This is the instruction scheduling part of the code. With Gen, we actually
+ * have a simple strategy to follow. Indeed, here are the constraints:
+ *
+ * 1 - the number of registers per HW thread is constant and given (128 32-byte
+ * GRFs per thread). So, we can use all these registers with no penalty
+ * 2 - spilling is super bad. Instruction latency matters but the top priority
+ * is to avoid spilling as much as possible
+ *
+ *
+ * We schedule twice, each time using a local forward list scheduler
+ *
+ * Before the register allocation
+ * ==============================
+ *
+ * We try to limit the register pressure.
+ * This is a hard problem and we now have a decent strategy that we call
+ * "zero cycle LIFO scheduling".
+ * We use a local forward list scheduler and schedule the instructions in LIFO
+ * order, i.e. as a stack. Basically, we take the most recent instruction and
+ * schedule it right away. We completely ignore the real latencies and
+ * throughputs and just simulate instructions that are issued and completed in
+ * zero cycles (a small standalone sketch of this idea follows this comment).
+ * For the complex kernels we already have (like the menger sponge), this
+ * provides a pretty good strategy: it enables SIMD16 code generation in cases
+ * where, with scheduling deactivated, even SIMD8 fails.
+ *
+ * One may argue that this strategy is bad latency-wise. This is not true since
+ * the register allocator will try to burn as many registers as possible anyway.
+ * So, there are still opportunities to schedule after register allocation.
+ *
+ * Our idea seems to work decently. There is however a strong research article
+ * that is able to near-optimally reschedule the instructions to minimize
+ * register use. This is:
+ *
+ * "Minimum Register Instruction Sequence Problem: Revisiting Optimal Code
+ *  Generation for DAGs"
+ *
+ * After the register allocation
+ * ==============================
+ *
+ * Here we use a pretty simple strategy based on regular forward list
+ * scheduling. Since Gen is a co-issue based machine, it is pointless to take
+ * really precise timings into account since instruction issue will happen
+ * out-of-order based on the execution of other threads.
+ *
+ * Note that we over-simplify the problem. Indeed, the Gen register file is
+ * flexible and we are able to use sub-registers of a GRF, in particular when we
+ * handle uniforms or mask registers which are spilled into GRFs. The thing is
+ * that two uniforms may not interfere even if they belong to the same GRF (i.e.
+ * they use two different sub-registers). This means that the interference
+ * relation is not transitive for Gen. To simplify everything, we just consider
+ * full GRFs (in SIMD8) or double full GRFs (in SIMD16) regardless of whether
+ * the register is a uniform, a mask or a regular GRF.
+ *
+ * Obviously, this leads to extra dependencies in the code.
+ */
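+
+/* A minimal standalone sketch (illustrative only, not used by the driver) of
+ * the "zero cycle LIFO" idea described above, on a simplified DAG node type.
+ * The names below are hypothetical and the snippet assumes <vector> and
+ * <algorithm>:
+ *
+ *   struct Node {
+ *     std::vector<Node*> children; // nodes that depend on this one
+ *     int refNum;                  // dependencies not yet scheduled
+ *     int emitOrder;               // position in the original program order
+ *   };
+ *
+ *   // Forward list scheduling where the ready set behaves as a stack: always
+ *   // pick the most recently emitted ready node and retire it in zero cycles,
+ *   // so that its children may become ready immediately.
+ *   std::vector<Node*> scheduleZeroCycleLIFO(std::vector<Node*> ready) {
+ *     std::vector<Node*> out;
+ *     while (!ready.empty()) {
+ *       auto it = std::max_element(ready.begin(), ready.end(),
+ *           [](Node *a, Node *b) { return a->emitOrder < b->emitOrder; });
+ *       Node *n = *it;
+ *       ready.erase(it);
+ *       out.push_back(n);
+ *       for (Node *c : n->children)
+ *         if (--c->refNum == 0)
+ *           ready.push_back(c);
+ *     }
+ *     return out;
+ *   }
+ */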
+
+#include "backend/gen_insn_selection.hpp"
+#include "backend/gen_reg_allocation.hpp"
+#include "sys/cvar.hpp"
+#include "sys/intrusive_list.hpp"
+
+namespace gbe
+{
+  // Helper structure to schedule the basic blocks
+  struct SelectionScheduler;
+
+  // Node for the schedule DAG
+  struct ScheduleDAGNode;
+
+  typedef enum {
+    WRITE_AFTER_WRITE,
+    WRITE_AFTER_READ,
+    READ_AFTER_WRITE,
+    READ_AFTER_WRITE_MEMORY
+  } DepMode;
+
+  /*! We need to chain together the node we point */
+  struct ScheduleListNode : public intrusive_list_node
+  {
+    INLINE ScheduleListNode(ScheduleDAGNode *node, DepMode m = READ_AFTER_WRITE) : node(node), depMode(m) {}
+    ScheduleDAGNode *node;
+    DepMode depMode;
+  };
+
+  /*! Node of the DAG */
+  struct ScheduleDAGNode
+  {
+    INLINE ScheduleDAGNode(SelectionInstruction &insn) :
+      insn(insn), refNum(0), retiredCycle(0), preRetired(false), readDistance(0x7fffffff) {}
+    bool dependsOn(ScheduleDAGNode *node) const {
+      GBE_ASSERT(node != NULL);
+      for (auto child : node->children)
+        if (child.node == this)
+          return true;
+      return false;
+    }
+    /*! Children that depend on us */
+    intrusive_list<ScheduleListNode> children;
+    /*! Instruction after code selection */
+    SelectionInstruction &insn;
+    /*! Number of nodes that point to us (i.e. nodes we depend on) */
+    uint32_t refNum;
+    /*! Cycle when the instruction is retired */
+    uint32_t retiredCycle;
+    bool preRetired;
+    uint32_t readDistance;
+  };
+
+  /*! To track loads and stores */
+  enum GenMemory : uint8_t {
+    GLOBAL_MEMORY = 0,
+    LOCAL_MEMORY,
+    SCRATCH_MEMORY,
+    MAX_MEM_SYSTEM
+  };
+
+  /*! Do we allocate after or before the register allocation? */
+  enum SchedulePolicy {
+    PRE_ALLOC = 0, // LIFO scheduling (tends to limit register pressure)
+    POST_ALLOC     // FIFO scheduling (limits latency problems)
+  };
+
+  /*! Helper structure to handle dependencies while scheduling. Takes into
+   *  account virtual and physical registers and memory sub-systems
+   */
+  struct DependencyTracker : public NonCopyable
+  {
+    DependencyTracker(const Selection &selection, SelectionScheduler &scheduler);
+    /*! Reset it before scheduling a new block */
+    void clear(bool fullClear = false);
+    /*! Get an index in the node array for the given register */
+    uint32_t getIndex(GenRegister reg) const;
+    /*! Get an index in the node array for the given memory system */
+    uint32_t getIndex(uint32_t bti) const;
+    /*! Add a new dependency "node0 depends on node1" */
+    void addDependency(ScheduleDAGNode *node0, ScheduleDAGNode *node1, DepMode m);
+    /*! Add a new dependency "node0 depends on node located at index" */
+    void addDependency(ScheduleDAGNode *node0, uint32_t index, DepMode m);
+    /*! Add a new dependency "node located at index depends on node0" */
+    void addDependency(uint32_t index, ScheduleDAGNode *node0, DepMode m);
+    /*! No dependency for null registers and immediates */
+    INLINE bool ignoreDependency(GenRegister reg) const {
+      if (reg.file == GEN_IMMEDIATE_VALUE)
+        return true;
+      else if (reg.file == GEN_ARCHITECTURE_REGISTER_FILE) {
+        if ((reg.nr & 0xf0) == GEN_ARF_NULL)
+          return true;
+      }
+      return false;
+    }
+    /*! Scheduler that owns the tracker */
+    SelectionScheduler &scheduler;
+    /*! Add a new dependency "node0 depends on node set for register reg" */
+    void addDependency(ScheduleDAGNode *node0, GenRegister reg, DepMode m);
+    /*! Add a new dependency "node set for register reg depends on node0" */
+    void addDependency(GenRegister reg, ScheduleDAGNode *node0, DepMode m);
+    /*! Make the node located at insnID a barrier */
+    void makeBarrier(int32_t insnID, int32_t insnNum);
+    /*! Update all the writes (memory, predicates, registers) */
+    void updateWrites(ScheduleDAGNode *node);
+    /*! Maximum number of *physical* flag registers */
+    static const uint32_t MAX_FLAG_REGISTER = 8u;
+    /*! Maximum number of *physical* accumulator registers */
+    static const uint32_t MAX_ACC_REGISTER = 1u;
+    /*! Stores the last node that wrote to a register / memory ... */
+    vector<ScheduleDAGNode*> nodes;
+    /*! Store the nodes each node depends on */
+    map<ScheduleDAGNode *, vector<ScheduleDAGNode*>> deps;
+    /*! Stores the nodes per instruction */
+    vector<ScheduleDAGNode*> insnNodes;
+    /*! Number of virtual registers in the selection */
+    uint32_t grfNum;
+  };
+
+  /*! Perform the instruction scheduling */
+  struct SelectionScheduler : public NonCopyable
+  {
+    /*! Init the book keeping structures */
+    SelectionScheduler(GenContext &ctx, Selection &selection, SchedulePolicy policy);
+    /*! Make all lists empty */
+    void clearLists(void);
+    /*! Return the number of instructions to schedule in the DAG */
+    int32_t buildDAG(SelectionBlock &bb);
+    /*! Traverse a read node and update the read distance of all its children. */
+    void traverseReadNode(ScheduleDAGNode *node, uint32_t degree = 0);
+    /*! Schedule the DAG, pre register allocation and post register allocation. */
+    void preScheduleDAG(SelectionBlock &bb, int32_t insnNum);
+    void postScheduleDAG(SelectionBlock &bb, int32_t insnNum);
+    /*! To limit register pressure or limit insn latency problems */
+    SchedulePolicy policy;
+    /*! Make ScheduleListNode allocation faster */
+    DECL_POOL(ScheduleListNode, listPool);
+    /*! Make ScheduleDAGNode allocation faster */
+    DECL_POOL(ScheduleDAGNode, nodePool);
+    /*! Ready list is instructions that can be scheduled */
+    intrusive_list<ScheduleListNode> ready;
+    /*! Active list is instructions that are executing */
+    intrusive_list<ScheduleListNode> active;
+    /*! Handle to the complete compilation context */
+    GenContext &ctx;
+    /*! Code to schedule */
+    Selection &selection;
+    /*! To help tracking dependencies */
+    DependencyTracker tracker;
+  };
+
+  DependencyTracker::DependencyTracker(const Selection &selection, SelectionScheduler &scheduler) :
+    scheduler(scheduler)
+  {
+    if (scheduler.policy == PRE_ALLOC) {
+      this->grfNum = selection.getRegNum();
+      nodes.resize(grfNum + MAX_FLAG_REGISTER + MAX_ACC_REGISTER + MAX_MEM_SYSTEM);
+    } else {
+      const uint32_t simdWidth = scheduler.ctx.getSimdWidth();
+      GBE_ASSERT(simdWidth == 8 || simdWidth == 16);
+      this->grfNum = simdWidth == 8 ? 128 : 64;
+      nodes.resize(grfNum + MAX_FLAG_REGISTER + MAX_ACC_REGISTER + MAX_MEM_SYSTEM);
+    }
+    insnNodes.resize(selection.getLargestBlockSize());
+  }
+
+  void DependencyTracker::clear(bool fullClear) {
+    for (auto &x : nodes)
+      x = NULL;
+    if (fullClear)
+      deps.clear();
+  }
+
+  void DependencyTracker::addDependency(ScheduleDAGNode *node0, GenRegister reg, DepMode m) {
+    if (this->ignoreDependency(reg) == false) {
+      const uint32_t index = this->getIndex(reg);
+      this->addDependency(node0, index, m);
+      if (scheduler.policy == POST_ALLOC && (reg.isdf() || reg.isint64()))
+        this->addDependency(node0, index + 1, m);
+    }
+  }
+
+  void DependencyTracker::addDependency(GenRegister reg, ScheduleDAGNode *node0, DepMode m) {
+    if (this->ignoreDependency(reg) == false) {
+      const uint32_t index = this->getIndex(reg);
+      this->addDependency(index, node0, m);
+      if (scheduler.policy == POST_ALLOC && (reg.isdf() || reg.isint64()))
+        this->addDependency(index + 1, node0, m);
+    }
+  }
+
+  void DependencyTracker::addDependency(ScheduleDAGNode *node0, ScheduleDAGNode *node1, DepMode depMode) {
+    if (node0 != NULL && node1 != NULL && node0 != node1 && node0->dependsOn(node1) == false) {
+      if (node1->insn.isRead())
+        depMode = depMode == READ_AFTER_WRITE ? READ_AFTER_WRITE_MEMORY : depMode;
+      ScheduleListNode *dep = scheduler.newScheduleListNode(node0, depMode);
+      node0->refNum++;
+      node1->children.push_back(dep);
+      auto it = deps.find(node0);
+      if (it != deps.end()) {
+        it->second.push_back(node1);
+      } else {
+        vector<ScheduleDAGNode*> vn;
+        vn.push_back(node1);
+        deps.insert(std::make_pair(node0, vn));
+      }
+    }
+  }
+
+  void DependencyTracker::addDependency(ScheduleDAGNode *node, uint32_t index, DepMode m) {
+    this->addDependency(node, this->nodes[index], m);
+  }
+
+  void DependencyTracker::addDependency(uint32_t index, ScheduleDAGNode *node, DepMode m) {
+    this->addDependency(this->nodes[index], node, m);
+  }
+
+  void DependencyTracker::makeBarrier(int32_t barrierID, int32_t insnNum) {
+    ScheduleDAGNode *barrier = this->insnNodes[barrierID];
+
+    // The barrier depends on all nodes before it
+    for (int32_t insnID = 0; insnID < barrierID; ++insnID)
+      this->addDependency(barrier, this->insnNodes[insnID], WRITE_AFTER_WRITE);
+
+    // All nodes after barriers depend on the barrier
+    for (int32_t insnID = barrierID + 1; insnID < insnNum; ++insnID)
+      this->addDependency(this->insnNodes[insnID], barrier, WRITE_AFTER_WRITE);
+  }
+
+  static GenRegister getFlag(const SelectionInstruction &insn) {
+    if (insn.state.physicalFlag) {
+      const uint32_t nr = insn.state.flag;
+      const uint32_t subnr = insn.state.subFlag;
+      return GenRegister::flag(nr, subnr);
+    } else
+      return GenRegister::uw1grf(ir::Register(insn.state.flagIndex));
+  }
+
+  uint32_t DependencyTracker::getIndex(GenRegister reg) const {
+    // Non GRF physical register
+    if (reg.physical) {
+      //GBE_ASSERT (reg.file == GEN_ARCHITECTURE_REGISTER_FILE);
+      if(reg.file == GEN_ARCHITECTURE_REGISTER_FILE) {
+        const uint32_t file = reg.nr & 0xf0;
+        const uint32_t nr = reg.nr & 0x0f;
+        if (file == GEN_ARF_FLAG) {
+          const uint32_t subnr = reg.subnr / sizeof(uint16_t);
+          GBE_ASSERT(nr < MAX_FLAG_REGISTER && (subnr == 0 || subnr == 1));
+          return grfNum + 2*nr + subnr;
+        } else if (file == GEN_ARF_ACCUMULATOR) {
+          GBE_ASSERT(nr < MAX_ACC_REGISTER);
+          return grfNum + MAX_FLAG_REGISTER + nr;
+        } else {
+          NOT_SUPPORTED;
+          return 0;
+        }
+      } else {
+        const uint32_t simdWidth = scheduler.ctx.getSimdWidth();
+        return simdWidth == 8 ? reg.nr : reg.nr / 2;
+      }
+    }
+    // We directly manipulate physical GRFs here
+    else if (scheduler.policy == POST_ALLOC) {
+      const GenRegister physical = scheduler.ctx.ra->genReg(reg);
+      const uint32_t simdWidth = scheduler.ctx.getSimdWidth();
+      return simdWidth == 8 ? physical.nr : physical.nr / 2;
+    }
+    // We use virtual registers since allocation is not done yet
+    else
+      return reg.value.reg;
+  }
+
+  uint32_t DependencyTracker::getIndex(uint32_t bti) const {
+    const uint32_t memDelta = grfNum + MAX_FLAG_REGISTER + MAX_ACC_REGISTER;
+    if (bti == 0xfe)
+      return memDelta + LOCAL_MEMORY;
+    if (bti == 0xff)
+      return memDelta + SCRATCH_MEMORY;
+    return memDelta + GLOBAL_MEMORY;
+  }
+
+  void DependencyTracker::updateWrites(ScheduleDAGNode *node) {
+    const SelectionInstruction &insn = node->insn;
+
+    // Track writes in registers
+    for (uint32_t dstID = 0; dstID < insn.dstNum; ++dstID) {
+      const GenRegister dst = insn.dst(dstID);
+      if (this->ignoreDependency(dst) == false) {
+        const uint32_t index = this->getIndex(dst);
+        this->nodes[index] = node;
+        if (scheduler.policy == POST_ALLOC && (dst.isdf() || dst.isint64()))
+          this->nodes[index + 1] = node;
+      }
+    }
+
+    // Track writes in predicates
+    if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP || insn.state.modFlag) {
+      const uint32_t index = this->getIndex(getFlag(insn));
+      this->nodes[index] = node;
+    }
+
+    // Track writes in accumulators
+    if (insn.state.accWrEnable) {
+      const uint32_t index = this->getIndex(GenRegister::acc());
+      this->nodes[index] = node;
+    }
+
+    // Track writes in memory
+    if (insn.isWrite()) {
+      const uint32_t index = this->getIndex(insn.getbti());
+      this->nodes[index] = node;
+    }
+
+    // Track writes in scratch memory
+    if(insn.opcode == SEL_OP_SPILL_REG) {
+      const uint32_t index = this->getIndex(0xff);
+      this->nodes[index] = node;
+    }
+
+    // Consider that barriers, fences and waits write to memory
+    if (insn.opcode == SEL_OP_BARRIER ||
+        insn.opcode == SEL_OP_FENCE ||
+        insn.opcode == SEL_OP_WAIT) {
+      const uint32_t local = this->getIndex(0xfe);
+      const uint32_t global = this->getIndex(0x00);
+      this->nodes[local] = this->nodes[global] = node;
+    }
+  }
+
+  /*! Kind-of roughly estimated latency. Nothing real here */
+  static uint32_t getLatencyGen7(const SelectionInstruction &insn) {
+#define DECL_GEN7_SCHEDULE(FAMILY, LATENCY, SIMD16, SIMD8)\
+    const uint32_t FAMILY##InstructionLatency = LATENCY;
+#include "gen_insn_gen7_schedule_info.hxx"
+#undef DECL_GEN7_SCHEDULE
+
+    switch (insn.opcode) {
+#define DECL_SELECTION_IR(OP, FAMILY) case SEL_OP_##OP: return FAMILY##Latency;
+#include "backend/gen_insn_selection.hxx"
+#undef DECL_SELECTION_IR
+    };
+    return 0;
+  }
+
+  /*! Throughput in cycles for SIMD8 or SIMD16 */
+  static uint32_t getThroughputGen7(const SelectionInstruction &insn, bool isSIMD8) {
+#define DECL_GEN7_SCHEDULE(FAMILY, LATENCY, SIMD16, SIMD8)\
+    const uint32_t FAMILY##InstructionThroughput = isSIMD8 ? SIMD8 : SIMD16;
+#include "gen_insn_gen7_schedule_info.hxx"
+#undef DECL_GEN7_SCHEDULE
+
+    switch (insn.opcode) {
+#define DECL_SELECTION_IR(OP, FAMILY) case SEL_OP_##OP: return FAMILY##Throughput;
+#include "backend/gen_insn_selection.hxx"
+#undef DECL_SELECTION_IR
+    };
+    return 0;
+  }
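+
+  /* How the two X-macro includes above fit together (an illustration only,
+   * with made-up names and numbers rather than actual entries from the .hxx
+   * files): a schedule entry such as
+   *   DECL_GEN7_SCHEDULE(Foo, 20, 4, 2)
+   * expands to
+   *   const uint32_t FooInstructionLatency = 20;                  // in getLatencyGen7
+   *   const uint32_t FooInstructionThroughput = isSIMD8 ? 2 : 4;  // in getThroughputGen7
+   * while a selection IR entry such as
+   *   DECL_SELECTION_IR(BAR, FooInstruction)
+   * expands to
+   *   case SEL_OP_BAR: return FooInstructionLatency;   // resp. ...Throughput
+   * so the FAMILY argument of DECL_SELECTION_IR, concatenated with "Latency"
+   * or "Throughput", must name a constant produced by DECL_GEN7_SCHEDULE.
+   */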
+
+  SelectionScheduler::SelectionScheduler(GenContext &ctx,
+                                         Selection &selection,
+                                         SchedulePolicy policy) :
+    policy(policy), listPool(nextHighestPowerOf2(selection.getLargestBlockSize())),
+    ctx(ctx), selection(selection), tracker(selection, *this)
+  {
+    this->clearLists();
+  }
+
+  void SelectionScheduler::clearLists(void) {
+    this->ready.fast_clear();
+    this->active.fast_clear();
+  }
+
+  void SelectionScheduler::traverseReadNode(ScheduleDAGNode *node, uint32_t degree) {
+    GBE_ASSERT(degree != 0 || node->insn.isRead());
+    if (node->readDistance != 0x7FFFFFFF)
+      return;
+    node->readDistance = degree;
+    if (degree > 5)
+      return;
+    //printf("node id %d op %d degree %d \n", node->insn.ID, node->insn.opcode, degree);
+    auto it = tracker.deps.find(node);
+    if (it != tracker.deps.end()) {
+      for (auto &depNode : it->second) {
+        if (depNode && !depNode->insn.isRead())
+          traverseReadNode(depNode, degree + 1);
+      }
+    }
+  }
+
+  int32_t SelectionScheduler::buildDAG(SelectionBlock &bb) {
+    nodePool.rewind();
+    listPool.rewind();
+    tracker.clear(true);
+    this->clearLists();
+
+    // Track write-after-write and read-after-write dependencies
+    int32_t insnNum = 0;
+    for (auto &insn : bb.insnList) {
+      // Create a new node for this instruction
+      ScheduleDAGNode *node = this->newScheduleDAGNode(insn);
+      tracker.insnNodes[insnNum++] = node;
+
+      // read-after-write in registers
+      for (uint32_t srcID = 0; srcID < insn.srcNum; ++srcID)
+        tracker.addDependency(node, insn.src(srcID), READ_AFTER_WRITE);
+
+      // read-after-write for predicate
+      if (insn.state.predicate != GEN_PREDICATE_NONE)
+        tracker.addDependency(node, getFlag(insn), READ_AFTER_WRITE);
+
+      // read-after-write in memory
+      if (insn.isRead()) {
+        const uint32_t index = tracker.getIndex(insn.getbti());
+        tracker.addDependency(node, index, READ_AFTER_WRITE);
+      }
+      // read-after-write in scratch memory
+      if (insn.opcode == SEL_OP_UNSPILL_REG) {
+        const uint32_t index = tracker.getIndex(0xff);
+        tracker.addDependency(node, index, READ_AFTER_WRITE);
+      }
+
+      // Consider that barriers, fences and waits read memory (local and global)
+      if (insn.opcode == SEL_OP_BARRIER ||
+          insn.opcode == SEL_OP_FENCE ||
+          insn.opcode == SEL_OP_WAIT) {
+        const uint32_t local = tracker.getIndex(0xfe);
+        const uint32_t global = tracker.getIndex(0x00);
+        tracker.addDependency(node, local, READ_AFTER_WRITE);
+        tracker.addDependency(node, global, READ_AFTER_WRITE);
+      }
+
+      // write-after-write in registers
+      for (uint32_t dstID = 0; dstID < insn.dstNum; ++dstID)
+        tracker.addDependency(node, insn.dst(dstID), WRITE_AFTER_WRITE);
+
+      // write-after-write for predicate
+      if (insn.opcode == SEL_OP_CMP || insn.opcode == SEL_OP_I64CMP || insn.state.modFlag)
+        tracker.addDependency(node, getFlag(insn), WRITE_AFTER_WRITE);
+
+      // write-after-write for accumulators
+      if (insn.state.accWrEnable)
+        tracker.addDependency(node, GenRegister::acc(), WRITE_AFTER_WRITE);
+
+      // write-after-write in memory
+      if (insn.isWrite()) {
+        const uint32_t index = tracker.getIndex(insn.getbti());
+        tracker.addDependency(node, index, WRITE_AFTER_WRITE);
+      }
+
+      // write-after-write in scratch memory
+      if (insn.opcode == SEL_OP_SPILL_REG) {
+        const uint32_t index = tracker.getIndex(0xff);
+        tracker.addDependency(node, index, WRITE_AFTER_WRITE);
+      }
+
+      // Track all writes done by the instruction
+      tracker.updateWrites(node);
+    }
+
+    // Track write-after-read dependencies
+    tracker.clear();
+    for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) {
+      ScheduleDAGNode *node = tracker.insnNodes[insnID];
+      const SelectionInstruction &insn = node->insn;
+
+      // write-after-read in registers
+      for (uint32_t srcID = 0; srcID < insn.srcNum; ++srcID)
+        tracker.addDependency(insn.src(srcID), node, WRITE_AFTER_READ);
+
+      // write-after-read for predicate
+      if (insn.state.predicate != GEN_PREDICATE_NONE)
+        tracker.addDependency(getFlag(insn), node, WRITE_AFTER_READ);
+
+      // write-after-read in memory
+      if (insn.isRead()) {
+        const uint32_t index = tracker.getIndex(insn.getbti());
+        tracker.addDependency(index, node, WRITE_AFTER_READ);
+      }
+
+      // write-after-read in scratch memory
+      if (insn.opcode == SEL_OP_UNSPILL_REG) {
+        const uint32_t index = tracker.getIndex(0xff);
+        tracker.addDependency(index, node, WRITE_AFTER_READ);
+      }
+
+      // Consider that barriers, fences and waits read memory (local and global)
+      if (insn.opcode == SEL_OP_BARRIER ||
+          insn.opcode == SEL_OP_FENCE ||
+          insn.opcode == SEL_OP_WAIT) {
+        const uint32_t local = tracker.getIndex(0xfe);
+        const uint32_t global = tracker.getIndex(0x00);
+        tracker.addDependency(local, node, WRITE_AFTER_READ);
+        tracker.addDependency(global, node, WRITE_AFTER_READ);
+      }
+
+      // Track all writes done by the instruction
+      tracker.updateWrites(node);
+    }
+
+    // Update distance to read for each read node.
+    for (int32_t insnID = 0; insnID < insnNum; ++insnID) {
+      ScheduleDAGNode *node = tracker.insnNodes[insnID];
+      const SelectionInstruction &insn = node->insn;
+      if (insn.isRead())
+        traverseReadNode(node);
+    }
+
+    // Make labels and branches non-schedulable (i.e. they act as barriers)
+    for (int32_t insnID = 0; insnID < insnNum; ++insnID) {
+      ScheduleDAGNode *node = tracker.insnNodes[insnID];
+      if (node->insn.isBranch() || node->insn.isLabel()
+          || node->insn.opcode == SEL_OP_EOT || node->insn.opcode == SEL_OP_IF
+          || node->insn.opcode == SEL_OP_BARRIER)
+        tracker.makeBarrier(insnID, insnNum);
+    }
+
+    // Build the initial ready list (should only be the label actually)
+    for (int32_t insnID = 0; insnID < insnNum; ++insnID) {
+      ScheduleDAGNode *node = tracker.insnNodes[insnID];
+      if (node->refNum == 0) {
+        ScheduleListNode *listNode = this->newScheduleListNode(node);
+        this->ready.push_back(listNode);
+      }
+    }
+
+    return insnNum;
+  }
+
+  void SelectionScheduler::preScheduleDAG(SelectionBlock &bb, int32_t insnNum) {
+    printf("Not implemented yet. \n");
+  }
+
+  void SelectionScheduler::postScheduleDAG(SelectionBlock &bb, int32_t insnNum) {
+    uint32_t cycle = 0;
+    const bool isSIMD8 = this->ctx.getSimdWidth() == 8;
+    vector <ScheduleDAGNode *> scheduledNodes;
+    while (insnNum) {
+
+      // Retire all the instructions that finished
+      //printf("cycle = %d \n", cycle);
+      for (auto toRetireIt = active.begin(); toRetireIt != active.end();) {
+        ScheduleDAGNode *toRetireNode = toRetireIt.node()->node;
+        // First, move all write-after-read children to the ready list.
+        if (toRetireNode->preRetired == false) {
+          auto &children = toRetireNode->children;
+          toRetireNode->preRetired = true;
+          //printf("id %d pre retired \n", toRetireNode->insn.ID);
+          for (auto it = children.begin(); it != children.end();) {
+            ScheduleListNode *listNode = it.node();
+            if (listNode->depMode != WRITE_AFTER_READ) {
+              ++it;
+              continue;
+            }
+            if (--it->node->refNum == 0) {
+              //printf("pre push id %d to ready list. \n", listNode->node->insn.ID);
+              it = children.erase(it);
+              this->ready.push_back(listNode);
+            } else
+              ++it;
+          }
+          if (children.size() == 0) {
+            toRetireIt = this->active.erase(toRetireIt);
+            continue;
+          }
+        }
+        // Instruction is now complete
+        if (toRetireNode->retiredCycle <= cycle) {
+          toRetireIt = this->active.erase(toRetireIt);
+          //printf("id %d retired \n", toRetireNode->insn.ID);
+          // Traverse all children and make them ready if no more dependency
+          auto &children = toRetireNode->children;
+          for (auto it = children.begin(); it != children.end();) {
+            ScheduleListNode *listNode = it.node();
+            if (listNode->depMode == WRITE_AFTER_READ) {
+              ++it;
+              continue;
+            }
+            if (--it->node->refNum == 0) {
+              it = children.erase(it);
+              if (listNode->depMode != WRITE_AFTER_READ)
+                this->ready.push_back(listNode);
+              //printf("push id %d to ready list. \n", listNode->node->insn.ID);
+            } else
+              ++it;
+          }
+        } else
+          ++toRetireIt;
+      }
+
+      // Try to schedule something from the ready list
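+      // Heuristic: pick the ready node with the smallest cost. A node released
+      // into the ready list by a write-after-read edge is free (cost 0); nodes
+      // released by write-after-write cost 5 and by read-after-write cost 10,
+      // minus a bonus of 5/readDistance that favors nodes that are (or feed)
+      // memory reads, so that loads are issued as early as possible.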
+      intrusive_list<ScheduleListNode>::iterator toSchedule;
+      toSchedule = this->ready.begin();
+      float minCost = 1000;
+      for(auto it = this->ready.begin(); it != this->ready.end(); ++it) {
+        float cost = (it->depMode == WRITE_AFTER_READ) ?  0 : ((it->depMode == WRITE_AFTER_WRITE) ? 5 : 10)
+                     - 5.0 / (it->node->readDistance == 0 ? 0.1 : it->node->readDistance);
+        if (cost < minCost) {
+          toSchedule = it;
+          minCost = cost;
+        }
+      }
+      if (toSchedule != this->ready.end()) {
+        //printf("get id %d  op %d to schedule \n", toSchedule->node->insn.ID, toSchedule->node->insn.opcode);
+        // The instruction is instantaneously issued to simulate zero cycle
+        // scheduling
+        cycle += getThroughputGen7(toSchedule->node->insn, isSIMD8);
+
+        this->ready.erase(toSchedule);
+        this->active.push_back(toSchedule.node());
+        // When we schedule before allocation, the instruction is instantaneously
+        // ready. This allows us to have a true LIFO strategy
+        toSchedule->node->retiredCycle = cycle + getLatencyGen7(toSchedule->node->insn);
+        bb.append(&toSchedule->node->insn);
+        scheduledNodes.push_back(toSchedule->node);
+        insnNum--;
+      } else
+        cycle++;
+    }
+  }
+
+  BVAR(OCL_POST_ALLOC_INSN_SCHEDULE, true);
+  BVAR(OCL_PRE_ALLOC_INSN_SCHEDULE, false);
+
+  void schedulePostRegAllocation(GenContext &ctx, Selection &selection) {
+    if (OCL_POST_ALLOC_INSN_SCHEDULE) {
+      SelectionScheduler scheduler(ctx, selection, POST_ALLOC);
+      for (auto &bb : *selection.blockList) {
+        const int32_t insnNum = scheduler.buildDAG(bb);
+        bb.insnList.clear();
+        scheduler.postScheduleDAG(bb, insnNum);
+      }
+    }
+  }
+
+  void schedulePreRegAllocation(GenContext &ctx, Selection &selection) {
+    if (OCL_PRE_ALLOC_INSN_SCHEDULE) {
+      SelectionScheduler scheduler(ctx, selection, PRE_ALLOC);
+      // FIXME: implement a proper pre-register-allocation scheduling algorithm.
+      return;
+      for (auto &bb : *selection.blockList) {
+        const int32_t insnNum = scheduler.buildDAG(bb);
+        bb.insnList.clear();
+        scheduler.preScheduleDAG(bb, insnNum);
+      }
+    }
+  }
+
+} /* namespace gbe */
+
diff --git a/backend/src/backend/gen_insn_scheduling.hpp b/backend/src/backend/gen_insn_scheduling.hpp
new file mode 100644
index 0000000..534557d
--- /dev/null
+++ b/backend/src/backend/gen_insn_scheduling.hpp
@@ -0,0 +1,42 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file gen_insn_scheduling.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_GEN_INSN_SCHEDULING_HPP__
+#define __GBE_GEN_INSN_SCHEDULING_HPP__
+
+namespace gbe
+{
+  class Selection;  // Pre ISA code
+  class GenContext; // Handle compilation for Gen
+
+  /*! Schedule the code per basic block (tends to limit register number) */
+  void schedulePreRegAllocation(GenContext &ctx, Selection &selection);
+
+  /*! Schedule the code per basic block (tends to deal with insn latency) */
+  void schedulePostRegAllocation(GenContext &ctx, Selection &selection);
+
+} /* namespace gbe */
+
+#endif /* __GBE_GEN_INSN_SCHEDULING_HPP__ */
+
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
new file mode 100644
index 0000000..96d3965
--- /dev/null
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -0,0 +1,4032 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file gen_insn_selection.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/* This is the instruction selection code. First of all, this is a bunch of c++
+ * crap. Sorry if this is not that readable. Anyway, the goal here is to take
+ * GenIR code (i.e. the very regular, very RISC IR) and to produce GenISA with
+ * virtual registers (i.e. regular GenIR registers).
+ *
+ * Overall idea:
+ * =============
+ *
+ * There are a lot of papers and a lot of research about this, but I tried to
+ * keep it simple. No dynamic programming, nothing like that. Just a recursive
+ * maximal munch.
+ *
+ * Basically, the code is executed per basic block from bottom to top. Patterns
+ * of GenIR instructions are defined and each instruction is matched against the
+ * best pattern i.e. the pattern that catches the largest number of
+ * instructions. Once matched, a sequence of instructions is output.
+ *
+ * Each instruction the match depends on is then marked as "root" i.e. we
+ * indicate that each of these instructions must be generated: we indeed need their
+ * destinations for the next instructions (remember that we generate the code in
+ * reverse order)
+ *
+ * Patterns:
+ * =========
+ *
+ * There are a lot of patterns and, obviously, I did not implement all of them.
+ * I just quickly gathered the complete code to make pattern implementation
+ * reasonably easy. Adding a pattern is pretty verbose, but it should not be
+ * too hard to add new ones.
+ *
+ * To create and register patterns, I just abused C++ pre-main initialization.
+ * A bunch of patterns is then created and sorted per opcode (i.e. the opcode
+ * of the root of the pattern): this creates a library of patterns that may be
+ * used at run time.
+ *
+ * Predication / Masking and CFG linearization
+ * ===========================================
+ *
+ * The current version is based on an unfortunate choice. Basically, the problem
+ * to solve is how to map unstructured branches (i.e. regular gotos) onto Gen.
+ * Gen has a native support for structured branches (if/else/endif/while...) but
+ * nothing really native for unstructured branches.
+ *
+ * The idea we implemented is simple. We stole one flag register (here f0.0) to
+ * mask all the instructions (and only activate the proper SIMD lanes) and we
+ * use the CFG linearization technique to properly handle the control flow. This
+ * is not really good for one particular reason: Gen instructions must use the
+ * *same* flag register for the predicates (used for masking) and the
+ * conditional modifier (used as a destination for CMP). This leads to extra
+ * complications with compare instructions and select instructions. Basically,
+ * we need to insert extra MOVs.
+ *
+ * Also, there is some extra kludge to handle the predicates for JMPI.
+ *
+ * TODO:
+ * =====
+ *
+ * Sadly, I recreated here a new DAG class. This is just a bad idea since we
+ * already have the DAG per basic block with the Function graph i.e. the
+ * complete graph of uses and definitions. I think we should be able to save a
+ * lot of code here if we can simply reuse the code from UD / DU chains.
+ *
+ * Finally, cross-block instruction selection is quite possible with this simple
+ * approach. Basically, instructions from dominating blocks could be merged and
+ * matched with other instructions in the dominated block. This leads to the
+ * interesting approach of traversing the dominator tree in post order.
+ *
+ * We already use if/endif to enclose each basic block. We will continue to
+ * identify those blocks which could be matched to structured branching and use
+ * purely structured instructions to handle them completely.
+ */
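+
+/* A small, illustrative sketch of the matching (a hypothetical example only;
+ * the real patterns appear further down in this file). Consider the GenIR
+ * sequence
+ *
+ *   MUL %tmp, %x, %y
+ *   ADD %dst, %tmp, %z
+ *
+ * Walking the block bottom-up, the ADD is matched first. A pattern that, say,
+ * merges a multiply feeding an addition into a single MAD inspects the DAG
+ * child attached to %tmp: if that child is mergeable (its sources were not
+ * overwritten in between), the pattern emits one SEL_OP_MAD and the MUL is not
+ * marked as root, so no separate MUL is generated. If the child cannot be
+ * merged, it is marked as root (see markAllChildren below) and is selected on
+ * its own later.
+ */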
+
+#include "backend/gen_insn_selection.hpp"
+#include "backend/gen_context.hpp"
+#include "ir/function.hpp"
+#include "ir/liveness.hpp"
+#include "ir/profile.hpp"
+#include "sys/cvar.hpp"
+#include "sys/vector.hpp"
+#include <algorithm>
+#include <climits>
+
+namespace gbe
+{
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Helper functions
+  ///////////////////////////////////////////////////////////////////////////
+
+  uint32_t getGenType(ir::Type type) {
+    using namespace ir;
+    switch (type) {
+      case TYPE_BOOL: return GEN_TYPE_UW;
+      case TYPE_S8: return GEN_TYPE_B;
+      case TYPE_U8: return GEN_TYPE_UB;
+      case TYPE_S16: return GEN_TYPE_W;
+      case TYPE_U16: return GEN_TYPE_UW;
+      case TYPE_S32: return GEN_TYPE_D;
+      case TYPE_U32: return GEN_TYPE_UD;
+      case TYPE_S64: return GEN_TYPE_L;
+      case TYPE_U64: return GEN_TYPE_UL;
+      case TYPE_FLOAT: return GEN_TYPE_F;
+      case TYPE_DOUBLE: return GEN_TYPE_DF;
+      default: NOT_SUPPORTED; return GEN_TYPE_F;
+    }
+  }
+
+  ir::Type getIRType(uint32_t genType) {
+    using namespace ir;
+    switch (genType) {
+      case GEN_TYPE_B: return TYPE_S8;
+      case GEN_TYPE_UB: return TYPE_U8;
+      case GEN_TYPE_W: return TYPE_S16;
+      case GEN_TYPE_UW: return TYPE_U16;
+      case GEN_TYPE_D: return TYPE_S32;
+      case GEN_TYPE_UD: return TYPE_U32;
+      case GEN_TYPE_L: return TYPE_S64;
+      case GEN_TYPE_UL: return TYPE_U64;
+      case GEN_TYPE_F: return TYPE_FLOAT;
+      case GEN_TYPE_DF: return TYPE_DOUBLE;
+      default: NOT_SUPPORTED; return TYPE_FLOAT;
+    }
+  }
+
+  uint32_t getGenCompare(ir::Opcode opcode, bool inverse = false) {
+    using namespace ir;
+    switch (opcode) {
+      case OP_LE: return (!inverse) ? GEN_CONDITIONAL_LE : GEN_CONDITIONAL_G;
+      case OP_LT: return (!inverse) ? GEN_CONDITIONAL_L : GEN_CONDITIONAL_GE;
+      case OP_GE: return (!inverse) ? GEN_CONDITIONAL_GE : GEN_CONDITIONAL_L;
+      case OP_GT: return (!inverse) ? GEN_CONDITIONAL_G : GEN_CONDITIONAL_LE;
+      case OP_EQ: return (!inverse) ? GEN_CONDITIONAL_EQ : GEN_CONDITIONAL_NEQ;
+      case OP_NE: return (!inverse) ? GEN_CONDITIONAL_NEQ : GEN_CONDITIONAL_EQ;
+      default: NOT_SUPPORTED; return 0u;
+    };
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // SelectionInstruction
+  ///////////////////////////////////////////////////////////////////////////
+
+  SelectionInstruction::SelectionInstruction(SelectionOpcode op, uint32_t dst, uint32_t src) :
+    parent(NULL), opcode(op), dstNum(dst), srcNum(src)
+  {
+    extra.function = 0;
+  }
+
+  void SelectionInstruction::prepend(SelectionInstruction &other) {
+    gbe::prepend(&other, this);
+    other.parent = this->parent;
+  }
+
+  void SelectionInstruction::append(SelectionInstruction &other) {
+    gbe::append(&other, this);
+    other.parent = this->parent;
+  }
+
+  bool SelectionInstruction::isRead(void) const {
+    return this->opcode == SEL_OP_UNTYPED_READ ||
+           this->opcode == SEL_OP_READ64       ||
+           this->opcode == SEL_OP_ATOMIC       ||
+           this->opcode == SEL_OP_BYTE_GATHER  ||
+           this->opcode == SEL_OP_SAMPLE ||
+           this->opcode == SEL_OP_DWORD_GATHER;
+  }
+
+  bool SelectionInstruction::isWrite(void) const {
+    return this->opcode == SEL_OP_UNTYPED_WRITE ||
+           this->opcode == SEL_OP_WRITE64       ||
+           this->opcode == SEL_OP_ATOMIC        ||
+           this->opcode == SEL_OP_BYTE_SCATTER  ||
+           this->opcode == SEL_OP_TYPED_WRITE;
+  }
+
+  bool SelectionInstruction::isBranch(void) const {
+    return this->opcode == SEL_OP_JMPI;
+  }
+
+  bool SelectionInstruction::isLabel(void) const {
+    return this->opcode == SEL_OP_LABEL;
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // SelectionVector
+  ///////////////////////////////////////////////////////////////////////////
+
+  SelectionVector::SelectionVector(void) :
+    insn(NULL), reg(NULL), regNum(0), isSrc(0)
+  {}
+
+  ///////////////////////////////////////////////////////////////////////////
+  // SelectionBlock
+  ///////////////////////////////////////////////////////////////////////////
+
+  SelectionBlock::SelectionBlock(const ir::BasicBlock *bb) : bb(bb), isLargeBlock(false), endifLabel( (ir::LabelIndex) 0){}
+
+  void SelectionBlock::append(ir::Register reg) { tmp.push_back(reg); }
+
+  void SelectionBlock::append(SelectionInstruction *insn) {
+    this->insnList.push_back(insn);
+    insn->parent = this;
+  }
+
+  void SelectionBlock::prepend(SelectionInstruction *insn) {
+    this->insnList.push_front(insn);
+    insn->parent = this;
+  }
+
+  void SelectionBlock::append(SelectionVector *vec) {
+    this->vectorList.push_back(vec);
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Maximal munch selection on DAG
+  ///////////////////////////////////////////////////////////////////////////
+
+  /*! All instructions in a block are organized into a DAG */
+  class SelectionDAG
+  {
+  public:
+    INLINE SelectionDAG(const ir::Instruction &insn) :
+      insn(insn), mergeable(0), childNum(insn.getSrcNum()), isRoot(0) {
+      GBE_ASSERT(insn.getSrcNum() < 127);
+      for (uint32_t childID = 0; childID < childNum; ++childID)
+        this->child[childID] = NULL;
+      computeBool = false;
+      isUsed = false;
+    }
+    /*! Mergeable are non-root instructions with valid sources */
+    INLINE void setAsMergeable(uint32_t which) { mergeable|=(1<<which); }
+    /*! Mergeable are non-root instructions with valid sources */
+    INLINE bool isMergeable(uint32_t which) const { return mergeable&(1<<which); }
+    /*! Children that need to be matched */
+    SelectionDAG *child[ir::Instruction::MAX_SRC_NUM];
+    /*! Instruction that needs to be matched */
+    const ir::Instruction &insn;
+    /*! When sources have been overwritten, a child insn cannot be merged */
+    uint32_t mergeable:ir::Instruction::MAX_SRC_NUM;
+    /*! Number of children we have in the pattern */
+    uint32_t childNum:7;
+    /*! A root must be generated, no matter what */
+    uint32_t isRoot:1;
+    /*! A bool register is used as a normal computing source. */
+    bool computeBool;
+    /*! is used in this block */
+    bool isUsed;
+  };
+
+  /*! A pattern is a tree to match. This is the general interface for them. For
+   *  pattern to be matched, we need to match the complete tree i.e. this node
+   *  and its child nodes
+   */
+  class SelectionPattern
+  {
+  public:
+    SelectionPattern(uint32_t insnNum, uint32_t cost) :
+      insnNum(insnNum), cost(cost) {}
+    /*! This is an abstract class */
+    virtual ~SelectionPattern(void) {}
+    /*! Emit Gen code in the selection. Return false if no match */
+    virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const = 0;
+    /*! All the possible opcodes for this pattern (for fast sort) */
+    vector<ir::Opcode> opcodes;
+    /*! Number of instructions generated */
+    uint32_t insnNum;
+    /*! Cost of the pattern */
+    uint32_t cost;
+  };
+
+  /*! Store and sort all the patterns. This is our global library we use for the
+   *  code selection
+   */
+  class SelectionLibrary
+  {
+  public:
+    /*! Will register all the patterns */
+    SelectionLibrary(void);
+    /*! Release and destroy all the registered patterns */
+    ~SelectionLibrary(void);
+    /*! Insert the given pattern for all associated opcodes */
+    template <typename PatternType> void insert(void);
+    /*! One list of patterns per opcode */
+    typedef vector<const SelectionPattern*> PatternList;
+    /*! All lists of patterns properly sorted per opcode */
+    PatternList patterns[ir::OP_INVALID];
+    /*! All patterns to free */
+    vector<const SelectionPattern*> toFree;
+  };
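+
+  /* A hypothetical example (not one of the real patterns, which appear later
+   * in this file) of how SelectionPattern and SelectionLibrary are meant to be
+   * used together:
+   *
+   *   class MyAddPattern : public SelectionPattern {
+   *   public:
+   *     MyAddPattern(void) : SelectionPattern(1, 1) {  // 1 insn generated, cost 1
+   *       this->opcodes.push_back(ir::OP_ADD);         // root opcode(s) we can match
+   *     }
+   *     virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
+   *       // Inspect dag.insn and dag.child[], emit selection instructions
+   *       // through sel, mark any unmerged children as roots and return true.
+   *       markAllChildren(dag);
+   *       return true;
+   *     }
+   *   };
+   *
+   * The library constructor would then register it with this->insert<MyAddPattern>().
+   */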
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Code selection internal implementation
+  ///////////////////////////////////////////////////////////////////////////
+
+  /*! Actual implementation of the instruction selection engine */
+  class Selection::Opaque
+  {
+  public:
+    /*! simdWidth is the default width for the instructions */
+    Opaque(GenContext &ctx);
+    /*! Release everything */
+    virtual ~Opaque(void);
+    /*! Implements the instruction selection itself */
+    void select(void);
+    /*! Start a backward generation (from the end of the block) */
+    void startBackwardGeneration(void);
+    /*! End backward code generation and output the code in the block */
+    void endBackwardGeneration(void);
+    /*! Implement public class */
+    uint32_t getLargestBlockSize(void) const;
+    /*! Implement public class */
+    INLINE uint32_t getVectorNum(void) const { return this->vectorNum; }
+    /*! Implement public class */
+    INLINE ir::Register replaceSrc(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov);
+    /*! Implement public class */
+    INLINE ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov);
+    /*! spill a register (insert spill/unspill instructions) */
+    INLINE bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool);
+    /*! Whether to add the per-thread offset to the local memory address for load/store/atomic operations */
+    bool needPatchSLMAddr() const { return patchSLMAddr; }
+    void setPatchSLMAddr(bool b) { patchSLMAddr = b; }
+    /*! indicate whether a register is a scalar/uniform register. */
+    INLINE bool isScalarReg(const ir::Register &reg) const {
+      const ir::RegisterData &regData = getRegisterData(reg);
+      return regData.isUniform();
+    }
+
+    INLINE GenRegister unpacked_uw(const ir::Register &reg) const {
+      return GenRegister::unpacked_uw(reg, isScalarReg(reg));
+    }
+
+    INLINE GenRegister unpacked_ub(const ir::Register &reg) const {
+      return GenRegister::unpacked_ub(reg, isScalarReg(reg));
+    }
+    /*! Implement public class */
+    INLINE uint32_t getRegNum(void) const { return file.regNum(); }
+    /*! Implements public interface */
+    INLINE ir::RegisterData getRegisterData(ir::Register reg) const {
+      return file.get(reg);
+    }
+    /*! Implement public class */
+    INLINE ir::RegisterFamily getRegisterFamily(ir::Register reg) const {
+      return file.get(reg).family;
+    }
+    /*! Implement public class */
+    SelectionInstruction *create(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
+    /*! Return the selection register from the GenIR one */
+    GenRegister selReg(ir::Register, ir::Type type = ir::TYPE_FLOAT) const;
+    /*! Compute the nth register part when using SIMD8 with Qn (n in 2,3,4) */
+    GenRegister selRegQn(ir::Register, uint32_t quarter, ir::Type type = ir::TYPE_FLOAT) const;
+    /*! Size of the stack (should be large enough) */
+    enum { MAX_STATE_NUM = 16 };
+    /*! Push the current instruction state */
+    INLINE void push(void) {
+      assert(stateNum < MAX_STATE_NUM);
+      stack[stateNum++] = curr;
+    }
+    /*! Pop the latest pushed state */
+    INLINE void pop(void) {
+      assert(stateNum > 0);
+      curr = stack[--stateNum];
+    }
+    /*! Create a new register in the register file and append it in the
+     *  temporary list of the current block
+     */
+    INLINE ir::Register reg(ir::RegisterFamily family, bool scalar = false) {
+      GBE_ASSERT(block != NULL);
+      const ir::Register reg = file.append(family, scalar);
+      block->append(reg);
+      return reg;
+    }
+    /*! Append a block at the block stream tail. It becomes the current block */
+    void appendBlock(const ir::BasicBlock &bb);
+    /*! Append an instruction in the current block */
+    SelectionInstruction *appendInsn(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
+    /*! Append a new vector of registers in the current block */
+    SelectionVector *appendVector(void);
+    /*! Build a DAG for the basic block (return number of instructions) */
+    uint32_t buildBasicBlockDAG(const ir::BasicBlock &bb);
+    /*! Perform the selection on the basic block */
+    void matchBasicBlock(const ir::BasicBlock &bb, uint32_t insnNum);
+    /*! A root instruction needs to be generated */
+    bool isRoot(const ir::Instruction &insn) const;
+
+    /*! To handle selection block allocation */
+    DECL_POOL(SelectionBlock, blockPool);
+    /*! To handle selection instruction allocation */
+    LinearAllocator insnAllocator;
+    /*! To handle selection vector allocation */
+    DECL_POOL(SelectionVector, vecPool);
+    /*! Per register information used with top-down block sweeping */
+    vector<SelectionDAG*> regDAG;
+    /*! Store one DAG per instruction */
+    vector<SelectionDAG*> insnDAG;
+    /*! Owns this structure */
+    GenContext &ctx;
+    /*! Tail of the code fragment for backward code generation */
+    intrusive_list<SelectionInstruction> bwdList;
+    /*! List of emitted blocks */
+    intrusive_list<SelectionBlock> blockList;
+    /*! Currently processed block */
+    SelectionBlock *block;
+    /*! Current instruction state to use */
+    GenInstructionState curr;
+    /*! We append new registers so we duplicate the function register file */
+    ir::RegisterFile file;
+    /*! State used to encode the instructions */
+    GenInstructionState stack[MAX_STATE_NUM];
+    /*! Maximum number of instructions in the basic blocks */
+    uint32_t maxInsnNum;
+    /*! Speed up instruction dag allocation */
+    DECL_POOL(SelectionDAG, dagPool);
+    /*! Total number of registers in the function we encode */
+    uint32_t regNum;
+    /*! Number of states currently pushed */
+    uint32_t stateNum;
+    /*! Number of vectors allocated */
+    uint32_t vectorNum;
+    /*! If true, generate code backward */
+    bool bwdCodeGeneration;
+    /*! To make function prototypes more readable */
+    typedef const GenRegister &Reg;
+
+#define ALU1(OP) \
+  INLINE void OP(Reg dst, Reg src) { ALU1(SEL_OP_##OP, dst, src); }
+#define ALU1WithTemp(OP) \
+  INLINE void OP(Reg dst, Reg src, Reg temp) { ALU1WithTemp(SEL_OP_##OP, dst, src, temp); }
+#define ALU2(OP) \
+  INLINE void OP(Reg dst, Reg src0, Reg src1) { ALU2(SEL_OP_##OP, dst, src0, src1); }
+#define ALU2WithTemp(OP) \
+  INLINE void OP(Reg dst, Reg src0, Reg src1, Reg temp) { ALU2WithTemp(SEL_OP_##OP, dst, src0, src1, temp); }
+#define ALU3(OP) \
+  INLINE void OP(Reg dst, Reg src0, Reg src1, Reg src2) { ALU3(SEL_OP_##OP, dst, src0, src1, src2); }
+#define I64Shift(OP) \
+  INLINE void OP(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) { I64Shift(SEL_OP_##OP, dst, src0, src1, tmp); }
+    ALU1(MOV)
+    ALU1WithTemp(MOV_DF)
+    ALU1WithTemp(LOAD_DF_IMM)
+    ALU1(LOAD_INT64_IMM)
+    ALU1(RNDZ)
+    ALU1(RNDE)
+    ALU1(F16TO32)
+    ALU1(F32TO16)
+    ALU2(SEL)
+    ALU2(SEL_INT64)
+    ALU1(NOT)
+    ALU2(AND)
+    ALU2(OR)
+    ALU2(XOR)
+    ALU2(I64AND)
+    ALU2(I64OR)
+    ALU2(I64XOR)
+    ALU2(SHR)
+    ALU2(SHL)
+    ALU2(RSR)
+    ALU2(RSL)
+    ALU2(ASR)
+    ALU2(ADD)
+    ALU2WithTemp(I64ADD)
+    ALU2WithTemp(I64SUB)
+    ALU2(MUL)
+    ALU1(FRC)
+    ALU1(RNDD)
+    ALU1(RNDU)
+    ALU2(MACH)
+    ALU1(LZD)
+    ALU3(MAD)
+    ALU2WithTemp(MUL_HI)
+    ALU1(FBH)
+    ALU1(FBL)
+    ALU2WithTemp(HADD)
+    ALU2WithTemp(RHADD)
+    ALU2(UPSAMPLE_SHORT)
+    ALU2(UPSAMPLE_INT)
+    ALU2(UPSAMPLE_LONG)
+    ALU1WithTemp(CONVI_TO_I64)
+    ALU1WithTemp(CONVF_TO_I64)
+    ALU1(CONVI64_TO_I)
+    I64Shift(I64SHL)
+    I64Shift(I64SHR)
+    I64Shift(I64ASR)
+#undef ALU1
+#undef ALU1WithTemp
+#undef ALU2
+#undef ALU2WithTemp
+#undef ALU3
+#undef I64Shift
+    /*! Convert 64-bit integer to 32-bit float */
+    void CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[6]);
+    /*! Convert 32-bit float to 64-bit integer */
+    void CONVF_TO_I64(Reg dst, Reg src, GenRegister tmp[2]);
+    /*! Saturated 64bit x*y + z */
+    void I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister tmp[9]);
+    /*! High 64bit of x*y */
+    void I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister tmp[9]);
+    /*! (x+y)>>1 without mod. overflow */
+    void I64HADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]);
+    /*! (x+y+1)>>1 without mod. overflow */
+    void I64RHADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]);
+    /*! Shift a 64-bit integer */
+    void I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[7]);
+    /*! Compare 64-bit integer */
+    void I64CMP(uint32_t conditional, Reg src0, Reg src1, GenRegister tmp[3]);
+    /*! Saturated addition of 64-bit integer */
+    void I64SATADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[5]);
+    /*! Saturated subtraction of 64-bit integer */
+    void I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[5]);
+    /*! Encode a barrier instruction */
+    void BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType);
+    /*! Encode a barrier instruction */
+    void FENCE(GenRegister dst);
+    /*! Encode a label instruction */
+    void LABEL(ir::LabelIndex label);
+    /*! Jump indexed instruction, return the encoded instruction count according to jump distance. */
+    int JMPI(Reg src, ir::LabelIndex target, ir::LabelIndex origin);
+    /*! IF indexed instruction */
+    void IF(Reg src, ir::LabelIndex jip, ir::LabelIndex uip);
+    /*! ENDIF indexed instruction */
+    void ENDIF(Reg src, ir::LabelIndex jip);
+    /*! BRD indexed instruction */
+    void BRD(Reg src, ir::LabelIndex jip);
+    /*! BRC indexed instruction */
+    void BRC(Reg src, ir::LabelIndex jip, ir::LabelIndex uip);
+    /*! Compare instructions */
+    void CMP(uint32_t conditional, Reg src0, Reg src1, Reg dst = GenRegister::null());
+    /*! Select instruction with embedded comparison */
+    void SEL_CMP(uint32_t conditional, Reg dst, Reg src0, Reg src1);
+    /*! Constant buffer move instruction */
+    void INDIRECT_MOVE(Reg dst, Reg src);
+    /*! EOT is used to finish GPGPU threads */
+    void EOT(void);
+    /*! No-op */
+    void NOP(void);
+    /*! Wait instruction (used for the barrier) */
+    void WAIT(void);
+    /*! Atomic instruction */
+    void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg src1, Reg src2, uint32_t bti);
+    /*! Read 64 bits float/int array */
+    void READ64(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
+    /*! Write 64 bits float/int array */
+    void WRITE64(Reg addr, const GenRegister *src, uint32_t srcNum, uint32_t bti);
+    /*! Untyped read (up to 4 elements) */
+    void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, uint32_t bti);
+    /*! Untyped write (up to 4 elements) */
+    void UNTYPED_WRITE(Reg addr, const GenRegister *src, uint32_t elemNum, uint32_t bti);
+    /*! Byte gather (for unaligned bytes, shorts and ints) */
+    void BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti);
+    /*! Byte scatter (for unaligned bytes, shorts and ints) */
+    void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti);
+    /*! DWord gather (for constant cache read) */
+    void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti);
+    /*! Unpack the uint to char4 */
+    void UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemNum);
+    /*! pack the char4 to uint */
+    void PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemNum);
+    /*! Extended math function (2 arguments) */
+    void MATH(Reg dst, uint32_t function, Reg src0, Reg src1);
+    /*! Extended math function (1 argument) */
+    void MATH(Reg dst, uint32_t function, Reg src);
+    /*! Encode unary instructions */
+    void ALU1(SelectionOpcode opcode, Reg dst, Reg src);
+    /*! Encode unary with temp reg instructions */
+    void ALU1WithTemp(SelectionOpcode opcode, Reg dst, Reg src0, Reg temp);
+    /*! Encode binary instructions */
+    void ALU2(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1);
+    /*! Encode binary with temp reg instructions */
+    void ALU2WithTemp(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg temp);
+    /*! Encode ternary instructions */
+    void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2);
+    /*! Encode sample instructions */
+    void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool isLD, bool isUniform);
+    /*! Encode typed write instructions */
+    void TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D);
+    /*! Get image information */
+    void GET_IMAGE_INFO(uint32_t type, GenRegister *dst, uint32_t dst_num, uint32_t bti);
+    /*! Multiply 64-bit integers */
+    void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]);
+    /*! 64-bit integer division */
+    void I64DIV(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]);
+    /*! 64-bit integer remainder of division */
+    void I64REM(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]);
+    /* Common functions for binary instructions, sel_cmp and compare instructions.
+       They handle IMM or normal register assignment, and try to avoid LOADI
+       as much as possible. */
+    void getSrcGenRegImm(SelectionDAG &dag, GenRegister &src0,
+                      GenRegister &src1, ir::Type type, bool &inverse);
+    void getSrcGenRegImm(SelectionDAG &dag,
+                      SelectionDAG *dag0, SelectionDAG *dag1,
+                      GenRegister &src0, GenRegister &src1,
+                      ir::Type type, bool &inverse);
+    /*! Use custom allocators */
+    GBE_CLASS(Opaque);
+    friend class SelectionBlock;
+    friend class SelectionInstruction;
+  private:
+    /*! Auxiliary label for if/endif. */ 
+    uint16_t currAuxLabel;
+    bool patchSLMAddr;
+    INLINE ir::LabelIndex newAuxLabel()
+    {
+      currAuxLabel++;
+      return (ir::LabelIndex)currAuxLabel;
+    }
+
+  };
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Helper function
+  ///////////////////////////////////////////////////////////////////////////
+
+  /*! Directly mark all sources as root (when no match is found) */
+  static void markAllChildren(SelectionDAG &dag) {
+    // Do not merge anything, so all sources become roots
+    for (uint32_t childID = 0; childID < dag.childNum; ++childID)
+      if (dag.child[childID])
+        dag.child[childID]->isRoot = 1;
+  }
+
+  /*! Helper function to figure if two sources are the same */
+  static bool sourceMatch(SelectionDAG *src0DAG, uint32_t src0ID,
+                          SelectionDAG *src1DAG, uint32_t src1ID)
+  {
+    GBE_ASSERT(src0DAG && src1DAG);
+    // Ensure they are the same physical registers
+    const ir::Register src0 = src0DAG->insn.getSrc(src0ID);
+    const ir::Register src1 = src1DAG->insn.getSrc(src1ID);
+    if (src0 != src1)
+      return false;
+    // Ensure they contain the same values
+    return src0DAG->child[src0ID] == src1DAG->child[src1ID];
+  }
+
+  Selection::Opaque::Opaque(GenContext &ctx) :
+    ctx(ctx), block(NULL),
+    curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
+    maxInsnNum(ctx.getFunction().getLargestBlockSize()), dagPool(maxInsnNum),
+    stateNum(0), vectorNum(0), bwdCodeGeneration(false), currAuxLabel(ctx.getFunction().labelNum()), patchSLMAddr(false)
+  {
+    const ir::Function &fn = ctx.getFunction();
+    this->regNum = fn.regNum();
+    this->regDAG.resize(regNum);
+    this->insnDAG.resize(maxInsnNum);
+  }
+
+  Selection::Opaque::~Opaque(void) {
+    for (auto it = blockList.begin(); it != blockList.end();) {
+      SelectionBlock &block = *it;
+      ++it;
+      this->deleteSelectionBlock(&block);
+    }
+  }
+
+  SelectionInstruction*
+  Selection::Opaque::create(SelectionOpcode opcode, uint32_t dstNum, uint32_t srcNum)
+  {
+    const size_t regSize =  (dstNum+srcNum)*sizeof(GenRegister);
+    const size_t size = sizeof(SelectionInstruction) + regSize;
+    void *ptr = insnAllocator.allocate(size);
+    return new (ptr) SelectionInstruction(opcode, dstNum, srcNum);
+  }
+
+  void Selection::Opaque::startBackwardGeneration(void) {
+    this->bwdCodeGeneration = true;
+  }
+
+  void Selection::Opaque::endBackwardGeneration(void) {
+    for (auto it = bwdList.rbegin(); it != bwdList.rend();) {
+      SelectionInstruction &insn = *it;
+      auto toRemoveIt = it--;
+      bwdList.erase(toRemoveIt);
+      this->block->prepend(&insn);
+    }
+
+    this->bwdCodeGeneration = false;
+  }
+
+  uint32_t Selection::Opaque::getLargestBlockSize(void) const {
+    size_t maxInsnNum = 0;
+    for (const auto &bb : blockList)
+      maxInsnNum = std::max(maxInsnNum, bb.insnList.size());
+    return uint32_t(maxInsnNum);
+  }
+
+  void Selection::Opaque::appendBlock(const ir::BasicBlock &bb) {
+    this->block = this->newSelectionBlock(&bb);
+    this->blockList.push_back(this->block);
+  }
+
+  SelectionInstruction *Selection::Opaque::appendInsn(SelectionOpcode opcode,
+                                                      uint32_t dstNum,
+                                                      uint32_t srcNum)
+  {
+    GBE_ASSERT(dstNum <= SelectionInstruction::MAX_DST_NUM && srcNum <= SelectionInstruction::MAX_SRC_NUM);
+    GBE_ASSERT(this->block != NULL);
+    SelectionInstruction *insn = this->create(opcode, dstNum, srcNum);
+    if (this->bwdCodeGeneration)
+      this->bwdList.push_back(insn);
+    else
+      this->block->append(insn);
+    insn->state = this->curr;
+    return insn;
+  }
+
+  SelectionVector *Selection::Opaque::appendVector(void) {
+    GBE_ASSERT(this->block != NULL);
+    SelectionVector *vector = this->newSelectionVector();
+
+    if (this->bwdCodeGeneration)
+      vector->insn = this->bwdList.back();
+    else
+      vector->insn = this->block->insnList.back();
+    this->block->append(vector);
+    this->vectorNum++;
+    return vector;
+  }
+
+  bool Selection::Opaque::spillRegs(const SpilledRegs &spilledRegs,
+                                    uint32_t registerPool) {
+    GBE_ASSERT(registerPool != 0);
+
+    for (auto &block : blockList)
+      for (auto &insn : block.insnList) {
+        // spill / unspill instructions should be skipped when doing spilling
+        if(insn.opcode == SEL_OP_SPILL_REG
+           || insn.opcode == SEL_OP_UNSPILL_REG)
+          continue;
+        const int simdWidth = insn.state.execWidth;
+
+        const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
+        struct RegSlot {
+          RegSlot(ir::Register _reg, uint8_t _srcID,
+                   uint8_t _poolOffset, bool _isTmp, uint32_t _addr)
+                 : reg(_reg), srcID(_srcID), poolOffset(_poolOffset), isTmpReg(_isTmp), addr(_addr)
+          {};
+          ir::Register reg;
+          union {
+            uint8_t srcID;
+            uint8_t dstID;
+          };
+          uint8_t poolOffset;
+          bool isTmpReg;
+          int32_t addr;
+        };
+        uint8_t poolOffset = 1; // keep one for scratch message header
+        vector <struct RegSlot> regSet;
+        for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+          const GenRegister selReg = insn.src(srcID);
+          const ir::Register reg = selReg.reg();
+          auto it = spilledRegs.find(reg);
+          if(it != spilledRegs.end()
+             && selReg.file == GEN_GENERAL_REGISTER_FILE
+             && selReg.physical == 0) {
+            ir::RegisterFamily family = getRegisterFamily(reg);
+            if(family == ir::FAMILY_QWORD && poolOffset == 1) {
+              poolOffset += simdWidth / 8; // qword register fill could not share the scratch read message payload register
+            }
+            struct RegSlot regSlot(reg, srcID, poolOffset,
+                                   it->second.isTmpReg,
+                                   it->second.addr);
+            if(family == ir::FAMILY_QWORD) {
+              poolOffset += 2 * simdWidth / 8;
+            } else {
+              poolOffset += simdWidth / 8;
+            }
+            regSet.push_back(regSlot);
+          }
+        }
+
+        if (poolOffset > ctx.reservedSpillRegs) {
+          if (GBE_DEBUG)
+            std::cerr << "Instruction (#" << (uint32_t)insn.opcode
+                      << ") src too large pooloffset "
+                      << (uint32_t)poolOffset << std::endl;
+          return false;
+        }
+        // FIXME: to support post-register-allocation scheduling,
+        // put all the reserved registers into the spill/unspill's destination registers.
+        // This is not the best way. We need to refine the spill/unspill instructions to
+        // only use the passed-in registers and not access hard-coded offsets in the future.
+        while(!regSet.empty()) {
+          struct RegSlot regSlot = regSet.back();
+          regSet.pop_back();
+          const GenRegister selReg = insn.src(regSlot.srcID);
+          if (!regSlot.isTmpReg) {
+            /* For temporary registers, we don't need to unspill. */
+            SelectionInstruction *unspill = this->create(SEL_OP_UNSPILL_REG,
+                                            1 + (ctx.reservedSpillRegs * 8) / ctx.getSimdWidth(), 0);
+            unspill->state = GenInstructionState(simdWidth);
+            unspill->state.noMask = 1;
+            unspill->dst(0) = GenRegister(GEN_GENERAL_REGISTER_FILE,
+                                          registerPool + regSlot.poolOffset, 0,
+                                          selReg.type, selReg.vstride,
+                                          selReg.width, selReg.hstride);
+            for(uint32_t i = 1; i < 1 + (ctx.reservedSpillRegs * 8) / ctx.getSimdWidth(); i++)
+              unspill->dst(i) = ctx.getSimdWidth() == 8 ?
+                                GenRegister::vec8(GEN_GENERAL_REGISTER_FILE, registerPool + (i - 1), 0 ) :
+                                GenRegister::vec16(GEN_GENERAL_REGISTER_FILE, registerPool + (i - 1) * 2, 0);
+            unspill->extra.scratchOffset = regSlot.addr + selReg.quarter * 4 * simdWidth;
+            unspill->extra.scratchMsgHeader = registerPool;
+            insn.prepend(*unspill);
+          }
+
+          GenRegister src = insn.src(regSlot.srcID);
+          // change nr/subnr, keep other register settings
+          src.nr = registerPool + regSlot.poolOffset; src.subnr = 0; src.physical = 1;
+          insn.src(regSlot.srcID) = src;
+        };
+
+        /*
+          To save one register, registerPool + 1 is used both by src0 as a
+          source and by the other operands as payload. To avoid side effects,
+          we use a stack model: we push all operand registers and spill the
+          0th destination last. Since all spills are appended to the current
+          instruction, the last spill instruction ends up being the first
+          instruction after the current one. Thus registerPool + 1 still
+          contains valid data.
+         */
+        for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+          const GenRegister selReg = insn.dst(dstID);
+          const ir::Register reg = selReg.reg();
+          auto it = spilledRegs.find(reg);
+          if(it != spilledRegs.end()
+             && selReg.file == GEN_GENERAL_REGISTER_FILE
+             && selReg.physical == 0) {
+            ir::RegisterFamily family = getRegisterFamily(reg);
+            if(family == ir::FAMILY_QWORD && poolOffset == 1) {
+              poolOffset += simdWidth / 8; // a qword register spill cannot share the scratch write message payload register
+            }
+            struct RegSlot regSlot(reg, dstID, poolOffset,
+                                   it->second.isTmpReg,
+                                   it->second.addr);
+            if (family == ir::FAMILY_QWORD) poolOffset += 2 * simdWidth / 8;
+            else poolOffset += simdWidth / 8;
+            regSet.push_back(regSlot);
+          }
+        }
+
+        if (poolOffset > ctx.reservedSpillRegs) {
+          if (GBE_DEBUG)
+            std::cerr << "Instruction (#" << (uint32_t)insn.opcode
+                      << ") dst too large pooloffset "
+                      << (uint32_t)poolOffset << std::endl;
+          return false;
+        }
+        while(!regSet.empty()) {
+          struct RegSlot regSlot = regSet.back();
+          regSet.pop_back();
+          const GenRegister selReg = insn.dst(regSlot.dstID);
+          if(!regSlot.isTmpReg) {
+            /* For temporary registers, we don't need to unspill. */
+            SelectionInstruction *spill = this->create(SEL_OP_SPILL_REG,
+                                          (ctx.reservedSpillRegs * 8) / ctx.getSimdWidth() , 1);
+            spill->state  = insn.state;//GenInstructionState(simdWidth);
+            spill->state.accWrEnable = 0;
+            spill->state.saturate = 0;
+            if (insn.opcode == SEL_OP_SEL)
+              spill->state.predicate = GEN_PREDICATE_NONE;
+            spill->src(0) = GenRegister(GEN_GENERAL_REGISTER_FILE,
+                                        registerPool + regSlot.poolOffset, 0,
+                                        selReg.type, selReg.vstride,
+                                        selReg.width, selReg.hstride);
+            spill->extra.scratchOffset = regSlot.addr + selReg.quarter * 4 * simdWidth;
+            spill->extra.scratchMsgHeader = registerPool;
+            for(uint32_t i = 0; i < 0 + (ctx.reservedSpillRegs * 8) / ctx.getSimdWidth(); i++)
+              spill->dst(i) = ctx.getSimdWidth() == 8 ?
+                                GenRegister::vec8(GEN_GENERAL_REGISTER_FILE, registerPool + (i), 0 ) :
+                                GenRegister::vec16(GEN_GENERAL_REGISTER_FILE, registerPool + (i) * 2, 0);
+            insn.append(*spill);
+          }
+
+          GenRegister dst = insn.dst(regSlot.dstID);
+          // change nr/subnr, keep other register settings
+          dst.physical = 1; dst.nr = registerPool + regSlot.poolOffset; dst.subnr = 0;
+          insn.dst(regSlot.dstID) = dst;
+        }
+      }
+    return true;
+  }
+
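+  /*! Replace source regID of insn with a new temporary of the given type; when needMov is set, prepend a MOV from the old source into that temporary */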
+  ir::Register Selection::Opaque::replaceSrc(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov) {
+    SelectionBlock *block = insn->parent;
+    const uint32_t simdWidth = insn->state.execWidth;
+    ir::Register tmp;
+    GenRegister gr;
+
+    // This will append the temporary register in the instruction block
+    this->block = block;
+    tmp = this->reg(ir::getFamily(type), simdWidth == 1);
+    gr =  this->selReg(tmp, type);
+    if (needMov) {
+      // Generate the MOV instruction and replace the register in the instruction
+      SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
+      mov->src(0) = GenRegister::retype(insn->src(regID), gr.type);
+      mov->state = GenInstructionState(simdWidth);
+      if (this->isScalarReg(insn->src(regID).reg()))
+        mov->state.noMask = 1;
+      mov->dst(0) = gr;
+      insn->prepend(*mov);
+    }
+    insn->src(regID) = gr;
+
+    return tmp;
+  }
+
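+  /*! Replace destination regID of insn with a new temporary of the given type; when needMov is set, append a MOV copying the temporary back to the old destination */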
+  ir::Register Selection::Opaque::replaceDst(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov) {
+    SelectionBlock *block = insn->parent;
+    uint32_t simdWidth;
+    if (!GenRegister::isNull(insn->dst(regID)))
+      simdWidth = this->isScalarReg(insn->dst(regID).reg()) ? 1 : insn->state.execWidth;
+    else {
+      GBE_ASSERT(needMov == false);
+      simdWidth = insn->state.execWidth;
+    }
+    ir::Register tmp;
+    GenRegister gr;
+    this->block = block;
+    tmp = this->reg(ir::getFamily(type));
+    gr = this->selReg(tmp, type);
+    if (needMov) {
+      // Generate the MOV instruction and replace the register in the instruction
+      SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
+      mov->dst(0) = GenRegister::retype(insn->dst(regID), gr.type);
+      mov->state = GenInstructionState(simdWidth);
+      if (simdWidth == 1) {
+        mov->state.noMask = 1;
+        mov->src(0) = GenRegister::retype(GenRegister::vec1(GEN_GENERAL_REGISTER_FILE, gr.reg()), gr.type);
+      } else
+        mov->src(0) = gr;
+      insn->append(*mov);
+    }
+    insn->dst(regID) = gr;
+    return tmp;
+  }
+
+#define SEL_REG(SIMD16, SIMD8, SIMD1) \
+  if (ctx.sel->isScalarReg(reg) == true) \
+    return GenRegister::retype(GenRegister::SIMD1(reg), genType); \
+  else if (simdWidth == 8) \
+    return GenRegister::retype(GenRegister::SIMD8(reg), genType); \
+  else { \
+    GBE_ASSERT (simdWidth == 16); \
+    return GenRegister::retype(GenRegister::SIMD16(reg), genType); \
+  }
+
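+  /*! Get the GenRegister for an IR register, choosing the scalar, SIMD8 or SIMD16 layout from the register family and the current SIMD width */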
+  GenRegister Selection::Opaque::selReg(ir::Register reg, ir::Type type) const {
+    using namespace ir;
+    const uint32_t genType = getGenType(type);
+    const uint32_t simdWidth = ctx.getSimdWidth();
+    const RegisterData data = file.get(reg);
+    const RegisterFamily family = data.family;
+    switch (family) {
+      case FAMILY_BOOL: SEL_REG(uw16grf, uw8grf, uw1grf); break;
+      case FAMILY_WORD: SEL_REG(uw16grf, uw8grf, uw1grf); break;
+      case FAMILY_BYTE: SEL_REG(ub16grf, ub8grf, ub1grf); break;
+      case FAMILY_DWORD: SEL_REG(f16grf, f8grf, f1grf); break;
+      case FAMILY_QWORD: SEL_REG(df16grf, df8grf, df1grf); break;
+      default: NOT_SUPPORTED;
+    }
+    GBE_ASSERT(false);
+    return GenRegister();
+  }
+
+#undef SEL_REG
+
+  GenRegister Selection::Opaque::selRegQn(ir::Register reg, uint32_t q, ir::Type type) const {
+    GenRegister sreg = this->selReg(reg, type);
+    sreg.quarter = q;
+    return sreg;
+  }
+
+  /*! Syntactic sugar for method declaration */
+  typedef const GenRegister &Reg;
+
+  void Selection::Opaque::LABEL(ir::LabelIndex index) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_LABEL, 0, 0);
+    insn->index = uint16_t(index);
+  }
+
+  void Selection::Opaque::BARRIER(GenRegister src, GenRegister fence, uint32_t barrierType) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_BARRIER, 1, 1);
+    insn->src(0) = src;
+    insn->dst(0) = fence;
+    insn->extra.barrierType = barrierType;
+  }
+
+  void Selection::Opaque::FENCE(GenRegister dst) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_FENCE, 1, 0);
+    insn->dst(0) = dst;
+  }
+
+  int Selection::Opaque::JMPI(Reg src, ir::LabelIndex index, ir::LabelIndex origin) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_JMPI, 0, 1);
+    insn->src(0) = src;
+    insn->index = uint16_t(index);
+    insn->extra.longjmp = abs(index - origin) > 800;
+    return insn->extra.longjmp ? 2 : 1;
+  }
+
+  void Selection::Opaque::BRD(Reg src, ir::LabelIndex jip) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_BRD, 0, 1);
+    insn->src(0) = src;
+    insn->index = uint16_t(jip);
+  }
+
+  void Selection::Opaque::BRC(Reg src, ir::LabelIndex jip, ir::LabelIndex uip) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_BRC, 0, 1);
+    insn->src(0) = src;
+    insn->index = uint16_t(jip);
+    insn->index1 = uint16_t(uip);
+  }
+
+  void Selection::Opaque::IF(Reg src, ir::LabelIndex jip, ir::LabelIndex uip) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_IF, 0, 1);
+    insn->src(0) = src;
+    insn->index = uint16_t(jip);
+    insn->index1 = uint16_t(uip);
+  }
+
+  void Selection::Opaque::ENDIF(Reg src, ir::LabelIndex jip) {
+    this->block->endifLabel = this->newAuxLabel();
+    this->LABEL(this->block->endifLabel);
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_ENDIF, 0, 1);
+    insn->src(0) = src;
+    insn->index = uint16_t(this->block->endifLabel);
+  }
+
+  void Selection::Opaque::CMP(uint32_t conditional, Reg src0, Reg src1, Reg dst) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_CMP, 1, 2);
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    insn->dst(0) = dst;
+    insn->extra.function = conditional;
+  }
+
+  void Selection::Opaque::SEL_CMP(uint32_t conditional, Reg dst, Reg src0, Reg src1) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_SEL_CMP, 1, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    insn->extra.function = conditional;
+  }
+  void Selection::Opaque::INDIRECT_MOVE(Reg dst, Reg src) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_INDIRECT_MOVE, 1, 1);
+    insn->dst(0) = dst;
+    insn->src(0) = src;
+  }
+
+  void Selection::Opaque::ATOMIC(Reg dst, uint32_t function,
+                                     uint32_t srcNum, Reg src0,
+                                     Reg src1, Reg src2, uint32_t bti) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_ATOMIC, 1, srcNum);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    if(srcNum > 1) insn->src(1) = src1;
+    if(srcNum > 2) insn->src(2) = src2;
+    insn->extra.function = function;
+    insn->setbti(bti);
+    SelectionVector *vector = this->appendVector();
+
+    vector->regNum = srcNum;
+    vector->reg = &insn->src(0);
+    vector->isSrc = 1;
+  }
+
+  void Selection::Opaque::EOT(void) { this->appendInsn(SEL_OP_EOT, 0, 0); }
+  void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); }
+  void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, 0, 0); }
+
+  void Selection::Opaque::READ64(Reg addr,
+                                 const GenRegister *dst,
+                                 uint32_t elemNum,
+                                 uint32_t bti)
+  {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum, 1);
+    SelectionVector *srcVector = this->appendVector();
+    SelectionVector *dstVector = this->appendVector();
+
+    // Regular instruction to encode
+    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+      insn->dst(elemID) = dst[elemID];
+    insn->src(0) = addr;
+    insn->setbti(bti);
+    insn->extra.elem = elemNum;
+
+    dstVector->regNum = elemNum;
+    dstVector->isSrc = 0;
+    dstVector->reg = &insn->dst(0);
+
+    srcVector->regNum = 1;
+    srcVector->isSrc = 1;
+    srcVector->reg = &insn->src(0);
+  }
+
+  void Selection::Opaque::UNTYPED_READ(Reg addr,
+                                       const GenRegister *dst,
+                                       uint32_t elemNum,
+                                       uint32_t bti)
+  {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_READ, elemNum, 1);
+    SelectionVector *srcVector = this->appendVector();
+    SelectionVector *dstVector = this->appendVector();
+    if (this->isScalarReg(dst[0].reg()))
+      insn->state.noMask = 1;
+    // Regular instruction to encode
+    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+      insn->dst(elemID) = dst[elemID];
+    insn->src(0) = addr;
+    insn->setbti(bti);
+    insn->extra.elem = elemNum;
+
+    // Sends require contiguous allocation
+    dstVector->regNum = elemNum;
+    dstVector->isSrc = 0;
+    dstVector->reg = &insn->dst(0);
+
+    srcVector->regNum = 1;
+    srcVector->isSrc = 1;
+    srcVector->reg = &insn->src(0);
+  }
+
+  void Selection::Opaque::WRITE64(Reg addr,
+                                  const GenRegister *src,
+                                  uint32_t srcNum,
+                                  uint32_t bti)
+  {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, 0, srcNum + 1);
+    SelectionVector *vector = this->appendVector();
+
+    // Regular instruction to encode
+    insn->src(0) = addr;
+    for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
+      insn->src(elemID + 1) = src[elemID];
+
+    insn->setbti(bti);
+    insn->extra.elem = srcNum;
+
+    vector->regNum = srcNum + 1;
+    vector->reg = &insn->src(0);
+    vector->isSrc = 1;
+  }
+
+  void Selection::Opaque::UNTYPED_WRITE(Reg addr,
+                                        const GenRegister *src,
+                                        uint32_t elemNum,
+                                        uint32_t bti)
+  {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNTYPED_WRITE, 0, elemNum+1);
+    SelectionVector *vector = this->appendVector();
+
+    // Regular instruction to encode
+    insn->src(0) = addr;
+    for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+      insn->src(elemID+1) = src[elemID];
+    insn->setbti(bti);
+    insn->extra.elem = elemNum;
+
+    // Sends require contiguous allocation for the sources
+    vector->regNum = elemNum+1;
+    vector->reg = &insn->src(0);
+    vector->isSrc = 1;
+  }
+
+  void Selection::Opaque::BYTE_GATHER(Reg dst, Reg addr, uint32_t elemSize, uint32_t bti) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_GATHER, 1, 1);
+    SelectionVector *srcVector = this->appendVector();
+    SelectionVector *dstVector = this->appendVector();
+
+    if (this->isScalarReg(dst.reg()))
+      insn->state.noMask = 1;
+    // Instruction to encode
+    insn->src(0) = addr;
+    insn->dst(0) = dst;
+    insn->setbti(bti);
+    insn->extra.elem = elemSize;
+
+    // Byte gather requires a vector in the sense that scalars are not
+    // allowed (yet)
+    dstVector->regNum = 1;
+    dstVector->isSrc = 0;
+    dstVector->reg = &insn->dst(0);
+    srcVector->regNum = 1;
+    srcVector->isSrc = 1;
+    srcVector->reg = &insn->src(0);
+  }
+
+  void Selection::Opaque::BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_BYTE_SCATTER, 0, 2);
+    SelectionVector *vector = this->appendVector();
+
+    // Instruction to encode
+    insn->src(0) = addr;
+    insn->src(1) = src;
+    insn->setbti(bti);
+    insn->extra.elem = elemSize;
+
+    // value and address are contiguous in the send
+    vector->regNum = 2;
+    vector->isSrc = 1;
+    vector->reg = &insn->src(0);
+  }
+
+  void Selection::Opaque::DWORD_GATHER(Reg dst, Reg addr, uint32_t bti) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_DWORD_GATHER, 1, 1);
+    SelectionVector *vector = this->appendVector();
+    SelectionVector *srcVector = this->appendVector();
+
+    if (this->isScalarReg(dst.reg()))
+      insn->state.noMask = 1;
+    insn->src(0) = addr;
+    insn->dst(0) = dst;
+    insn->setbti(bti);
+    vector->regNum = 1;
+    vector->isSrc = 0;
+    vector->reg = &insn->dst(0);
+    srcVector->regNum = 1;
+    srcVector->isSrc = 1;
+    srcVector->reg = &insn->src(0);
+  }
+
+  void Selection::Opaque::UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemNum) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_UNPACK_BYTE, elemNum, 1);
+    insn->src(0) = src;
+    for(uint32_t i = 0; i < elemNum; i++)
+      insn->dst(i) = dst[i];
+  }
+  void Selection::Opaque::PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemNum) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_PACK_BYTE, 1, elemNum);
+    for(uint32_t i = 0; i < elemNum; i++)
+      insn->src(i) = src[i];
+    insn->dst(0) = dst;
+  }
+
+  void Selection::Opaque::MATH(Reg dst, uint32_t function, Reg src0, Reg src1) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_MATH, 1, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    insn->extra.function = function;
+  }
+
+  void Selection::Opaque::MATH(Reg dst, uint32_t function, Reg src) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_MATH, 1, 1);
+    insn->dst(0) = dst;
+    insn->src(0) = src;
+    insn->extra.function = function;
+  }
+
+  void Selection::Opaque::I64MUL(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64MUL, 7, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i = 0; i < 6; i++)
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::I64DIV(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64DIV, 14, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i = 0; i < 13; i++)
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::I64REM(Reg dst, Reg src0, Reg src1, GenRegister tmp[13]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64REM, 14, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i = 0; i < 13; i++)
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::ALU1(SelectionOpcode opcode, Reg dst, Reg src) {
+    SelectionInstruction *insn = this->appendInsn(opcode, 1, 1);
+    insn->dst(0) = dst;
+    insn->src(0) = src;
+  }
+
+  void Selection::Opaque::ALU1WithTemp(SelectionOpcode opcode, Reg dst, Reg src, Reg temp) {
+    SelectionInstruction *insn = this->appendInsn(opcode, 2, 1);
+    insn->dst(0) = dst;
+    insn->src(0) = src;
+    insn->dst(1) = temp;
+  }
+
+  void Selection::Opaque::ALU2(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1) {
+    SelectionInstruction *insn = this->appendInsn(opcode, 1, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+  }
+
+  void Selection::Opaque::ALU2WithTemp(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg temp) {
+    SelectionInstruction *insn = this->appendInsn(opcode, 2, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    insn->dst(1) = temp;
+  }
+
+  void Selection::Opaque::ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2) {
+    SelectionInstruction *insn = this->appendInsn(opcode, 1, 3);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    insn->src(2) = src2;
+  }
+
+  void Selection::Opaque::I64CMP(uint32_t conditional, Reg src0, Reg src1, GenRegister tmp[3]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64CMP, 3, 2);
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i=0; i<3; i++)
+      insn->dst(i) = tmp[i];
+    insn->extra.function = conditional;
+  }
+
+  void Selection::Opaque::I64SATADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[5]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64SATADD, 6, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i=0; i<5; i++)
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[5]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64SATSUB, 6, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i=0; i<5; i++)
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::CONVI64_TO_F(Reg dst, Reg src, GenRegister tmp[6]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_CONVI64_TO_F, 7, 1);
+    insn->dst(0) = dst;
+    insn->src(0) = src;
+    for(int i = 0; i < 6; i ++)
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::CONVF_TO_I64(Reg dst, Reg src, GenRegister tmp[2]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_CONVF_TO_I64, 3, 1);
+    insn->dst(0) = dst;
+    insn->src(0) = src;
+    for(int i = 0; i < 2; i ++)
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::I64MADSAT(Reg dst, Reg src0, Reg src1, Reg src2, GenRegister tmp[9]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64MADSAT, 10, 3);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    insn->src(2) = src2;
+    for(int i = 0; i < 9; i ++)
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::I64_MUL_HI(Reg dst, Reg src0, Reg src1, GenRegister tmp[9]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64_MUL_HI, 10, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i = 0; i < 9; i ++)
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::I64HADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64HADD, 5, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i = 0; i < 4; i ++)
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::I64RHADD(Reg dst, Reg src0, Reg src1, GenRegister tmp[4]) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_I64RHADD, 5, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i = 0; i < 4; i ++)
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  void Selection::Opaque::I64Shift(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) {
+    SelectionInstruction *insn = this->appendInsn(opcode, 7, 2);
+    insn->dst(0) = dst;
+    insn->src(0) = src0;
+    insn->src(1) = src1;
+    for(int i = 0; i < 6; i ++)
+      insn->dst(i + 1) = tmp[i];
+  }
+
+  // Boilerplate to initialize the selection library at C++ pre-main time
+  static SelectionLibrary *selLib = NULL;
+  static void destroySelectionLibrary(void) { GBE_DELETE(selLib); }
+  static struct SelectionLibraryInitializer {
+    SelectionLibraryInitializer(void) {
+      selLib = GBE_NEW_NO_ARG(SelectionLibrary);
+      atexit(destroySelectionLibrary);
+    }
+  } selectionLibraryInitializer;
+
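+  /*! An instruction is a root if it must be generated on its own: several destinations, side effects, a branch or label, or a destination that is live out of its basic block */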
+  bool Selection::Opaque::isRoot(const ir::Instruction &insn) const {
+    if (insn.getDstNum() > 1 ||
+        insn.hasSideEffect() ||
+        insn.isMemberOf<ir::BranchInstruction>() ||
+        insn.isMemberOf<ir::LabelInstruction>())
+      return true;
+
+    // No side effect, not a branch and no destination? Impossible
+    GBE_ASSERT(insn.getDstNum() == 1);
+
+    // Root if alive outside the block.
+    // XXX we should use Value and not registers in liveness info
+    const ir::BasicBlock *insnBlock = insn.getParent();
+    const ir::Liveness &liveness = this->ctx.getLiveness();
+    const ir::Liveness::LiveOut &liveOut = liveness.getLiveOut(insnBlock);
+    const ir::Register reg = insn.getDst(0);
+    if (liveOut.contains(reg))
+      return true;
+
+    // The instruction is only used in the current basic block
+    return false;
+  }
+
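+  /*! Build the selection DAG of a basic block: one node per instruction, each source linked to its producer when that producer is still valid. Returns the instruction count */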
+  uint32_t Selection::Opaque::buildBasicBlockDAG(const ir::BasicBlock &bb)
+  {
+    using namespace ir;
+
+    // Clear all registers
+    for (uint32_t regID = 0; regID < this->regNum; ++regID)
+      this->regDAG[regID] = NULL;
+
+    this->block->hasBarrier = false;
+    this->block->hasBranch = bb.getLastInstruction()->getOpcode() == OP_BRA ||
+                             bb.getLastInstruction()->getOpcode() == OP_RET;
+    if (!this->block->hasBranch)
+      this->block->endifOffset = -1;
+
+    // Build the DAG on the fly
+    uint32_t insnNum = 0;
+    const_cast<BasicBlock&>(bb).foreach([&](const Instruction &insn) {
+      if (insn.getOpcode() == OP_SYNC)
+        this->block->hasBarrier = true;
+
+      // Build a selectionDAG node for instruction
+      SelectionDAG *dag = this->newSelectionDAG(insn);
+
+      // Point to non-root children
+      const uint32_t srcNum = insn.getSrcNum();
+      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+        const ir::Register reg = insn.getSrc(srcID);
+        SelectionDAG *child = this->regDAG[reg];
+        if (child) {
+          const ir::Instruction &childInsn = child->insn;
+          const uint32_t childSrcNum = childInsn.getSrcNum();
+
+          // We can merge a child only if its sources are still valid
+          bool mergeable = true;
+          for (uint32_t otherID = 0; otherID < childSrcNum; ++otherID) {
+            const SelectionDAG *srcDAG = child->child[otherID];
+            const ir::Register srcReg = childInsn.getSrc(otherID);
+            SelectionDAG *currDAG = this->regDAG[srcReg];
+            if (srcDAG != currDAG) {
+              mergeable = false;
+              break;
+            }
+          }
+          if (mergeable) dag->setAsMergeable(srcID);
+          dag->child[srcID] = child;
+          // Check whether this bool is used as a normal source
+          // operand by instructions other than BRA/SEL.
+          if (getRegisterFamily(reg) == FAMILY_BOOL) {
+            if (insn.getOpcode() != OP_BRA &&
+                 (insn.getOpcode() != OP_SEL ||
+                   (insn.getOpcode() == OP_SEL && srcID != 0)))
+              child->computeBool = true;
+          }
+          child->isUsed = true;
+        } else
+          dag->child[srcID] = NULL;
+      }
+
+      // Make it a root if we must
+      if (this->isRoot(insn)) dag->isRoot = 1;
+
+      // Save the DAG <-> instruction mapping
+      this->insnDAG[insnNum++] = dag;
+
+      // Associate all output registers to this instruction
+      const uint32_t dstNum = insn.getDstNum();
+      for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+        const ir::Register reg = insn.getDst(dstID);
+        this->regDAG[reg] = dag;
+      }
+    });
+
+    return insnNum;
+  }
+
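+  /*! Walk the DAG bottom-up and, for each root, emit code by trying the registered patterns from best to worst */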
+  void Selection::Opaque::matchBasicBlock(const ir::BasicBlock &bb, uint32_t insnNum)
+  {
+    // Bottom up code generation
+    bool needEndif = this->block->hasBranch == false && !this->block->hasBarrier;
+
+    if(needEndif) {
+      const ir::BasicBlock *next = bb.getNextBlock();
+      this->ENDIF(GenRegister::immd(0), next->getLabelIndex());
+    }
+
+    for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) {
+      // Process all possible patterns for this instruction
+      SelectionDAG &dag = *insnDAG[insnID];
+      if (dag.isRoot) {
+        const ir::Instruction &insn = dag.insn;
+        const ir::Opcode opcode = insn.getOpcode();
+        auto it = selLib->patterns[opcode].begin();
+        const auto end = selLib->patterns[opcode].end();
+
+        // Start a new code fragment
+        this->startBackwardGeneration();
+        // If there is no branch at the end of this block.
+
+        // Try all the patterns from best to worst
+        do {
+          if ((*it)->emit(*this, dag))
+            break;
+          ++it;
+        } while (it != end);
+        GBE_ASSERT(it != end);
+        // If we are in if/endif fix mode, and this block is
+        // large enough, we need to insert endif/if pair to eliminate
+        // the too long if/endif block.
+        if (this->ctx.getIFENDIFFix() &&
+            this->block->insnList.size() != 0 &&
+            this->block->insnList.size() % 1000 == 0 &&
+            (uint16_t)this->block->endifLabel != 0) {
+          ir::LabelIndex jip = this->block->endifLabel;
+          this->ENDIF(GenRegister::immd(0), jip);
+          this->push();
+            this->curr.predicate = GEN_PREDICATE_NORMAL;
+            this->IF(GenRegister::immd(0), jip, jip);
+          this->pop();
+          this->block->isLargeBlock = true;
+        }
+
+        // Output the code in the current basic block
+        this->endBackwardGeneration();
+      }
+    }
+  }
+
+  void Selection::Opaque::select(void)
+  {
+    using namespace ir;
+    const Function &fn = ctx.getFunction();
+
+    // Perform the selection per basic block
+    fn.foreachBlock([&](const BasicBlock &bb) {
+      this->dagPool.rewind();
+      this->appendBlock(bb);
+      const uint32_t insnNum = this->buildBasicBlockDAG(bb);
+      this->matchBasicBlock(bb, insnNum);
+    });
+  }
+
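+  /*! Emit a sample message: dstNum destination registers and msgNum contiguous message payload registers, tagged with the binding table index and the sampler index */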
+  void Selection::Opaque::SAMPLE(GenRegister *dst, uint32_t dstNum,
+                                 GenRegister *msgPayloads, uint32_t msgNum,
+                                 uint32_t bti, uint32_t sampler, bool isLD, bool isUniform) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_SAMPLE, dstNum, msgNum);
+    SelectionVector *dstVector = this->appendVector();
+    SelectionVector *msgVector = this->appendVector();
+
+    // Regular instruction to encode
+    for (uint32_t elemID = 0; elemID < dstNum; ++elemID)
+      insn->dst(elemID) = dst[elemID];
+    for (uint32_t elemID = 0; elemID < msgNum; ++elemID)
+      insn->src(elemID) = msgPayloads[elemID];
+
+    // Sends require contiguous allocation
+    dstVector->regNum = dstNum;
+    dstVector->isSrc = 0;
+    dstVector->reg = &insn->dst(0);
+
+    // Only the messages require contiguous registers.
+    msgVector->regNum = msgNum;
+    msgVector->isSrc = 1;
+    msgVector->reg = &insn->src(0);
+
+    insn->setbti(bti);
+    insn->extra.sampler = sampler;
+    insn->extra.rdmsglen = msgNum;
+    insn->extra.isLD = isLD;
+    insn->extra.isUniform = isUniform;
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Code selection public implementation
+  ///////////////////////////////////////////////////////////////////////////
+
+  Selection::Selection(GenContext &ctx) {
+    this->blockList = NULL;
+    this->opaque = GBE_NEW(Selection::Opaque, ctx);
+  }
+
+  Selection75::Selection75(GenContext &ctx) : Selection(ctx) {
+    this->opaque->setPatchSLMAddr(true);
+  }
+
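+  /*! Emit a typed write message: msgNum contiguous payload registers sent to the given binding table index; is3D marks a 3D image write */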
+  void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum,
+                                      uint32_t bti, bool is3D) {
+    uint32_t elemID = 0;
+    uint32_t i;
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_TYPED_WRITE, 0, msgNum);
+    SelectionVector *msgVector = this->appendVector();
+
+    for( i = 0; i < msgNum; ++i, ++elemID)
+      insn->src(elemID) = msgs[i];
+
+    insn->setbti(bti);
+    insn->extra.msglen = msgNum;
+    insn->extra.is3DWrite = is3D;
+    // Sends require contiguous allocation
+    msgVector->regNum = msgNum;
+    msgVector->isSrc = 1;
+    msgVector->reg = &insn->src(0);
+  }
+
+  Selection::~Selection(void) { GBE_DELETE(this->opaque); }
+
+  void Selection::select(void) {
+    this->opaque->select();
+    this->blockList = &this->opaque->blockList;
+  }
+
+  uint32_t Selection::getLargestBlockSize(void) const {
+    return this->opaque->getLargestBlockSize();
+  }
+
+  uint32_t Selection::getVectorNum(void) const {
+    return this->opaque->getVectorNum();
+  }
+
+  uint32_t Selection::getRegNum(void) const {
+    return this->opaque->getRegNum();
+  }
+
+  ir::RegisterFamily Selection::getRegisterFamily(ir::Register reg) const {
+    return this->opaque->getRegisterFamily(reg);
+  }
+
+  ir::RegisterData Selection::getRegisterData(ir::Register reg) const {
+    return this->opaque->getRegisterData(reg);
+  }
+
+  ir::Register Selection::replaceSrc(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov) {
+    return this->opaque->replaceSrc(insn, regID, type, needMov);
+  }
+
+  ir::Register Selection::replaceDst(SelectionInstruction *insn, uint32_t regID, ir::Type type, bool needMov) {
+    return this->opaque->replaceDst(insn, regID, type, needMov);
+  }
+  bool Selection::spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool) {
+    return this->opaque->spillRegs(spilledRegs, registerPool);
+  }
+
+  bool Selection::isScalarReg(const ir::Register &reg) const {
+    return this->opaque->isScalarReg(reg);
+  }
+
+  SelectionInstruction *Selection::create(SelectionOpcode opcode, uint32_t dstNum, uint32_t srcNum) {
+    return this->opaque->create(opcode, dstNum, srcNum);
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Implementation of all patterns
+  ///////////////////////////////////////////////////////////////////////////
+
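+  /*! Check whether a LOADI result can be encoded directly as an immediate operand (doubles and 64 bit integers cannot) */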
+  bool canGetRegisterFromImmediate(const ir::Instruction &insn) {
+    using namespace ir;
+    const auto &childInsn = cast<LoadImmInstruction>(insn);
+    const auto &imm = childInsn.getImmediate();
+    if(imm.getType() != TYPE_DOUBLE && imm.getType() != TYPE_S64 && imm.getType() != TYPE_U64)
+      return true;
+    return false;
+  }
+
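+  /*! Build an immediate GenRegister of the requested type from an IR immediate, optionally negating the value */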
+  GenRegister getRegisterFromImmediate(ir::Immediate imm, ir::Type type, bool negate = false)
+  {
+    using namespace ir;
+    int sign = negate ? -1 : 1;
+    switch (type) {
+      case TYPE_U32:   return GenRegister::immud(imm.getIntegerValue() * sign);
+      case TYPE_S32:   return GenRegister::immd(imm.getIntegerValue() * sign);
+      case TYPE_FLOAT: return GenRegister::immf(imm.getFloatValue() * sign);
+      case TYPE_U16: return GenRegister::immuw(imm.getIntegerValue() * sign);
+      case TYPE_S16: return  GenRegister::immw((int16_t)imm.getIntegerValue() * sign);
+      case TYPE_U8:  return GenRegister::immuw(imm.getIntegerValue() * sign);
+      case TYPE_S8:  return GenRegister::immw((int8_t)imm.getIntegerValue() * sign);
+      case TYPE_DOUBLE: return GenRegister::immdf(imm.getDoubleValue() * sign);
+      case TYPE_BOOL: return GenRegister::immuw(-imm.getIntegerValue());  //return 0xffff when true
+      default: NOT_SUPPORTED; return GenRegister::immuw(0);
+    }
+  }
+
+  BVAR(OCL_OPTIMIZE_IMMEDIATE, true);
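+  /*! Get the two sources of a binary, compare or select node, folding a LOADI child into an immediate operand when possible; inverse is set when the operands had to be swapped for a compare or select */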
+  void Selection::Opaque::getSrcGenRegImm(SelectionDAG &dag,
+                                          SelectionDAG *dag0, SelectionDAG *dag1,
+                                          GenRegister &src0, GenRegister &src1,
+                                          ir::Type type, bool &inverse) {
+    using namespace ir;
+    inverse = false;
+    // Right source can always be an immediate
+    const int src0Index = dag.insn.isMemberOf<SelectInstruction>() ? SelectInstruction::src0Index : 0;
+    const int src1Index = dag.insn.isMemberOf<SelectInstruction>() ? SelectInstruction::src1Index : 1;
+    if (OCL_OPTIMIZE_IMMEDIATE && dag1 != NULL && dag1->insn.getOpcode() == OP_LOADI &&
+        canGetRegisterFromImmediate(dag1->insn)) {
+      const auto &childInsn = cast<LoadImmInstruction>(dag1->insn);
+      src0 = this->selReg(dag.insn.getSrc(src0Index), type);
+      src1 = getRegisterFromImmediate(childInsn.getImmediate(), type);
+      if (dag0) dag0->isRoot = 1;
+    }
+    // Left source cannot be immediate but it is OK if we can commute
+    else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL && dag.insn.isMemberOf<BinaryInstruction>() &&
+             ((cast<BinaryInstruction>(dag.insn)).commutes() || dag.insn.getOpcode() == OP_SUB) &&
+             dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn)) {
+      const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
+      src0 = dag.insn.getOpcode() != OP_SUB ?
+             this->selReg(dag.insn.getSrc(src1Index), type) :
+             GenRegister::negate(this->selReg(dag.insn.getSrc(src1Index), type));
+      Immediate imm = childInsn.getImmediate();
+      src1 = getRegisterFromImmediate(imm, type, dag.insn.getOpcode() == OP_SUB);
+      if (dag1) dag1->isRoot = 1;
+    }
+    // For a compare instruction we could, in theory, simply invert the condition code to
+    // swap the two operands, but we cannot do that for floats because of NaNs.
+    // For a normal select instruction, we can always invert the predication to swap the two
+    // operands' positions.
+    else if (OCL_OPTIMIZE_IMMEDIATE && dag0 != NULL &&
+             dag0->insn.getOpcode() == OP_LOADI && canGetRegisterFromImmediate(dag0->insn) &&
+             ((dag.insn.isMemberOf<CompareInstruction>() && type != TYPE_FLOAT && type != TYPE_DOUBLE) ||
+              (dag.insn.isMemberOf<SelectInstruction>()))) {
+      const auto &childInsn = cast<LoadImmInstruction>(dag0->insn);
+      src0 = this->selReg(dag.insn.getSrc(src1Index), type);
+      src1 = getRegisterFromImmediate(childInsn.getImmediate(), type);
+      inverse = true;
+      if (dag1) dag1->isRoot = 1;
+    }
+    // Just grab the two sources
+    else {
+      src0 = this->selReg(dag.insn.getSrc(src0Index), type);
+      src1 = this->selReg(dag.insn.getSrc(src1Index), type);
+      markAllChildren(dag);
+    }
+  }
+
+  void Selection::Opaque::getSrcGenRegImm(SelectionDAG &dag, GenRegister &src0,
+                                       GenRegister &src1, ir::Type type,
+                                       bool &inverse) {
+    SelectionDAG *dag0 = dag.child[0];
+    SelectionDAG *dag1 = dag.child[1];
+    getSrcGenRegImm(dag, dag0, dag1, src0, src1, type, inverse);
+  }
+
+
+  /*! Template for the one-to-many instruction patterns */
+  template <typename T, typename U>
+  class OneToManyPattern : public SelectionPattern
+  {
+  public:
+    /*! Register the pattern for all opcodes of the family */
+    OneToManyPattern(uint32_t insnNum, uint32_t cost) :
+      SelectionPattern(insnNum, cost)
+    {
+      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+        if (ir::isOpcodeFrom<U>(ir::Opcode(op)) == true)
+          this->opcodes.push_back(ir::Opcode(op));
+    }
+    /*! Call the child method with the proper prototype */
+    virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
+      bool markChildren = true;
+      if (static_cast<const T*>(this)->emitOne(sel, ir::cast<U>(dag.insn), markChildren)) {
+        if (markChildren)
+          markAllChildren(dag);
+        return true;
+      }
+      return false;
+    }
+  };
+
+/*! Declare a naive one-to-many pattern */
+#define DECL_PATTERN(FAMILY) \
+  struct FAMILY##Pattern : public OneToManyPattern<FAMILY##Pattern, ir::FAMILY>
+
+#define DECL_CTOR(FAMILY, INSN_NUM, COST) \
+  FAMILY##Pattern(void) : OneToManyPattern<FAMILY##Pattern, ir::FAMILY>(INSN_NUM, COST) {}
+
+  /*! Unary instruction patterns */
+  DECL_PATTERN(UnaryInstruction)
+  {
+    static ir::Type getType(const ir::Opcode opcode, const ir::Type insnType) {
+      if (insnType == ir::TYPE_S64 || insnType == ir::TYPE_U64 || insnType == ir::TYPE_S8 || insnType == ir::TYPE_U8)
+        return insnType;
+      if (opcode == ir::OP_FBH || opcode == ir::OP_FBL)
+        return ir::TYPE_U32;
+      if (insnType == ir::TYPE_S16 || insnType == ir::TYPE_U16)
+        return insnType;
+      if (insnType == ir::TYPE_BOOL)
+        return ir::TYPE_U16;
+      return ir::TYPE_FLOAT;
+    }
+
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::UnaryInstruction &insn, bool &markChildren) const {
+      const ir::Opcode opcode = insn.getOpcode();
+      const ir::Type insnType = insn.getType();
+      const GenRegister dst = sel.selReg(insn.getDst(0), getType(opcode, insnType));
+      const GenRegister src = sel.selReg(insn.getSrc(0), getType(opcode, insnType));
+      sel.push();
+        if (sel.isScalarReg(insn.getDst(0)) == true) {
+          sel.curr.execWidth = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+        }
+        switch (opcode) {
+          case ir::OP_ABS:
+            if (insn.getType() == ir::TYPE_S32) {
+              const GenRegister src_ = GenRegister::retype(src, GEN_TYPE_D);
+              const GenRegister dst_ = GenRegister::retype(dst, GEN_TYPE_D);
+              sel.MOV(dst_, GenRegister::abs(src_));
+            } else {
+              GBE_ASSERT(insn.getType() == ir::TYPE_FLOAT);
+              sel.MOV(dst, GenRegister::abs(src));
+            }
+            break;
+          case ir::OP_MOV:
+            if (dst.isdf()) {
+              ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
+              sel.MOV_DF(dst, src, sel.selReg(r));
+            } else {
+              sel.push();
+                auto dag = sel.regDAG[insn.getDst(0)];
+                if (sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL &&
+                    dag->isUsed) {
+                sel.curr.physicalFlag = 0;
+                sel.curr.flagIndex = (uint16_t)(insn.getDst(0));
+                sel.curr.modFlag = 1;
+              }
+              sel.MOV(dst, src);
+              sel.pop();
+            }
+            break;
+          case ir::OP_RNDD: sel.RNDD(dst, src); break;
+          case ir::OP_RNDE: sel.RNDE(dst, src); break;
+          case ir::OP_RNDU: sel.RNDU(dst, src); break;
+          case ir::OP_RNDZ: sel.RNDZ(dst, src); break;
+          case ir::OP_FBH: sel.FBH(dst, src); break;
+          case ir::OP_FBL: sel.FBL(dst, src); break;
+          case ir::OP_COS: sel.MATH(dst, GEN_MATH_FUNCTION_COS, src); break;
+          case ir::OP_SIN: sel.MATH(dst, GEN_MATH_FUNCTION_SIN, src); break;
+          case ir::OP_LOG: sel.MATH(dst, GEN_MATH_FUNCTION_LOG, src); break;
+          case ir::OP_EXP: sel.MATH(dst, GEN_MATH_FUNCTION_EXP, src); break;
+          case ir::OP_SQR: sel.MATH(dst, GEN_MATH_FUNCTION_SQRT, src); break;
+          case ir::OP_RSQ: sel.MATH(dst, GEN_MATH_FUNCTION_RSQ, src); break;
+          case ir::OP_RCP: sel.MATH(dst, GEN_MATH_FUNCTION_INV, src); break;
+          case ir::OP_SIMD_ANY:
+            {
+              const GenRegister constZero = GenRegister::immuw(0);
+              const GenRegister regOne = GenRegister::uw1grf(ir::ocl::one);
+              const GenRegister flag01 = GenRegister::flag(0, 1);
+
+              sel.push();
+                int simdWidth = sel.curr.execWidth;
+                sel.curr.predicate = GEN_PREDICATE_NONE;
+                sel.curr.execWidth = 1;
+                sel.curr.noMask = 1;
+                sel.MOV(flag01, constZero);
+                sel.curr.execWidth = simdWidth;
+                sel.curr.noMask = 0;
+
+                sel.curr.flag = 0;
+                sel.curr.subFlag = 1;
+                sel.CMP(GEN_CONDITIONAL_NEQ, src, constZero);
+
+                if (sel.curr.execWidth == 16)
+                  sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+                else if (sel.curr.execWidth == 8)
+                  sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+                else
+                  NOT_IMPLEMENTED;
+                sel.SEL(dst, regOne, constZero);
+              sel.pop();
+            }
+            break;
+          case ir::OP_SIMD_ALL:
+            {
+              const GenRegister constZero = GenRegister::immuw(0);
+              const GenRegister regOne = GenRegister::uw1grf(ir::ocl::one);
+              const GenRegister flag01 = GenRegister::flag(0, 1);
+
+              sel.push();
+                int simdWidth = sel.curr.execWidth;
+                sel.curr.predicate = GEN_PREDICATE_NONE;
+                sel.curr.execWidth = 1;
+                sel.curr.noMask = 1;
+                sel.MOV(flag01, regOne);
+
+                sel.curr.execWidth = simdWidth;
+                sel.curr.noMask = 0;
+
+                sel.curr.flag = 0;
+                sel.curr.subFlag = 1;
+                sel.CMP(GEN_CONDITIONAL_NEQ, src, constZero);
+
+                if (sel.curr.execWidth == 16)
+                  sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+                else if (sel.curr.execWidth == 8)
+                  sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+                else
+                  NOT_IMPLEMENTED;
+                sel.SEL(dst, regOne, constZero);
+              sel.pop();
+            }
+            break;
+
+          default: NOT_SUPPORTED;
+        }
+      sel.pop();
+      return true;
+    }
+    DECL_CTOR(UnaryInstruction, 1, 1)
+  };
+
+
+  /*! Binary regular instruction pattern */
+  class BinaryInstructionPattern : public SelectionPattern
+  {
+  public:
+    BinaryInstructionPattern(void) : SelectionPattern(1,1) {
+      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+        if (ir::isOpcodeFrom<ir::BinaryInstruction>(ir::Opcode(op)) == true)
+          this->opcodes.push_back(ir::Opcode(op));
+    }
+
+    bool emitDivRemInst(Selection::Opaque &sel, SelectionDAG &dag, ir::Opcode op) const
+    {
+      using namespace ir;
+      const ir::BinaryInstruction &insn = cast<BinaryInstruction>(dag.insn);
+      const Type type = insn.getType();
+      GenRegister dst  = sel.selReg(insn.getDst(0), type);
+      GenRegister src0 = sel.selReg(insn.getSrc(0), type);
+      GenRegister src1 = sel.selReg(insn.getSrc(1), type);
+      const uint32_t simdWidth = sel.curr.execWidth;
+      const RegisterFamily family = getFamily(type);
+      uint32_t function = (op == OP_DIV)?
+                          GEN_MATH_FUNCTION_INT_DIV_QUOTIENT :
+                          GEN_MATH_FUNCTION_INT_DIV_REMAINDER;
+
+      // Bytes and shorts must be converted to int for DIV and REM per GEN restriction
+      if((family == FAMILY_WORD || family == FAMILY_BYTE)) {
+        GenRegister tmp0, tmp1;
+        ir::Register reg = sel.reg(FAMILY_DWORD, simdWidth == 1);
+
+        tmp0 = GenRegister::udxgrf(simdWidth, reg);
+        tmp0 = GenRegister::retype(tmp0, GEN_TYPE_D);
+        sel.MOV(tmp0, src0);
+
+        tmp1 = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+        tmp1 = GenRegister::retype(tmp1, GEN_TYPE_D);
+        sel.MOV(tmp1, src1);
+
+        sel.MATH(tmp0, function, tmp0, tmp1);
+        GenRegister unpacked;
+        if(family == FAMILY_WORD) {
+          unpacked = sel.unpacked_uw(reg);
+        } else {
+          unpacked = sel.unpacked_ub(reg);
+        }
+        unpacked = GenRegister::retype(unpacked, getGenType(type));
+        sel.MOV(dst, unpacked);
+      } else if (type == TYPE_S32 || type == TYPE_U32 ) {
+        sel.MATH(dst, function, src0, src1);
+      } else if(type == TYPE_FLOAT) {
+        GBE_ASSERT(op != OP_REM);
+        sel.MATH(dst, GEN_MATH_FUNCTION_FDIV, src0, src1);
+      } else if (type == TYPE_S64 || type == TYPE_U64) {
+        GenRegister tmp[13];
+        for(int i=0; i < 13; i++) {
+          tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+          tmp[i].type = GEN_TYPE_UD;
+        }
+        sel.push();
+          sel.curr.flag = 0;
+          sel.curr.subFlag = 1;
+          if(op == OP_DIV)
+            sel.I64DIV(dst, src0, src1, tmp);
+          else
+            sel.I64REM(dst, src0, src1, tmp);
+        sel.pop();
+      }
+      markAllChildren(dag);
+      return true;
+    }
+
+    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
+    {
+      using namespace ir;
+      const ir::BinaryInstruction &insn = cast<BinaryInstruction>(dag.insn);
+      const Opcode opcode = insn.getOpcode();
+      const Type type = insn.getType();
+      GenRegister dst  = sel.selReg(insn.getDst(0), type);
+
+      sel.push();
+
+      // Boolean values use scalars
+      if (sel.isScalarReg(insn.getDst(0)) == true) {
+        sel.curr.execWidth = 1;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+      }
+
+      if(opcode == OP_DIV || opcode == OP_REM) {
+        bool ret = this->emitDivRemInst(sel, dag, opcode);
+        sel.pop();
+        return ret;
+      }
+      // Immediates are not supported here, so grab both sources as registers
+      if (opcode == OP_POW) {
+        GenRegister src0 = sel.selReg(insn.getSrc(0), type);
+        GenRegister src1 = sel.selReg(insn.getSrc(1), type);
+
+        if(type == TYPE_FLOAT) {
+          sel.MATH(dst, GEN_MATH_FUNCTION_POW, src0, src1);
+        } else {
+          NOT_IMPLEMENTED;
+        }
+        markAllChildren(dag);
+        sel.pop();
+        return true;
+      }
+
+      // Look for immediate values
+      GenRegister src0, src1;
+      bool inverse = false;
+      sel.getSrcGenRegImm(dag, src0, src1, type, inverse);
+      // Output the binary instruction
+      if (sel.getRegisterFamily(insn.getDst(0)) == ir::FAMILY_BOOL &&
+          dag.isUsed) {
+        GBE_ASSERT(insn.getOpcode() == OP_AND ||
+                   insn.getOpcode() == OP_OR ||
+                   insn.getOpcode() == OP_XOR);
+        sel.curr.physicalFlag = 0;
+        sel.curr.flagIndex = (uint16_t)(insn.getDst(0));
+        sel.curr.modFlag = 1;
+      }
+
+      switch (opcode) {
+        case OP_ADD:
+          if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+            GenRegister t = sel.selReg(sel.reg(RegisterFamily::FAMILY_QWORD), Type::TYPE_S64);
+            sel.I64ADD(dst, src0, src1, t);
+          } else
+            sel.ADD(dst, src0, src1);
+          break;
+        case OP_ADDSAT:
+          if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+            GenRegister tmp[5];
+            for(int i=0; i<5; i++) {
+              tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+              tmp[i].type = GEN_TYPE_UD;
+            }
+            sel.push();
+              sel.curr.flag = 0;
+              sel.curr.subFlag = 1;
+              sel.I64SATADD(dst, src0, src1, tmp);
+            sel.pop();
+            break;
+          }
+          sel.push();
+            sel.curr.saturate = GEN_MATH_SATURATE_SATURATE;
+            sel.ADD(dst, src0, src1);
+          sel.pop();
+          break;
+        case OP_XOR:
+          if (type == Type::TYPE_U64 || type == Type::TYPE_S64)
+            sel.I64XOR(dst, src0, src1);
+          else
+            sel.XOR(dst, src0, src1);
+          break;
+        case OP_OR:
+          if (type == Type::TYPE_U64 || type == Type::TYPE_S64)
+            sel.I64OR(dst, src0, src1);
+          else
+            sel.OR(dst, src0, src1);
+          break;
+        case OP_AND:
+          if (type == Type::TYPE_U64 || type == Type::TYPE_S64)
+            sel.I64AND(dst, src0, src1);
+          else
+            sel.AND(dst, src0, src1);
+          break;
+        case OP_SUB:
+          if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+            GenRegister t = sel.selReg(sel.reg(RegisterFamily::FAMILY_QWORD), Type::TYPE_S64);
+            sel.I64SUB(dst, src0, src1, t);
+          } else
+            sel.ADD(dst, src0, GenRegister::negate(src1));
+          break;
+        case OP_SUBSAT:
+          if (type == Type::TYPE_U64 || type == Type::TYPE_S64) {
+            GenRegister tmp[5];
+            for(int i=0; i<5; i++) {
+              tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+              tmp[i].type = GEN_TYPE_UD;
+            }
+            sel.push();
+              sel.curr.flag = 0;
+              sel.curr.subFlag = 1;
+              sel.I64SATSUB(dst, src0, src1, tmp);
+            sel.pop();
+            break;
+          }
+          sel.push();
+            sel.curr.saturate = GEN_MATH_SATURATE_SATURATE;
+            sel.ADD(dst, src0, GenRegister::negate(src1));
+          sel.pop();
+          break;
+        case OP_SHL:
+          if (type == TYPE_S64 || type == TYPE_U64) {
+            GenRegister tmp[6];
+            for(int i = 0; i < 6; i ++)
+              tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+            sel.push();
+              sel.curr.flag = 0;
+              sel.curr.subFlag = 1;
+              sel.I64SHL(dst, src0, src1, tmp);
+            sel.pop();
+          } else
+            sel.SHL(dst, src0, src1);
+          break;
+        case OP_SHR:
+          if (type == TYPE_S64 || type == TYPE_U64) {
+            GenRegister tmp[6];
+            for(int i = 0; i < 6; i ++)
+              tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+            sel.push();
+              sel.curr.flag = 0;
+              sel.curr.subFlag = 1;
+              sel.I64SHR(dst, src0, src1, tmp);
+            sel.pop();
+          } else
+            sel.SHR(dst, src0, src1);
+          break;
+        case OP_ASR:
+          if (type == TYPE_S64 || type == TYPE_U64) {
+            GenRegister tmp[6];
+            for(int i = 0; i < 6; i ++)
+              tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+            sel.push();
+              sel.curr.flag = 0;
+              sel.curr.subFlag = 1;
+              sel.I64ASR(dst, src0, src1, tmp);
+            sel.pop();
+          } else
+            sel.ASR(dst, src0, src1);
+          break;
+        case OP_MUL_HI: {
+            GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
+            sel.MUL_HI(dst, src0, src1, temp);
+            break;
+          }
+        case OP_I64_MUL_HI:
+         {
+          GenRegister temp[9];
+          for(int i=0; i<9; i++) {
+            temp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+            temp[i].type = GEN_TYPE_UD;
+          }
+          sel.push();
+            sel.curr.flag = 0;
+            sel.curr.subFlag = 1;
+            sel.I64_MUL_HI(dst, src0, src1, temp);
+          sel.pop();
+          break;
+         }
+        case OP_MUL:
+          if (type == TYPE_U32 || type == TYPE_S32) {
+            sel.pop();
+            return false;
+          } else if (type == TYPE_S64 || type == TYPE_U64) {
+            GenRegister tmp[6];
+            for(int i = 0; i < 6; i++)
+              tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+            sel.I64MUL(dst, src0, src1, tmp);
+          } else
+            sel.MUL(dst, src0, src1);
+          break;
+        case OP_HADD: {
+            GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_D);
+            sel.HADD(dst, src0, src1, temp);
+            break;
+          }
+        case OP_RHADD: {
+            GenRegister temp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD)), GEN_TYPE_D);
+            sel.RHADD(dst, src0, src1, temp);
+            break;
+          }
+        case OP_I64HADD:
+         {
+          GenRegister tmp[4];
+          for(int i=0; i<4; i++)
+            tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+          sel.I64HADD(dst, src0, src1, tmp);
+          break;
+         }
+        case OP_I64RHADD:
+         {
+          GenRegister tmp[4];
+          for(int i=0; i<4; i++)
+            tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+          sel.I64RHADD(dst, src0, src1, tmp);
+          break;
+         }
+        case OP_UPSAMPLE_SHORT:
+          sel.UPSAMPLE_SHORT(dst, src0, src1);
+          break;
+        case OP_UPSAMPLE_INT:
+          sel.UPSAMPLE_INT(dst, src0, src1);
+          break;
+        case OP_UPSAMPLE_LONG:
+          sel.UPSAMPLE_LONG(dst, src0, src1);
+          break;
+        default: NOT_IMPLEMENTED;
+      }
+      sel.pop();
+      return true;
+    }
+  };
+
+  /*! MAD pattern */
+  class MulAddInstructionPattern : public SelectionPattern
+  {
+  public:
+    /*! Register the pattern for all opcodes of the family */
+    MulAddInstructionPattern(void) : SelectionPattern(2, 1) {
+       this->opcodes.push_back(ir::OP_ADD);
+       this->opcodes.push_back(ir::OP_SUB);
+    }
+
+    /*! Implements base class */
+    virtual bool emit(Selection::Opaque  &sel, SelectionDAG &dag) const
+    {
+      using namespace ir;
+
+      // XXX TODO: we need clean support of FP_CONTRACT to remove the 'return false' below;
+      // if 'pragma FP_CONTRACT OFF' is used in a CL kernel, we should not do the MAD optimization.
+      if (!sel.ctx.relaxMath || sel.ctx.getSimdWidth() == 16)
+        return false;
+      // MADs tend to increase the liveness of the sources (since there are three of
+      // them). TODO: refine this strategy. We should at least be able to evaluate
+      // per-basic-block register pressure and selectively enable or disable MADs.
+      if (sel.ctx.limitRegisterPressure)
+        return false;
+
+      // We are good to try. We need a MUL for one of the two sources
+      const ir::BinaryInstruction &insn = cast<ir::BinaryInstruction>(dag.insn);
+      if (insn.getType() != TYPE_FLOAT)
+        return false;
+      SelectionDAG *child0 = dag.child[0];
+      SelectionDAG *child1 = dag.child[1];
+      const GenRegister dst = sel.selReg(insn.getDst(0), TYPE_FLOAT);
+      if (child0 && child0->insn.getOpcode() == OP_MUL) {
+        GBE_ASSERT(cast<ir::BinaryInstruction>(child0->insn).getType() == TYPE_FLOAT);
+        SelectionDAG *child00 = child0->child[0];
+        SelectionDAG *child01 = child0->child[1];
+        if ((child00 && child00->insn.getOpcode() == OP_LOADI) ||
+            (child01 && child01->insn.getOpcode() == OP_LOADI) ||
+            (child1 && child1->insn.getOpcode() == OP_LOADI))
+          return false;
+        const GenRegister src0 = sel.selReg(child0->insn.getSrc(0), TYPE_FLOAT);
+        const GenRegister src1 = sel.selReg(child0->insn.getSrc(1), TYPE_FLOAT);
+        GenRegister src2 = sel.selReg(insn.getSrc(1), TYPE_FLOAT);
+        if(insn.getOpcode() == ir::OP_SUB) src2 = GenRegister::negate(src2);
+        sel.MAD(dst, src2, src0, src1); // order different on HW!
+        if (child0->child[0]) child0->child[0]->isRoot = 1;
+        if (child0->child[1]) child0->child[1]->isRoot = 1;
+        if (child1) child1->isRoot = 1;
+        return true;
+      }
+      if (child1 && child1->insn.getOpcode() == OP_MUL) {
+        GBE_ASSERT(cast<ir::BinaryInstruction>(child1->insn).getType() == TYPE_FLOAT);
+        SelectionDAG *child10 = child1->child[0];
+        SelectionDAG *child11 = child1->child[1];
+        if ((child10 && child10->insn.getOpcode() == OP_LOADI) ||
+            (child11 && child11->insn.getOpcode() == OP_LOADI) ||
+            (child0 && child0->insn.getOpcode() == OP_LOADI))
+          return false;
+        GenRegister src0 = sel.selReg(child1->insn.getSrc(0), TYPE_FLOAT);
+        const GenRegister src1 = sel.selReg(child1->insn.getSrc(1), TYPE_FLOAT);
+        const GenRegister src2 = sel.selReg(insn.getSrc(0), TYPE_FLOAT);
+        if(insn.getOpcode() == ir::OP_SUB) src0 = GenRegister::negate(src0);
+        sel.MAD(dst, src2, src0, src1); // order different on HW!
+        if (child1->child[0]) child1->child[0]->isRoot = 1;
+        if (child1->child[1]) child1->child[1]->isRoot = 1;
+        if (child0) child0->isRoot = 1;
+        return true;
+      }
+      return false;
+    }
+  };
+
+  /*! sel.{le,l,ge,...}-like patterns */
+  class SelectModifierInstructionPattern : public SelectionPattern
+  {
+  public:
+    /*! Register the pattern for all opcodes of the family */
+    SelectModifierInstructionPattern(void) : SelectionPattern(2, 1) {
+      this->opcodes.push_back(ir::OP_SEL);
+    }
+
+    /*! Implements base class */
+    virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
+    {
+      using namespace ir;
+      SelectionDAG *cmp = dag.child[0];
+      const SelectInstruction &insn = cast<SelectInstruction>(dag.insn);
+
+      if (insn.getType() == TYPE_S64 || insn.getType() == TYPE_U64) // not supported
+        return false;
+
+      // Not in this block
+      if (cmp == NULL) return false;
+
+      // We need to match a compare
+      if (cmp->insn.isMemberOf<CompareInstruction>() == false) return false;
+
+      // We look for something like this:
+      // cmp.{le,ge...} flag src0 src1
+      // sel dst flag src0 src1
+      // So both sources must match
+      if (sourceMatch(cmp, 0, &dag, 1) == false) return false;
+      if (sourceMatch(cmp, 1, &dag, 2) == false) return false;
+      // OK, we merge the instructions
+      const ir::CompareInstruction &cmpInsn = cast<CompareInstruction>(cmp->insn);
+      const ir::Opcode opcode = cmpInsn.getOpcode();
+      if(opcode == OP_ORD) return false;
+      GenRegister src0, src1;
+      const ir::Type type = cmpInsn.getType();
+      bool inverse = false;
+      sel.getSrcGenRegImm(*cmp, src0, src1, type, inverse);
+
+      const uint32_t genCmp = getGenCompare(opcode, inverse);
+      sel.push();
+        if (sel.isScalarReg(insn.getDst(0)) == true) {
+          sel.curr.execWidth = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+        }
+
+        // Like for regular selects, we need a temporary since we cannot predicate
+        // properly
+        const uint32_t simdWidth = sel.curr.execWidth;
+        const GenRegister dst  = sel.selReg(insn.getDst(0), type);
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.execWidth = simdWidth;
+        sel.SEL_CMP(genCmp, dst, src0, src1);
+      sel.pop();
+      return true;
+    }
+  };
+
+  /*! A 32x32 bit integer multiply needs more instructions */
+  class Int32x32MulInstructionPattern : public SelectionPattern
+  {
+  public:
+    /*! Register the pattern for all opcodes of the family */
+    Int32x32MulInstructionPattern(void) : SelectionPattern(1, 4) {
+       this->opcodes.push_back(ir::OP_MUL);
+    }
+
+    /*! Implements base class */
+    virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
+    {
+      using namespace ir;
+      const ir::BinaryInstruction &insn = cast<ir::BinaryInstruction>(dag.insn);
+      const Type type = insn.getType();
+      if (type == TYPE_U32 || type == TYPE_S32) {
+        sel.push();
+          if (sel.isScalarReg(insn.getDst(0)) == true) {
+            sel.curr.execWidth = 1;
+            sel.curr.predicate = GEN_PREDICATE_NONE;
+            sel.curr.noMask = 1;
+          }
+        const uint32_t simdWidth = sel.curr.execWidth;
+
+        GenRegister dst  = sel.selReg(insn.getDst(0), type);
+        GenRegister src0 = sel.selReg(insn.getSrc(0), type);
+        GenRegister src1 = sel.selReg(insn.getSrc(1), type);
+
+        // Either the left half of the 16-wide register or just a SIMD8 register
+        dst  = GenRegister::retype(dst,  GEN_TYPE_D);
+        src0 = GenRegister::retype(src0, GEN_TYPE_D);
+        src1 = GenRegister::retype(src1, GEN_TYPE_D);
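+        // What follows emits a MUL into the accumulator, a MACH with the
+        // accumulator write enabled, and then copies the accumulator into the
+        // destination. SIMD16 is handled as two SIMD8 halves selected with
+        // quarter control (Q1 here, Q2 below).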
+        sel.curr.execWidth = 8;
+        sel.curr.quarterControl = GEN_COMPRESSION_Q1;
+        sel.MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), src0, src1);
+        sel.curr.accWrEnable = 1;
+        sel.MACH(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), src0, src1);
+        sel.curr.accWrEnable = 0;
+        if (simdWidth == 1) {
+          sel.curr.execWidth = 1;
+          sel.MOV(GenRegister::retype(dst, GEN_TYPE_F), GenRegister::vec1(GenRegister::acc()));
+        } else {
+          sel.curr.execWidth = 8;
+          sel.MOV(GenRegister::retype(dst, GEN_TYPE_F), GenRegister::acc());
+        }
+
+        // Now the right half of the 16-wide register
+        if (simdWidth == 16) {
+          int predicate = sel.curr.predicate;
+          int noMask = sel.curr.noMask;
+          sel.curr.noMask = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          const GenRegister nextSrc0 = sel.selRegQn(insn.getSrc(0), 1, TYPE_S32);
+          const GenRegister nextSrc1 = sel.selRegQn(insn.getSrc(1), 1, TYPE_S32);
+          sel.MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_D), nextSrc0, nextSrc1);
+          sel.curr.accWrEnable = 1;
+          sel.MACH(GenRegister::retype(GenRegister::null(), GEN_TYPE_D), nextSrc0, nextSrc1);
+          sel.curr.accWrEnable = 0;
+          sel.curr.quarterControl = GEN_COMPRESSION_Q2;
+          if (predicate != GEN_PREDICATE_NONE || noMask != 1) {
+            const ir::Register reg = sel.reg(FAMILY_DWORD);
+            sel.MOV(GenRegister::f8grf(reg), GenRegister::acc());
+            sel.curr.noMask = noMask;
+            sel.curr.predicate = predicate;
+            sel.MOV(GenRegister::retype(GenRegister::next(dst), GEN_TYPE_F),
+                    GenRegister::f8grf(reg));
+          } else
+            sel.MOV(GenRegister::retype(GenRegister::next(dst), GEN_TYPE_F), GenRegister::acc());
+        }
+
+        sel.pop();
+        // All children are marked as root
+        markAllChildren(dag);
+        return true;
+      } else
+        return false;
+    }
+  };
+
+  /*! A 32x16 bit integer multiply can be done in one instruction */
+  class Int32x16MulInstructionPattern : public SelectionPattern
+  {
+  public:
+    /*! Register the pattern for all opcodes of the family */
+    Int32x16MulInstructionPattern(void) : SelectionPattern(1, 1) {
+       this->opcodes.push_back(ir::OP_MUL);
+    }
+
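+    // Local IDs and local sizes are assumed to fit in 16 bits, which is what
+    // allows the single-instruction 32x16 multiply emitted by this pattern.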
+    bool is16BitSpecialReg(ir::Register reg) const {
+      if (reg == ir::ocl::lid0 ||
+          reg == ir::ocl::lid1 ||
+          reg == ir::ocl::lid2 ||
+          reg == ir::ocl::lsize0 ||
+          reg == ir::ocl::lsize1 ||
+          reg == ir::ocl::lsize2)
+        return true;
+      else
+        return false;
+    }
+
+    /*! Try to emit a multiply where child childID is a 16 bit immediate */
+    bool emitMulImmediate(Selection::Opaque  &sel, SelectionDAG &dag, uint32_t childID) const {
+      using namespace ir;
+      const ir::BinaryInstruction &insn = cast<ir::BinaryInstruction>(dag.insn);
+      const Register dst  = insn.getDst(0);
+      const Register src1 = insn.getSrc(childID ^ 1);
+      const SelectionDAG *src0DAG = dag.child[childID];
+      if (src0DAG != NULL) {
+        if (src0DAG->insn.getOpcode() == OP_LOADI) {
+          const auto &loadimm = cast<LoadImmInstruction>(src0DAG->insn);
+          const Immediate imm = loadimm.getImmediate();
+          const Type type = imm.getType();
+          GBE_ASSERT(type == TYPE_U32 || type == TYPE_S32);
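+          // A constant that fits in 16 bits (unsigned <= 0xffff, or signed in
+          // [-32768, 32767]) can be used directly as an immediate word source.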
+          if (type == TYPE_U32 && imm.getIntegerValue() <= 0xffff) {
+            sel.push();
+              if (sel.isScalarReg(insn.getDst(0)) == true) {
+                sel.curr.execWidth = 1;
+                sel.curr.predicate = GEN_PREDICATE_NONE;
+                sel.curr.noMask = 1;
+              }
+
+              sel.MUL(sel.selReg(dst, type),
+                      sel.selReg(src1, type),
+                      GenRegister::immuw(imm.getIntegerValue()));
+            sel.pop();
+            if (dag.child[childID ^ 1] != NULL)
+              dag.child[childID ^ 1]->isRoot = 1;
+            return true;
+          }
+          if (type == TYPE_S32 && (imm.getIntegerValue() >= -32768 && imm.getIntegerValue() <= 32767)) {
+            sel.push();
+              if (sel.isScalarReg(insn.getDst(0)) == true) {
+                sel.curr.execWidth = 1;
+                sel.curr.predicate = GEN_PREDICATE_NONE;
+                sel.curr.noMask = 1;
+              }
+
+              sel.MUL(sel.selReg(dst, type),
+                      sel.selReg(src1, type),
+                      GenRegister::immw(imm.getIntegerValue()));
+            sel.pop();
+            if (dag.child[childID ^ 1] != NULL)
+              dag.child[childID ^ 1]->isRoot = 1;
+            return true;
+          }
+        }
+      }
+      return false;
+    }
+
+    /*! Try to emit a multiply with a 16 bit special register */
+    bool emitMulSpecialReg(Selection::Opaque &sel, SelectionDAG &dag, uint32_t childID) const {
+      using namespace ir;
+      const BinaryInstruction &insn = cast<ir::BinaryInstruction>(dag.insn);
+      const Type type = insn.getType();
+      const Register dst  = insn.getDst(0);
+      const Register src0 = insn.getSrc(childID);
+      const Register src1 = insn.getSrc(childID ^ 1);
+      if (is16BitSpecialReg(src0)) {
+        sel.push();
+          if (sel.isScalarReg(insn.getDst(0)) == true) {
+            sel.curr.execWidth = 1;
+            sel.curr.predicate = GEN_PREDICATE_NONE;
+            sel.curr.noMask = 1;
+          }
+          sel.MUL(sel.selReg(dst, type),
+                  sel.selReg(src1, type),
+                  sel.selReg(src0, TYPE_U32));
+        sel.pop();
+        markAllChildren(dag);
+        return true;
+      }
+      return false;
+    }
+
+    virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
+    {
+      using namespace ir;
+      const BinaryInstruction &insn = cast<ir::BinaryInstruction>(dag.insn);
+      const Type type = insn.getType();
+      if (type == TYPE_U32 || type == TYPE_S32) {
+        if (this->emitMulSpecialReg(sel, dag, 0))
+          return true;
+        if (this->emitMulSpecialReg(sel, dag, 1))
+          return true;
+        if (this->emitMulImmediate(sel, dag, 0))
+          return true;
+        if (this->emitMulImmediate(sel, dag, 1))
+          return true;
+      }
+      return false;
+    }
+  };
+
+#define DECL_NOT_IMPLEMENTED_ONE_TO_MANY(FAMILY) \
+  struct FAMILY##Pattern : public OneToManyPattern<FAMILY##Pattern, ir::FAMILY>\
+  {\
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::FAMILY &insn, bool &markChildren) const {\
+      NOT_IMPLEMENTED;\
+      return false;\
+    }\
+    DECL_CTOR(FAMILY, 1, 1); \
+  }
+#undef DECL_NOT_IMPLEMENTED_ONE_TO_MANY
+
+  /*! Load immediate pattern */
+  DECL_PATTERN(LoadImmInstruction)
+  {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadImmInstruction &insn, bool &markChildren) const
+    {
+      using namespace ir;
+      const Type type = insn.getType();
+      const Immediate imm = insn.getImmediate();
+      const GenRegister dst = sel.selReg(insn.getDst(0), type);
+
+      sel.push();
+      if (sel.isScalarReg(insn.getDst(0)) == true) {
+        sel.curr.execWidth = 1;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+      }
+
+      switch (type) {
+        case TYPE_BOOL:
+          if (!sel.isScalarReg(insn.getDst(0)) && sel.regDAG[insn.getDst(0)]->isUsed) {
+            sel.curr.modFlag = 1;
+            sel.curr.physicalFlag = 0;
+            sel.curr.flagIndex = (uint16_t) insn.getDst(0);
+          }
+          sel.MOV(dst, imm.getIntegerValue() ? GenRegister::immuw(0xffff) : GenRegister::immuw(0));
+        break;
+        case TYPE_U32:
+        case TYPE_S32:
+        case TYPE_FLOAT:
+          sel.MOV(GenRegister::retype(dst, GEN_TYPE_F),
+                  GenRegister::immf(imm.asFloatValue()));
+        break;
+        case TYPE_U16: sel.MOV(dst, GenRegister::immuw(imm.getIntegerValue())); break;
+        case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.getIntegerValue())); break;
+        case TYPE_U8:  sel.MOV(dst, GenRegister::immuw(imm.getIntegerValue())); break;
+        case TYPE_S8:  sel.MOV(dst, GenRegister::immw(imm.getIntegerValue())); break;
+        case TYPE_DOUBLE: sel.LOAD_DF_IMM(dst, GenRegister::immdf(imm.getDoubleValue()), sel.selReg(sel.reg(FAMILY_QWORD))); break;
+        case TYPE_S64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.getIntegerValue())); break;
+        case TYPE_U64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.getIntegerValue())); break;
+        default: NOT_SUPPORTED;
+      }
+      sel.pop();
+      return true;
+    }
+
+    DECL_CTOR(LoadImmInstruction, 1,1);
+  };
+
+  /*! Sync instruction */
+  DECL_PATTERN(SyncInstruction)
+  {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::SyncInstruction &insn, bool &markChildren) const
+    {
+      using namespace ir;
+      const ir::Register reg = sel.reg(FAMILY_DWORD);
+      const uint32_t params = insn.getParameters();
+
+      // A barrier is OK to start the thread synchronization *and* SLM fence
+      sel.BARRIER(GenRegister::ud8grf(reg), sel.selReg(sel.reg(FAMILY_DWORD)), params);
+      return true;
+    }
+
+    DECL_CTOR(SyncInstruction, 1,1);
+  };
+
+  INLINE uint32_t getByteScatterGatherSize(ir::Type type) {
+    using namespace ir;
+    switch (type) {
+      case TYPE_DOUBLE:
+      case TYPE_S64:
+      case TYPE_U64:
+        return GEN_BYTE_SCATTER_QWORD;
+      case TYPE_FLOAT:
+      case TYPE_U32:
+      case TYPE_S32:
+        return GEN_BYTE_SCATTER_DWORD;
+      case TYPE_BOOL:
+      case TYPE_U16:
+      case TYPE_S16:
+        return GEN_BYTE_SCATTER_WORD;
+      case TYPE_U8:
+      case TYPE_S8:
+        return GEN_BYTE_SCATTER_BYTE;
+      default: NOT_SUPPORTED;
+        return GEN_BYTE_SCATTER_BYTE;
+    }
+  }
+
+  /*! Load instruction pattern */
+  DECL_PATTERN(LoadInstruction)
+  {
+    void readDWord(Selection::Opaque &sel,
+                   vector<GenRegister> &dst,
+                   vector<GenRegister> &dst2,
+                   GenRegister addr,
+                   uint32_t valueNum,
+                   ir::AddressSpace space,
+                   ir::BTI bti) const
+    {
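+      // The address may target several binding table entries: read from each
+      // surface in turn and accumulate the per-surface results into dst.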
+      for (uint32_t x = 0; x < bti.count; x++) {
+        if(x > 0)
+          for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+            dst2[dstID] = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+
+        GenRegister temp = getRelativeAddress(sel, addr, space, bti.bti[x]);
+        sel.UNTYPED_READ(temp, dst2.data(), valueNum, bti.bti[x]);
+        if(x > 0) {
+          sel.push();
+            if(sel.isScalarReg(dst[0].reg())) {
+              sel.curr.noMask = 1;
+              sel.curr.execWidth = 1;
+            }
+            for (uint32_t y = 0; y < valueNum; y++)
+              sel.ADD(dst[y], dst[y], dst2[y]);
+          sel.pop();
+        }
+      }
+    }
+
+    void emitUntypedRead(Selection::Opaque &sel,
+                         const ir::LoadInstruction &insn,
+                         GenRegister addr,
+                         ir::BTI bti) const
+    {
+      using namespace ir;
+      const uint32_t valueNum = insn.getValueNum();
+      vector<GenRegister> dst(valueNum);
+      vector<GenRegister> dst2(valueNum);
+      for (uint32_t dstID = 0; dstID < valueNum; ++dstID)
+        dst2[dstID] = dst[dstID] = sel.selReg(insn.getValue(dstID), TYPE_U32);
+      readDWord(sel, dst, dst2, addr, valueNum, insn.getAddressSpace(), bti);
+    }
+
+    void emitDWordGather(Selection::Opaque &sel,
+                         const ir::LoadInstruction &insn,
+                         GenRegister addr,
+                         ir::BTI bti) const
+    {
+      using namespace ir;
+      GBE_ASSERT(bti.count == 1);
+      const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ? 1 : sel.ctx.getSimdWidth();
+      GBE_ASSERT(insn.getValueNum() == 1);
+
+      if(simdWidth == 1) {
+        GenRegister dst = sel.selReg(insn.getValue(0), ir::TYPE_U32);
+        sel.push();
+          sel.curr.noMask = 1;
+          sel.SAMPLE(&dst, 1, &addr, 1, bti.bti[0], 0, true, true);
+        sel.pop();
+        return;
+      }
+
+      GenRegister dst = GenRegister::retype(sel.selReg(insn.getValue(0)), GEN_TYPE_F);
+      // get dword based address
+      GenRegister addrDW = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+
+      sel.push();
+        if (sel.isScalarReg(addr.reg())) {
+          sel.curr.noMask = 1;
+        }
+        sel.SHR(addrDW, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(2));
+      sel.pop();
+
+      sel.DWORD_GATHER(dst, addrDW, bti.bti[0]);
+    }
+
+    void emitRead64(Selection::Opaque &sel,
+                         const ir::LoadInstruction &insn,
+                         GenRegister addr,
+                         ir::BTI bti) const
+    {
+      using namespace ir;
+      const uint32_t valueNum = insn.getValueNum();
+      /* XXX support scalar only right now. */
+      GBE_ASSERT(valueNum == 1);
+      GBE_ASSERT(bti.count == 1);
+      GenRegister dst[valueNum];
+      GenRegister tmpAddr = getRelativeAddress(sel, addr, insn.getAddressSpace(), bti.bti[0]);
+      for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
+        dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
+      sel.READ64(tmpAddr, dst, valueNum, bti.bti[0]);
+    }
+
+    void readByteAsDWord(Selection::Opaque &sel,
+                        const uint32_t elemSize,
+                        GenRegister address,
+                        GenRegister dst,
+                        uint32_t simdWidth,
+                        uint8_t bti) const
+    {
+      using namespace ir;
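+        // Align the address down to a DWORD boundary, read the whole DWORD,
+        // then shift right by 8 times the byte offset within that DWORD to
+        // extract the requested byte or word.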
+        Register tmpReg = sel.reg(FAMILY_DWORD, simdWidth == 1);
+        GenRegister tmpAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD, simdWidth == 1));
+        GenRegister tmpData = GenRegister::udxgrf(simdWidth, tmpReg);
+        // Get dword aligned addr
+        sel.push();
+          if (simdWidth == 1) {
+            sel.curr.execWidth = 1;
+            sel.curr.noMask = 1;
+          }
+          sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0xfffffffc));
+        sel.pop();
+        sel.push();
+          if (simdWidth == 1)
+            sel.curr.noMask = 1;
+          sel.UNTYPED_READ(tmpAddr, &tmpData, 1, bti);
+
+          if (simdWidth == 1)
+            sel.curr.execWidth = 1;
+          // Get the remaining offset from aligned addr
+          sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0x3));
+          sel.SHL(tmpAddr, tmpAddr, GenRegister::immud(0x3));
+          sel.SHR(tmpData, tmpData, tmpAddr);
+
+          if (elemSize == GEN_BYTE_SCATTER_WORD)
+            sel.MOV(GenRegister::retype(dst, GEN_TYPE_UW), sel.unpacked_uw(tmpReg));
+          else if (elemSize == GEN_BYTE_SCATTER_BYTE)
+            sel.MOV(GenRegister::retype(dst, GEN_TYPE_UB), sel.unpacked_ub(tmpReg));
+        sel.pop();
+    }
+
+    void emitByteGather(Selection::Opaque &sel,
+                        const ir::LoadInstruction &insn,
+                        const uint32_t elemSize,
+                        GenRegister address,
+                        ir::BTI bti) const
+    {
+      using namespace ir;
+      const uint32_t valueNum = insn.getValueNum();
+      const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ?
+                                 1 : sel.ctx.getSimdWidth();
+      RegisterFamily family = getFamily(insn.getValueType());
+
+      if(valueNum > 1) {
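+        // Several byte/short values are fetched as packed DWORDs and then
+        // unpacked into the individual destination registers.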
+        vector<GenRegister> dst(valueNum);
+        const uint32_t typeSize = getFamilySize(family);
+
+        for(uint32_t i = 0; i < valueNum; i++)
+          dst[i] = sel.selReg(insn.getValue(i), getType(family));
+
+        uint32_t tmpRegNum = typeSize*valueNum / 4;
+        vector<GenRegister> tmp(tmpRegNum);
+        vector<GenRegister> tmp2(tmpRegNum);
+        for(uint32_t i = 0; i < tmpRegNum; i++) {
+          tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+        }
+
+        readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
+
+        for(uint32_t i = 0; i < tmpRegNum; i++) {
+          sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
+        }
+      } else {
+        GBE_ASSERT(insn.getValueNum() == 1);
+        const GenRegister value = sel.selReg(insn.getValue(0), insn.getValueType());
+        GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == GEN_BYTE_SCATTER_BYTE);
+        GenRegister tmp = value;
+
+        for (int x = 0; x < bti.count; x++) {
+          if (x > 0)
+            tmp = sel.selReg(sel.reg(family, simdWidth == 1), insn.getValueType());
+
+          GenRegister addr = getRelativeAddress(sel, address, insn.getAddressSpace(), bti.bti[x]);
+          readByteAsDWord(sel, elemSize, addr, tmp, simdWidth, bti.bti[x]);
+          if (x > 0) {
+            sel.push();
+              if (simdWidth == 1) {
+                sel.curr.noMask = 1;
+                sel.curr.execWidth = 1;
+              }
+              sel.ADD(value, value, tmp);
+            sel.pop();
+          }
+        }
+      }
+    }
+
+    void emitIndirectMove(Selection::Opaque &sel,
+                         const ir::LoadInstruction &insn,
+                         GenRegister address) const
+    {
+      using namespace ir;
+      GBE_ASSERT(insn.getValueNum() == 1);   // TODO: handle vectors later
+
+      const GenRegister dst = sel.selReg(insn.getValue(0), insn.getValueType());
+      const GenRegister src = address;
+      sel.INDIRECT_MOVE(dst, src);
+    }
+
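+    // Rebase a global/private address to its surface by subtracting the
+    // surface base register; local and constant addresses are used as-is.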
+    INLINE GenRegister getRelativeAddress(Selection::Opaque &sel, GenRegister address, ir::AddressSpace space, uint8_t bti) const {
+      if(space == ir::MEM_LOCAL || space == ir::MEM_CONSTANT)
+        return address;
+
+      sel.push();
+        sel.curr.noMask = 1;
+        GenRegister temp = sel.selReg(sel.reg(ir::FAMILY_DWORD), ir::TYPE_U32);
+        sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti), ir::TYPE_U32)));
+      sel.pop();
+      return temp;
+    }
+
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::LoadInstruction &insn, bool &markChildren) const {
+      using namespace ir;
+      GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
+      const AddressSpace space = insn.getAddressSpace();
+      GBE_ASSERT(insn.getAddressSpace() == MEM_GLOBAL ||
+                 insn.getAddressSpace() == MEM_CONSTANT ||
+                 insn.getAddressSpace() == MEM_PRIVATE ||
+                 insn.getAddressSpace() == MEM_LOCAL);
+      //GBE_ASSERT(sel.isScalarReg(insn.getValue(0)) == false);
+      const Type type = insn.getValueType();
+      const uint32_t elemSize = getByteScatterGatherSize(type);
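+      // When SLM address patching is required, add the per-work-group SLM
+      // offset to local addresses before emitting the access.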
+      if(space == MEM_LOCAL && sel.needPatchSLMAddr()) {
+        GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+        sel.ADD(temp, address, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
+        address = temp;
+      }
+      BTI bti;
+      if (space == MEM_CONSTANT || space == MEM_LOCAL) {
+        bti.bti[0] = space == MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
+        bti.count = 1;
+      } else {
+        bti = insn.getBTI();
+      }
+      if (space == MEM_CONSTANT) {
+        // XXX TODO: read 64 bit constants through the constant cache.
+        // Per the HW spec, constant cache messages read DWORD data at minimum,
+        // so byte/short data types have to be read through the data cache.
+        if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+          this->emitRead64(sel, insn, address, bti);
+        else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+          this->emitDWordGather(sel, insn, address, bti);
+        else {
+          this->emitByteGather(sel, insn, elemSize, address, bti);
+        }
+      } else {
+        if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+          this->emitRead64(sel, insn, address, bti);
+        else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+          this->emitUntypedRead(sel, insn, address, bti);
+        else {
+          this->emitByteGather(sel, insn, elemSize, address, bti);
+        }
+      }
+      return true;
+    }
+    DECL_CTOR(LoadInstruction, 1, 1);
+  };
+
+  /*! Store instruction pattern */
+  DECL_PATTERN(StoreInstruction)
+  {
+    void emitUntypedWrite(Selection::Opaque &sel,
+                          const ir::StoreInstruction &insn,
+                          GenRegister addr,
+                          uint32_t bti) const
+    {
+      using namespace ir;
+      const uint32_t valueNum = insn.getValueNum();
+      vector<GenRegister> value(valueNum);
+
+      addr = GenRegister::retype(addr, GEN_TYPE_F);
+      for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
+        value[valueID] = GenRegister::retype(sel.selReg(insn.getValue(valueID)), GEN_TYPE_F);
+      sel.UNTYPED_WRITE(addr, value.data(), valueNum, bti);
+    }
+
+    void emitWrite64(Selection::Opaque &sel,
+                     const ir::StoreInstruction &insn,
+                     GenRegister addr,
+                     uint32_t bti) const
+    {
+      using namespace ir;
+      const uint32_t valueNum = insn.getValueNum();
+      /* XXX support scalar only right now. */
+      GBE_ASSERT(valueNum == 1);
+      addr = GenRegister::retype(addr, GEN_TYPE_UD);
+      GenRegister src[valueNum];
+
+      for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
+        src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
+      sel.WRITE64(addr, src, valueNum, bti);
+    }
+
+    void emitByteScatter(Selection::Opaque &sel,
+                         const ir::StoreInstruction &insn,
+                         const uint32_t elemSize,
+                         GenRegister addr,
+                         uint32_t bti) const
+    {
+      using namespace ir;
+      const uint32_t simdWidth = sel.ctx.getSimdWidth();
+      uint32_t valueNum = insn.getValueNum();
+
+      if(valueNum > 1) {
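+        // Mirror of the byte gather path: pack the byte/short values into
+        // DWORDs and emit a single untyped write.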
+        const uint32_t typeSize = getFamilySize(getFamily(insn.getValueType()));
+        vector<GenRegister> value(valueNum);
+
+        if(elemSize == GEN_BYTE_SCATTER_WORD) {
+          for(uint32_t i = 0; i < valueNum; i++)
+            value[i] = sel.selReg(insn.getValue(i), ir::TYPE_U16);
+        } else if(elemSize == GEN_BYTE_SCATTER_BYTE) {
+          for(uint32_t i = 0; i < valueNum; i++)
+            value[i] = sel.selReg(insn.getValue(i), ir::TYPE_U8);
+        }
+
+        uint32_t tmpRegNum = typeSize*valueNum / 4;
+        vector<GenRegister> tmp(tmpRegNum);
+        for(uint32_t i = 0; i < tmpRegNum; i++) {
+          tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+          sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, 4/typeSize);
+        }
+
+        sel.UNTYPED_WRITE(addr, tmp.data(), tmpRegNum, bti);
+      } else {
+        const GenRegister value = sel.selReg(insn.getValue(0));
+        GBE_ASSERT(insn.getValueNum() == 1);
+        const GenRegister tmp = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+        if (elemSize == GEN_BYTE_SCATTER_WORD) {
+          sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UW));
+        } else if (elemSize == GEN_BYTE_SCATTER_BYTE) {
+          sel.MOV(tmp, GenRegister::retype(value, GEN_TYPE_UB));
+        }
+        sel.BYTE_SCATTER(addr, tmp, elemSize, bti);
+      }
+    }
+
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::StoreInstruction &insn, bool &markChildren) const
+    {
+      using namespace ir;
+      const AddressSpace space = insn.getAddressSpace();
+      const Type type = insn.getValueType();
+      const uint32_t elemSize = getByteScatterGatherSize(type);
+      GenRegister address = sel.selReg(insn.getAddress(), ir::TYPE_U32);
+      if(space == MEM_LOCAL && sel.needPatchSLMAddr()) {
+        GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+        sel.ADD(temp, address, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
+        address = temp;
+      }
+      if(space == MEM_LOCAL) {
+        if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+          this->emitWrite64(sel, insn, address, 0xfe);
+        else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+          this->emitUntypedWrite(sel, insn, address,  0xfe);
+        else
+          this->emitByteScatter(sel, insn, elemSize, address, 0xfe);
+      } else {
+        BTI bti = insn.getBTI();
+        for (int x = 0; x < bti.count; x++) {
+          GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+          sel.push();
+            sel.curr.noMask = 1;
+            sel.ADD(temp, address, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(bti.bti[x]), ir::TYPE_U32)));
+          sel.pop();
+          if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
+            this->emitWrite64(sel, insn, temp, bti.bti[x]);
+          else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
+            this->emitUntypedWrite(sel, insn, temp,  bti.bti[x]);
+          else {
+            this->emitByteScatter(sel, insn, elemSize, temp, bti.bti[x]);
+          }
+        }
+      }
+      return true;
+    }
+    DECL_CTOR(StoreInstruction, 1, 1);
+  };
+
+  /*! Compare instruction pattern */
+  class CompareInstructionPattern : public SelectionPattern
+  {
+  public:
+    CompareInstructionPattern(void) : SelectionPattern(1,1) {
+      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+        if (ir::isOpcodeFrom<ir::CompareInstruction>(ir::Opcode(op)) == true)
+          this->opcodes.push_back(ir::Opcode(op));
+    }
+
+    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
+    {
+      using namespace ir;
+      const ir::CompareInstruction &insn = cast<CompareInstruction>(dag.insn);
+      const Opcode opcode = insn.getOpcode();
+      const Type type = insn.getType();
+      const Register dst = insn.getDst(0);
+      GenRegister tmpDst;
+      const BasicBlock *curr = insn.getParent();
+      const ir::Liveness &liveness = sel.ctx.getLiveness();
+      const ir::Liveness::LiveOut &liveOut = liveness.getLiveOut(curr);
+      bool needStoreBool = false;
+      if (liveOut.contains(dst) || dag.computeBool)
+        needStoreBool = true;
+
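+      // For the wider types the comparison result is kept only in the flag
+      // register (null destination below); for the narrower types the boolean
+      // is also written to a GRF.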
+      if(type == TYPE_S64 || type == TYPE_U64 ||
+         type == TYPE_DOUBLE || type == TYPE_FLOAT ||
+         type == TYPE_U32 ||  type == TYPE_S32 /*||
+         (!needStoreBool)*/)
+        tmpDst = GenRegister::retype(GenRegister::null(), GEN_TYPE_F);
+      else
+        tmpDst = sel.selReg(dst, TYPE_BOOL);
+
+      // Look for immediate values for the right source
+      GenRegister src0, src1;
+      bool inverseCmp = false;
+      sel.getSrcGenRegImm(dag, src0, src1, type, inverseCmp);
+      sel.push();
+        if (sel.isScalarReg(dst))
+          sel.curr.noMask = 1;
+        sel.curr.physicalFlag = 0;
+        sel.curr.modFlag = 1;
+        sel.curr.flagIndex = (uint16_t)dst;
+        sel.curr.grfFlag = needStoreBool; // indicate whether we need to allocate grf to store this boolean.
+        if (type == TYPE_S64 || type == TYPE_U64) {
+          GenRegister tmp[3];
+          for(int i=0; i<3; i++)
+            tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+          sel.curr.flagGen = 1;
+          sel.I64CMP(getGenCompare(opcode, inverseCmp), src0, src1, tmp);
+        } else if(opcode == OP_ORD) {
+          sel.push();
+            sel.CMP(GEN_CONDITIONAL_EQ, src0, src0, tmpDst);
+            sel.curr.predicate = GEN_PREDICATE_NORMAL;
+            sel.curr.flagGen = 1;
+            sel.CMP(GEN_CONDITIONAL_EQ, src1, src1, tmpDst);
+          sel.pop();
+        } else {
+          if((type == TYPE_S64 || type == TYPE_U64 ||
+              type == TYPE_DOUBLE || type == TYPE_FLOAT ||
+              type == TYPE_U32 ||  type == TYPE_S32))
+            sel.curr.flagGen = 1;
+          else if (sel.isScalarReg(dst)) {
+            // If the dest reg is a scalar bool, we cannot use it as the
+            // dst register, as the execution width is still 8 or 16.
+            // Instead, we propagate needStoreBool to flagGen, change the
+            // dst to the null register, and let the flag register allocation
+            // generate the flag GRF on demand later.
+            sel.curr.flagGen = needStoreBool;
+            tmpDst = GenRegister::retype(GenRegister::null(), GEN_TYPE_UW);
+          }
+          sel.CMP(getGenCompare(opcode, inverseCmp), src0, src1, tmpDst);
+        }
+      sel.pop();
+      return true;
+    }
+  };
+
+  /*! Bit cast instruction pattern */
+  DECL_PATTERN(BitCastInstruction)
+  {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::BitCastInstruction &insn, bool &markChildren) const
+    {
+      using namespace ir;
+      const Type dstType = insn.getDstType();
+      const Type srcType = insn.getSrcType();
+      const uint32_t dstNum = insn.getDstNum();
+      const uint32_t srcNum = insn.getSrcNum();
+      int index = 0, multiple, narrowNum;
+      bool narrowDst;
+      Type narrowType;
+
+      if(dstNum > srcNum) {
+        multiple = dstNum / srcNum;
+        narrowType = dstType;
+        narrowNum = dstNum;
+        narrowDst = 1;
+      } else {
+        multiple = srcNum / dstNum;
+        narrowType = srcType;
+        narrowNum = srcNum;
+        narrowDst = 0;
+      }
+
+      sel.push();
+      if (sel.isScalarReg(insn.getDst(0)) == true) {
+        sel.curr.execWidth = 1;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+      }
+
+      // As we store the long/ulong low/high parts separately,
+      // we need to handle them separately; this should be changed back
+      // once the hardware supports a native long type.
+      const bool isInt64 = (srcType == TYPE_S64 || srcType == TYPE_U64 || dstType == TYPE_S64 || dstType == TYPE_U64);
+      const int simdWidth = sel.curr.execWidth;
+
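+      // Iterate over the narrower side: each narrow element maps to a
+      // sub-element of the wider register, accessed through the unpacked
+      // (strided) views set up below.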
+      for(int i = 0; i < narrowNum; i++, index++) {
+        GenRegister narrowReg, wideReg;
+        if(narrowDst) {
+          narrowReg = sel.selReg(insn.getDst(i), narrowType);
+          wideReg = sel.selReg(insn.getSrc(index/multiple), narrowType);  //retype to narrow type
+        } else {
+          wideReg = sel.selReg(insn.getDst(index/multiple), narrowType);
+          narrowReg = sel.selReg(insn.getSrc(i), narrowType);  //retype to narrow type
+        }
+
+        // set correct horizontal stride
+        if(wideReg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+          if(multiple == 2) {
+            wideReg = sel.unpacked_uw(wideReg.reg());
+            wideReg = GenRegister::retype(wideReg, getGenType(narrowType));
+            if(isInt64) {
+              wideReg.hstride = GEN_HORIZONTAL_STRIDE_1;
+              wideReg.vstride = GEN_VERTICAL_STRIDE_8;
+            }
+          } else if(multiple == 4) {
+            wideReg = sel.unpacked_ub(wideReg.reg());
+            wideReg = GenRegister::retype(wideReg, getGenType(narrowType));
+            if(isInt64) {
+              wideReg.hstride = GEN_HORIZONTAL_STRIDE_2;
+              wideReg.vstride = GEN_VERTICAL_STRIDE_16;
+            }
+          } else if(multiple == 8) {
+            // We currently store the high/low 32 bits separately in registers,
+            // so the hstride is 4 here.
+            wideReg = sel.unpacked_ub(wideReg.reg());
+            wideReg = GenRegister::retype(wideReg, getGenType(narrowType));
+          } else {
+            GBE_ASSERT(0);
+          }
+        }
+
+        if(!isInt64 && index % multiple) {
+          wideReg = GenRegister::offset(wideReg, 0, (index % multiple) * typeSize(wideReg.type));
+          wideReg.subphysical = 1;
+        }
+        if(isInt64) {
+          wideReg.subphysical = 1;
+          // Offset to next half
+          if((i % multiple) >= multiple/2)
+            wideReg = GenRegister::offset(wideReg, 0, sel.isScalarReg(wideReg.reg()) ? 4 : simdWidth*4);
+          // Offset to desired narrow element in wideReg
+          if(index % (multiple/2))
+            wideReg = GenRegister::offset(wideReg, 0, (index % (multiple/2)) * typeSize(wideReg.type));
+        }
+
+        GenRegister xdst = narrowDst ? narrowReg : wideReg;
+        GenRegister xsrc = narrowDst ? wideReg : narrowReg;
+
+        if(isInt64) {
+          sel.MOV(xdst, xsrc);
+        } else if(srcType == TYPE_DOUBLE || dstType == TYPE_DOUBLE) {
+          sel.push();
+            sel.curr.execWidth = 8;
+            xdst.subphysical = 1;
+            xsrc.subphysical = 1;
+            for(int i = 0; i < simdWidth/4; i ++) {
+              sel.curr.chooseNib(i);
+              sel.MOV(xdst, xsrc);
+              xdst = GenRegister::offset(xdst, 0, 4 * typeSize(getGenType(dstType)));
+              xsrc = GenRegister::offset(xsrc, 0, 4 * typeSize(getGenType(srcType)));
+            }
+          sel.pop();
+        } else
+          sel.MOV(xdst, xsrc);
+      }
+      sel.pop();
+
+      return true;
+    }
+    DECL_CTOR(BitCastInstruction, 1, 1);
+  };
+
+  /*! Convert instruction pattern */
+  DECL_PATTERN(ConvertInstruction)
+  {
+
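+    // Try to replace a 64 bit source with a 32 bit immediate or register when
+    // the value provably fits, so that the conversion can avoid the full
+    // 64 bit path.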
+    INLINE bool lowerI64Reg(Selection::Opaque &sel, SelectionDAG *dag, GenRegister &src, uint32_t type) const {
+      using namespace ir;
+      GBE_ASSERT(type == GEN_TYPE_UD || type == GEN_TYPE_F);
+      if (dag->insn.getOpcode() == OP_LOADI) {
+        const auto &immInsn = cast<LoadImmInstruction>(dag->insn);
+        const auto imm = immInsn.getImmediate();
+        const Type immType = immInsn.getType();
+        if (immType == TYPE_S64 &&
+          imm.getIntegerValue() <= INT_MAX &&
+          imm.getIntegerValue() >= INT_MIN) {
+          src = GenRegister::immd((int32_t)imm.getIntegerValue());
+          return true;
+        } else if (immType == TYPE_U64 &&
+                   imm.getIntegerValue() <= UINT_MAX) {
+          src = GenRegister::immud((uint32_t)imm.getIntegerValue());
+          return true;
+        }
+      } else if (dag->insn.getOpcode() == OP_CVT) {
+        const auto cvtInsn = cast<ConvertInstruction>(dag->insn);
+        auto srcType = cvtInsn.getSrcType();
+        if (((srcType == TYPE_U32 || srcType == TYPE_S32) &&
+            (type == GEN_TYPE_UD || type == GEN_TYPE_D)) ||
+             ((srcType == TYPE_FLOAT) && type == GEN_TYPE_F)) {
+          src = GenRegister::retype(sel.selReg(cvtInsn.getSrc(0), srcType), type);
+          dag->isRoot = 1;
+          return true;
+        } else if (srcType == TYPE_FLOAT ||
+                   srcType == TYPE_U16 ||
+                   srcType == TYPE_S16 ||
+                   srcType == TYPE_U32 ||
+                   srcType == TYPE_S32) {
+          src = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32), type);
+          dag->isRoot = 1;
+          sel.MOV(src, sel.selReg(cvtInsn.getSrc(0), srcType));
+          return true;
+        }
+      }
+      return false;
+    }
+
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::ConvertInstruction &insn, bool &markChildren) const
+    {
+      using namespace ir;
+      const Type dstType = insn.getDstType();
+      const Type srcType = insn.getSrcType();
+      const RegisterFamily dstFamily = getFamily(dstType);
+      const RegisterFamily srcFamily = getFamily(srcType);
+      const GenRegister dst = sel.selReg(insn.getDst(0), dstType);
+      const GenRegister src = sel.selReg(insn.getSrc(0), srcType);
+      const Opcode opcode = insn.getOpcode();
+      sel.push();
+        if (sel.isScalarReg(insn.getDst(0)) == true) {
+          sel.curr.execWidth = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+        }
+      if(opcode == ir::OP_SAT_CVT)
+        sel.curr.saturate = 1;
+
+      // We need two instructions to make the conversion
+      if (opcode == OP_F16TO32) {
+        sel.F16TO32(dst, src);
+      } else if (opcode == OP_F32TO16) {
+        GenRegister unpacked;
+        unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
+        sel.push();
+          if (sel.isScalarReg(insn.getSrc(0))) {
+            sel.curr.execWidth = 1;
+            sel.curr.predicate = GEN_PREDICATE_NONE;
+            sel.curr.noMask = 1;
+          }
+          sel.F32TO16(unpacked, src);
+        sel.pop();
+        sel.MOV(dst, unpacked);
+      } else if (dstFamily != FAMILY_DWORD && dstFamily != FAMILY_QWORD && (srcFamily == FAMILY_DWORD || srcFamily == FAMILY_QWORD)) {
+        GenRegister unpacked;
+        if (dstFamily == FAMILY_WORD) {
+          const uint32_t type = dstType == TYPE_U16 ? GEN_TYPE_UW : GEN_TYPE_W;
+          if (!sel.isScalarReg(dst.reg())) {
+            unpacked = sel.unpacked_uw(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, type);
+          } else
+            unpacked = GenRegister::retype(sel.unpacked_uw(dst.reg()), type);
+        } else {
+          const uint32_t type = dstType == TYPE_U8 ? GEN_TYPE_UB : GEN_TYPE_B;
+          if (!sel.isScalarReg(dst.reg())) {
+            unpacked = sel.unpacked_ub(sel.reg(FAMILY_DWORD, sel.isScalarReg(insn.getSrc(0))));
+            unpacked = GenRegister::retype(unpacked, type);
+          } else
+            unpacked = GenRegister::retype(sel.unpacked_ub(dst.reg()), type);
+        }
+        if(srcFamily == FAMILY_QWORD) {
+          GenRegister tmp = sel.selReg(sel.reg(FAMILY_DWORD));
+          tmp.type = GEN_TYPE_D;
+          sel.CONVI64_TO_I(tmp, src);
+          sel.MOV(unpacked, tmp);
+        } else {
+          sel.push();
+            if (sel.isScalarReg(insn.getSrc(0))) {
+              sel.curr.execWidth = 1;
+              sel.curr.predicate = GEN_PREDICATE_NONE;
+              sel.curr.noMask = 1;
+            }
+            sel.MOV(unpacked, src);
+          sel.pop();
+        }
+        if (unpacked.reg() != dst.reg())
+          sel.MOV(dst, unpacked);
+      } else if ((dstType == ir::TYPE_S32 || dstType == ir::TYPE_U32) &&
+                 (srcType == ir::TYPE_U64 || srcType == ir::TYPE_S64))
+        sel.CONVI64_TO_I(dst, src);
+      else if (dstType == ir::TYPE_FLOAT && (srcType == ir::TYPE_U64 || srcType == ir::TYPE_S64)) {
+        auto dag = sel.regDAG[src.reg()];
+        // FIXME: in the future we need a common I64-to-I32 lowering analysis
+        // at the LLVM IR layer which could cover more cases than just this one.
+        SelectionDAG *dag0, *dag1;
+        if (dag && dag->child[0] && dag->child[1]) {
+          if (dag->child[0]->insn.getOpcode() == OP_LOADI) {
+            dag0 = dag->child[1];
+            dag1 = dag->child[0];
+          } else {
+            dag0 = dag->child[0];
+            dag1 = dag->child[1];
+          }
+          GBE_ASSERT(!(dag->child[0]->insn.getOpcode() == OP_LOADI &&
+                       dag->child[1]->insn.getOpcode() == OP_LOADI));
+          if (dag->insn.getOpcode() == OP_AND ||
+              dag->insn.getOpcode() == OP_OR  ||
+              dag->insn.getOpcode() == OP_XOR) {
+            GenRegister src0;
+            GenRegister src1;
+            if (lowerI64Reg(sel, dag0, src0, GEN_TYPE_UD) &&
+                lowerI64Reg(sel, dag1, src1, GEN_TYPE_UD)) {
+              switch (dag->insn.getOpcode()) {
+                default:
+                case OP_AND: sel.AND(GenRegister::retype(dst, GEN_TYPE_UD), src0, src1); break;
+                case OP_OR:  sel.OR(GenRegister::retype(dst, GEN_TYPE_UD), src0, src1); break;
+                case OP_XOR: sel.XOR(GenRegister::retype(dst, GEN_TYPE_UD), src0, src1); break;
+              }
+              sel.MOV(dst, GenRegister::retype(dst, GEN_TYPE_UD));
+              markChildren = false;
+              return true;
+            }
+          }
+        }
+        GenRegister tmp[6];
+        for(int i=0; i<6; i++) {
+          tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        }
+        sel.push();
+          sel.curr.flag = 0;
+          sel.curr.subFlag = 1;
+          sel.CONVI64_TO_F(dst, src, tmp);
+        sel.pop();
+      } else if ((dst.isdf() && srcType == ir::TYPE_FLOAT) ||
+                 (src.isdf() && dstType == ir::TYPE_FLOAT)) {
+        ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
+        sel.MOV_DF(dst, src, sel.selReg(r));
+      } else if (dst.isint64()) {
+        switch(src.type) {
+          case GEN_TYPE_F:
+          {
+            GenRegister tmp[2];
+            tmp[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+            tmp[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_FLOAT);
+            sel.push();
+              sel.curr.flag = 0;
+              sel.curr.subFlag = 1;
+              sel.CONVF_TO_I64(dst, src, tmp);
+            sel.pop();
+            break;
+          }
+          case GEN_TYPE_DF:
+            NOT_IMPLEMENTED;
+          default:
+            sel.CONVI_TO_I64(dst, src, sel.selReg(sel.reg(FAMILY_DWORD)));
+        }
+      } else
+        sel.MOV(dst, src);
+
+      sel.pop();
+
+      return true;
+    }
+    DECL_CTOR(ConvertInstruction, 1, 1);
+  };
+
+  /*! Atomic instruction pattern */
+  DECL_PATTERN(AtomicInstruction)
+  {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::AtomicInstruction &insn, bool &markChildren) const
+    {
+      using namespace ir;
+      const AtomicOps atomicOp = insn.getAtomicOpcode();
+      const AddressSpace space = insn.getAddressSpace();
+      const uint32_t srcNum = insn.getSrcNum();
+
+      GenRegister src0 = sel.selReg(insn.getSrc(0), TYPE_U32);   //address
+      GenRegister src1 = src0, src2 = src0;
+      if(srcNum > 1) src1 = sel.selReg(insn.getSrc(1), TYPE_U32);
+      if(srcNum > 2) src2 = sel.selReg(insn.getSrc(2), TYPE_U32);
+      GenRegister dst  = sel.selReg(insn.getDst(0), TYPE_U32);
+      GenAtomicOpCode genAtomicOp = (GenAtomicOpCode)atomicOp;
+      if(space == MEM_LOCAL) {
+        if (sel.needPatchSLMAddr()) {
+          GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+          sel.ADD(temp, src0, sel.selReg(ocl::slmoffset, ir::TYPE_U32));
+          src0 = temp;
+        }
+        sel.ATOMIC(dst, genAtomicOp, srcNum, src0, src1, src2, 0xfe);
+      } else {
+        ir::BTI b = insn.getBTI();
+        for (int x = 0; x < b.count; x++) {
+          sel.push();
+            sel.curr.noMask = 1;
+            GenRegister temp = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+            sel.ADD(temp, src0, GenRegister::negate(sel.selReg(sel.ctx.getSurfaceBaseReg(b.bti[x]), ir::TYPE_U32)));
+          sel.pop();
+          sel.ATOMIC(dst, genAtomicOp, srcNum, temp, src1, src2, b.bti[x]);
+        }
+      }
+      return true;
+    }
+    DECL_CTOR(AtomicInstruction, 1, 1);
+  };
+
+  /*! Select instruction pattern */
+  class SelectInstructionPattern : public SelectionPattern
+  {
+  public:
+    SelectInstructionPattern(void) : SelectionPattern(1,1) {
+      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+        if (ir::isOpcodeFrom<ir::SelectInstruction>(ir::Opcode(op)) == true)
+          this->opcodes.push_back(ir::Opcode(op));
+    }
+
+    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const
+    {
+      using namespace ir;
+      const ir::SelectInstruction &insn = cast<SelectInstruction>(dag.insn);
+
+      // Get all registers for the instruction
+      const Type type = insn.getType();
+      const GenRegister dst  = sel.selReg(insn.getDst(0), type);
+
+      // Look for immediate values for the right source
+      GenRegister src0, src1;
+      SelectionDAG *dag0 = dag.child[0]; // source 0 is the predicate!
+      SelectionDAG *dag1 = dag.child[1];
+      SelectionDAG *dag2 = dag.child[2];
+
+      if (dag0) dag0->isRoot = 1;
+      bool inverse = false;
+      sel.getSrcGenRegImm(dag, dag1, dag2, src0, src1, type, inverse);
+      const Register pred = insn.getPredicate();
+      sel.push();
+        if (sel.isScalarReg(insn.getDst(0)) == true) {
+          sel.curr.execWidth = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+        }
+        sel.curr.inversePredicate ^= inverse;
+        sel.curr.physicalFlag = 0;
+        sel.curr.flagIndex = (uint16_t) pred;
+        sel.curr.predicate = GEN_PREDICATE_NORMAL;
+        if (!dag0)
+          sel.curr.externFlag = 1;
+        if(type == ir::TYPE_S64 || type == ir::TYPE_U64)
+          sel.SEL_INT64(dst, src0, src1);
+        else
+          sel.SEL(dst, src0, src1);
+      sel.pop();
+
+      return true;
+    }
+  };
+
+  DECL_PATTERN(TernaryInstruction)
+   {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::TernaryInstruction &insn, bool &markChildren) const {
+      using namespace ir;
+      const Type type = insn.getType();
+      const GenRegister dst = sel.selReg(insn.getDst(0), type),
+                        src0 = sel.selReg(insn.getSrc(0), type),
+                        src1 = sel.selReg(insn.getSrc(1), type),
+                        src2 = sel.selReg(insn.getSrc(2), type);
+      switch(insn.getOpcode()) {
+        case OP_I64MADSAT:
+         {
+          GenRegister tmp[9];
+          for(int i=0; i<9; i++) {
+            tmp[i] = sel.selReg(sel.reg(FAMILY_DWORD));
+            tmp[i].type = GEN_TYPE_UD;
+          }
+          sel.push();
+            sel.curr.flag = 0;
+            sel.curr.subFlag = 1;
+            sel.I64MADSAT(dst, src0, src1, src2, tmp);
+          sel.pop();
+          break;
+         }
+        case OP_MAD:
+         {
+          sel.MAD(dst, src2, src0, src1);
+          break;
+         }
+        default:
+          NOT_IMPLEMENTED;
+      }
+      return true;
+    }
+
+    DECL_CTOR(TernaryInstruction, 1, 1);
+   };
+
+
+  /*! Label instruction pattern */
+  DECL_PATTERN(LabelInstruction)
+  {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::LabelInstruction &insn, bool &markChildren) const
+    {
+      using namespace ir;
+      const LabelIndex label = insn.getLabelIndex();
+      const GenRegister src0 = sel.selReg(ocl::blockip);
+      const GenRegister src1 = GenRegister::immuw(label);
+      const uint32_t simdWidth = sel.ctx.getSimdWidth();
+      GBE_ASSERTM(label < GEN_MAX_LABEL, "We reached the maximum label number which is reserved for barrier handling");
+      sel.LABEL(label);
+
+      // Do not emit any code for the "returning" block. There is no need for it
+      if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock())
+        return true;
+
+      LabelIndex jip;
+      const LabelIndex nextLabel = insn.getParent()->getNextBlock()->getLabelIndex();
+      if (sel.ctx.hasJIP(&insn))
+        jip = sel.ctx.getLabelIndex(&insn);
+      else
+        jip = nextLabel;
+
+      // Emit the mask computation at the head of each basic block
+      sel.push();
+        sel.curr.noMask = 1;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.CMP(GEN_CONDITIONAL_LE, GenRegister::retype(src0, GEN_TYPE_UW), src1,
+                GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+      sel.pop();
+
+      if (sel.block->hasBarrier) {
+        // If this block has a barrier, we don't execute the block until all
+        // lanes have reached it. Set each reached lane to 1, then check all
+        // lanes; if any lane has not been reached, we jump to jip. There is no
+        // need to issue if/endif for this block, as it will always execute
+        // with all lanes activated.
+        sel.push();
+          sel.curr.predicate = GEN_PREDICATE_NORMAL;
+          sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL));
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          sel.CMP(GEN_CONDITIONAL_EQ, GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw(GEN_MAX_LABEL),
+                  GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+          if (simdWidth == 8)
+            sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+          else if (simdWidth == 16)
+            sel.curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+          else
+            NOT_IMPLEMENTED;
+          sel.curr.noMask = 1;
+          sel.curr.execWidth = 1;
+          sel.curr.inversePredicate = 1;
+          sel.JMPI(GenRegister::immd(0), jip, label);
+        sel.pop();
+        // FIXME: if the last BRA is an unconditional jump, we don't need to update the label here.
+        sel.push();
+         sel.curr.predicate = GEN_PREDICATE_NORMAL;
+         sel.MOV(GenRegister::retype(src0, GEN_TYPE_UW), GenRegister::immuw((uint16_t)label));
+        sel.pop();
+      }
+      else {
+        if (sel.ctx.hasJIP(&insn) &&
+            // If we jump to the next label and the endif offset is -1, then
+            // we don't need to add a jmpi here, as the following IF will do the
+            // same thing if all channels are disabled.
+            (jip != nextLabel || sel.block->endifOffset != -1)) {
+          // If it is required, insert a JUMP to bypass the block
+          sel.push();
+            if (simdWidth == 8)
+              sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+            else if (simdWidth == 16)
+              sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+            else
+              NOT_IMPLEMENTED;
+            sel.curr.noMask = 1;
+            sel.curr.execWidth = 1;
+            sel.curr.inversePredicate = 1;
+            sel.JMPI(GenRegister::immd(0), jip, label);
+          sel.pop();
+        }
+        sel.push();
+          sel.curr.predicate = GEN_PREDICATE_NORMAL;
+          sel.IF(GenRegister::immd(0), sel.block->endifLabel, sel.block->endifLabel);
+        sel.pop();
+      }
+
+      return true;
+    }
+    DECL_CTOR(LabelInstruction, 1, 1);
+  };
+
+  DECL_PATTERN(SampleInstruction)
+  {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::SampleInstruction &insn, bool &markChildren) const
+    {
+      using namespace ir;
+      GenRegister msgPayloads[4];
+      GenRegister dst[insn.getDstNum()];
+      uint32_t srcNum = insn.getSrcNum();
+      uint32_t valueID = 0;
+      uint32_t msgLen = 0;
+
+      for (valueID = 0; valueID < insn.getDstNum(); ++valueID)
+        dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
+
+      GBE_ASSERT(srcNum == 3);
+      if (insn.getSrc(1) == ir::ocl::invalid) //not 3D
+        srcNum = 1;
+      else if (insn.getSrc(2) == ir::ocl::invalid)
+        srcNum = 2;
+
+      if (insn.getSamplerOffset() != 0) {
+        // U, lod, [V], [W]
+        GBE_ASSERT(insn.getSrcType() != TYPE_FLOAT);
+        msgPayloads[0] = sel.selReg(insn.getSrc(0), insn.getSrcType());
+        msgPayloads[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        if (srcNum > 1)
+          msgPayloads[2] = sel.selReg(insn.getSrc(1), insn.getSrcType());
+        if (srcNum > 2)
+          msgPayloads[3] = sel.selReg(insn.getSrc(2), insn.getSrcType());
+        // Clear the lod to zero.
+        sel.MOV(msgPayloads[1], GenRegister::immud(0));
+        msgLen = srcNum + 1;
+      } else {
+        // U, V, [W]
+        GBE_ASSERT(insn.getSrcType() == TYPE_FLOAT);
+        for (valueID = 0; valueID < srcNum; ++valueID)
+          msgPayloads[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+        msgLen = srcNum;
+      }
+      // We switch to a fixup bti for linear filtering on image1d array sampling.
+      uint32_t bti = insn.getImageIndex() + (insn.getSamplerOffset() == 2 ? BTI_MAX_IMAGE_NUM : 0);
+      if (bti > 253) {
+        std::cerr << "Too large bti " << bti;
+        return false;
+      }
+      uint32_t sampler = insn.getSamplerIndex();
+
+      sel.SAMPLE(dst, insn.getDstNum(), msgPayloads, msgLen, bti, sampler, insn.getSamplerOffset() != 0, false);
+      return true;
+    }
+    DECL_CTOR(SampleInstruction, 1, 1);
+  };
+
+  /*! Typed write instruction pattern. */
+  DECL_PATTERN(TypedWriteInstruction)
+  {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::TypedWriteInstruction &insn, bool &markChildren) const
+    {
+      using namespace ir;
+      const uint32_t simdWidth = sel.ctx.getSimdWidth();
+      GenRegister msgs[9]; // (header + U + V + R + LOD + 4)
+      const uint32_t msgNum = (8 / (simdWidth / 8)) + 1;
+      const uint32_t coordNum = 3;
+
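+      // For SIMD16, two logical payload elements share one physical register
+      // (selected with Qn below), so the payload is assembled one SIMD8
+      // quarter at a time.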
+      if (simdWidth == 16) {
+        for(uint32_t i = 0; i < msgNum; i++)
+          msgs[i] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+      } else {
+        uint32_t valueID = 0;
+        msgs[0] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        for(uint32_t msgID = 1; msgID < 1 + coordNum; msgID++, valueID++)
+          msgs[msgID] = sel.selReg(insn.getSrc(msgID - 1), insn.getCoordType());
+
+        // fake v.
+        if (insn.getSrc(1) == ir::ocl::invalid)
+          msgs[2] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        // fake w.
+        if (insn.getSrc(2) == ir::ocl::invalid)
+          msgs[3] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        // LOD.
+        msgs[4] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+        for(uint32_t msgID = 5; valueID < insn.getSrcNum(); msgID++, valueID++)
+          msgs[msgID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+      }
+
+      sel.push();
+      sel.curr.predicate = GEN_PREDICATE_NONE;
+      sel.curr.noMask = 1;
+      sel.MOV(msgs[0], GenRegister::immud(0));
+      sel.curr.execWidth = 1;
+
+      GenRegister channelEn = GenRegister::offset(msgs[0], 0, 7*4);
+      channelEn.subphysical = 1;
+      // Enable all channels.
+      sel.MOV(channelEn, GenRegister::immud(0xffff));
+      sel.curr.execWidth = 8;
+      // Set zero LOD.
+      if (simdWidth == 8)
+        sel.MOV(msgs[4], GenRegister::immud(0));
+      else
+        sel.MOV(GenRegister::Qn(msgs[2], 0), GenRegister::immud(0));
+      sel.pop();
+
+      uint32_t bti = insn.getImageIndex();
+      if (simdWidth == 8)
+        sel.TYPED_WRITE(msgs, msgNum, bti, insn.getSrc(2) != ir::ocl::invalid);
+      else {
+        sel.push();
+        sel.curr.execWidth = 8;
+        for( uint32_t quarter = 0; quarter < 2; quarter++)
+        {
+          #define QUARTER_MOV0(msgs, msgid, src) \
+                    sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], GEN_TYPE_UD), msgid % 2), \
+                            GenRegister::Qn(src, quarter))
+
+          #define QUARTER_MOV1(msgs, msgid, src) \
+                  sel.MOV(GenRegister::Qn(GenRegister::retype(msgs[msgid/2], src.type), msgid % 2), \
+                          GenRegister::Qn(src, quarter))
+          sel.curr.quarterControl = (quarter == 0) ? GEN_COMPRESSION_Q1 : GEN_COMPRESSION_Q2;
+          // Set U,V,W
+          QUARTER_MOV0(msgs, 1, sel.selReg(insn.getSrc(0), insn.getCoordType()));
+          if (insn.getSrc(1) != ir::ocl::invalid) //not 2D
+            QUARTER_MOV0(msgs, 2, sel.selReg(insn.getSrc(1), insn.getCoordType()));
+          if (insn.getSrc(2) != ir::ocl::invalid) //not 3D
+            QUARTER_MOV0(msgs, 3, sel.selReg(insn.getSrc(2), insn.getCoordType()));
+          // Set R, G, B, A
+          QUARTER_MOV1(msgs, 5, sel.selReg(insn.getSrc(3), insn.getSrcType()));
+          QUARTER_MOV1(msgs, 6, sel.selReg(insn.getSrc(4), insn.getSrcType()));
+          QUARTER_MOV1(msgs, 7, sel.selReg(insn.getSrc(5), insn.getSrcType()));
+          QUARTER_MOV1(msgs, 8, sel.selReg(insn.getSrc(6), insn.getSrcType()));
+          sel.TYPED_WRITE(msgs, msgNum, bti, insn.getSrc(2) != ir::ocl::invalid);
+          #undef QUARTER_MOV0
+          #undef QUARTER_MOV1
+        }
+        sel.pop();
+      }
+      return true;
+    }
+    DECL_CTOR(TypedWriteInstruction, 1, 1);
+  };
+
+  /*! Get image info instruction pattern. */
+  DECL_PATTERN(GetImageInfoInstruction)
+  {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::GetImageInfoInstruction &insn, bool &markChildren) const
+    {
+      using namespace ir;
+      GenRegister dst;
+      dst = sel.selReg(insn.getDst(0), TYPE_U32);
+      GenRegister imageInfoReg = GenRegister::ud1grf(insn.getSrc(0));
+      sel.MOV(dst, imageInfoReg);
+
+      return true;
+    }
+    DECL_CTOR(GetImageInfoInstruction, 1, 1);
+  };
+
+  /*! Branch instruction pattern */
+  class BranchInstructionPattern : public SelectionPattern
+  {
+  public:
+    BranchInstructionPattern(void) : SelectionPattern(1,1) {
+      for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+        if (ir::isOpcodeFrom<ir::BranchInstruction>(ir::Opcode(op)) == true)
+          this->opcodes.push_back(ir::Opcode(op));
+    }
+
+    void emitForwardBranch(Selection::Opaque &sel,
+                           const ir::BranchInstruction &insn,
+                           ir::LabelIndex dst,
+                           ir::LabelIndex src) const
+    {
+      using namespace ir;
+      const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16);
+
+      // We will not emit any jump if we must go to the next block anyway
+      const BasicBlock *curr = insn.getParent();
+      const BasicBlock *next = curr->getNextBlock();
+      const LabelIndex nextLabel = next->getLabelIndex();
+      if (insn.isPredicated() == true) {
+        const Register pred = insn.getPredicateIndex();
+        sel.push();
+          // We don't need to set the next label in the PcIP here:
+          // if there is no backward jump later, everything obviously works fine,
+          // and if there is a backward jump later, all the PcIPs will be updated correctly there.
+          sel.curr.physicalFlag = 0;
+          sel.curr.flagIndex = (uint16_t) pred;
+          sel.curr.predicate = GEN_PREDICATE_NORMAL;
+          sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          if (!sel.block->hasBarrier)
+            sel.ENDIF(GenRegister::immd(0), nextLabel);
+          sel.block->endifOffset = -1;
+        sel.pop();
+      } else {
+        // Update the PcIPs
+        const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
+        sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+        if (!sel.block->hasBarrier)
+          sel.ENDIF(GenRegister::immd(0), nextLabel);
+        sel.block->endifOffset = -1;
+        if (nextLabel == jip) return;
+        // Branch to the jump target
+        sel.push();
+          sel.curr.execWidth = 1;
+          sel.curr.noMask = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.block->endifOffset -= sel.JMPI(GenRegister::immd(0), jip, curr->getLabelIndex());
+        sel.pop();
+      }
+    }
+
+    void emitBackwardBranch(Selection::Opaque &sel,
+                            const ir::BranchInstruction &insn,
+                            ir::LabelIndex dst,
+                            ir::LabelIndex src) const
+    {
+      using namespace ir;
+      const GenRegister ip = sel.selReg(ocl::blockip, TYPE_U16);
+      const Function &fn = sel.ctx.getFunction();
+      const BasicBlock &bb = fn.getBlock(src);
+      const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
+      const LabelIndex label = bb.getLabelIndex();
+      const uint32_t simdWidth = sel.ctx.getSimdWidth();
+      GBE_ASSERT(bb.getNextBlock() != NULL);
+
+      if (insn.isPredicated() == true) {
+        const Register pred = insn.getPredicateIndex();
+
+        // Update the PcIPs for all the branches. Just put the IPs of the next
+        // block. Next instruction will properly update the IPs of the lanes
+        // that actually take the branch
+        const LabelIndex next = bb.getNextBlock()->getLabelIndex();
+        sel.MOV(ip, GenRegister::immuw(uint16_t(next)));
+        GBE_ASSERT(jip == dst);
+        sel.push();
+          sel.curr.physicalFlag = 0;
+          sel.curr.flagIndex = (uint16_t) pred;
+          sel.curr.predicate = GEN_PREDICATE_NORMAL;
+          sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+          sel.block->endifOffset = -1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          if (!sel.block->hasBarrier)
+            sel.ENDIF(GenRegister::immd(0), next);
+          sel.curr.execWidth = 1;
+          if (simdWidth == 16)
+            sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+          else
+            sel.curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+          sel.curr.noMask = 1;
+          sel.block->endifOffset -= sel.JMPI(GenRegister::immd(0), jip, label);
+        sel.pop();
+      } else {
+        const LabelIndex next = bb.getNextBlock()->getLabelIndex();
+        // Update the PcIPs
+        sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+        sel.block->endifOffset = -1;
+        if (!sel.block->hasBarrier)
+          sel.ENDIF(GenRegister::immd(0), next);
+        // Branch to the jump target
+        sel.push();
+          sel.curr.execWidth = 1;
+          sel.curr.noMask = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.block->endifOffset -= sel.JMPI(GenRegister::immd(0), jip, label);
+        sel.pop();
+      }
+    }
+
+    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
+      using namespace ir;
+      const ir::BranchInstruction &insn = cast<BranchInstruction>(dag.insn);
+      const Opcode opcode = insn.getOpcode();
+      if (opcode == OP_RET)
+        sel.EOT();
+      else if (opcode == OP_BRA) {
+        const LabelIndex dst = insn.getLabelIndex();
+        const LabelIndex src = insn.getParent()->getLabelIndex();
+
+        sel.push();
+        if (insn.isPredicated() == true) {
+          if (dag.child[0] == NULL)
+            sel.curr.externFlag = 1;
+        }
+
+        // We handle forward and backward branches differently
+        if (uint32_t(dst) <= uint32_t(src))
+          this->emitBackwardBranch(sel, insn, dst, src);
+        else
+          this->emitForwardBranch(sel, insn, dst, src);
+        sel.pop();
+      } else
+        NOT_IMPLEMENTED;
+
+      markAllChildren(dag);
+      return true;
+    }
+
+  };
+
+  /*! Sort patterns */
+  INLINE bool cmp(const SelectionPattern *p0, const SelectionPattern *p1) {
+    if (p0->insnNum != p1->insnNum)
+      return p0->insnNum > p1->insnNum;
+    return p0->cost < p1->cost;
+  }
+
+  SelectionLibrary::SelectionLibrary(void) {
+    this->insert<UnaryInstructionPattern>();
+    this->insert<BinaryInstructionPattern>();
+    this->insert<TypedWriteInstructionPattern>();
+    this->insert<SyncInstructionPattern>();
+    this->insert<LoadImmInstructionPattern>();
+    this->insert<LoadInstructionPattern>();
+    this->insert<StoreInstructionPattern>();
+    this->insert<SelectInstructionPattern>();
+    this->insert<CompareInstructionPattern>();
+    this->insert<BitCastInstructionPattern>();
+    this->insert<ConvertInstructionPattern>();
+    this->insert<AtomicInstructionPattern>();
+    this->insert<TernaryInstructionPattern>();
+    this->insert<LabelInstructionPattern>();
+    this->insert<BranchInstructionPattern>();
+    this->insert<Int32x32MulInstructionPattern>();
+    this->insert<Int32x16MulInstructionPattern>();
+    this->insert<MulAddInstructionPattern>();
+    this->insert<SelectModifierInstructionPattern>();
+    this->insert<SampleInstructionPattern>();
+    this->insert<GetImageInfoInstructionPattern>();
+
+    // Sort all the patterns by the number of instructions they output
+    for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
+      std::sort(this->patterns[op].begin(), this->patterns[op].end(), cmp);
+  }
+
+  SelectionLibrary::~SelectionLibrary(void) {
+    for (auto pattern : this->toFree)
+      GBE_DELETE(const_cast<SelectionPattern*>(pattern));
+  }
+
+  template <typename PatternType>
+  void SelectionLibrary::insert(void) {
+    const SelectionPattern *pattern = GBE_NEW_NO_ARG(PatternType);
+    this->toFree.push_back(pattern);
+    for (auto opcode : pattern->opcodes)
+      this->patterns[opcode].push_back(pattern);
+  }
+
+} /* namespace gbe */
+
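For reference, the comparator above orders each opcode's pattern list so that patterns covering more IR instructions are tried first, with ties broken by lower cost. A minimal standalone sketch of that ordering (PatternInfo and byCoverageThenCost are hypothetical stand-ins, not names from the library):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Hypothetical stand-in for SelectionPattern: only the two fields the
    // comparator inspects.
    struct PatternInfo { uint32_t insnNum; uint32_t cost; };

    // Larger coverage first, then lower cost, mirroring cmp() above.
    static bool byCoverageThenCost(const PatternInfo &p0, const PatternInfo &p1) {
      if (p0.insnNum != p1.insnNum)
        return p0.insnNum > p1.insnNum;
      return p0.cost < p1.cost;
    }

    int main() {
      std::vector<PatternInfo> patterns = {{1, 1}, {2, 3}, {2, 1}};
      std::sort(patterns.begin(), patterns.end(), byCoverageThenCost);
      // Resulting order: {2,1}, {2,3}, {1,1} -- multi-instruction patterns
      // are tried before the single-instruction fallbacks.
      return 0;
    }
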
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
new file mode 100644
index 0000000..9bcce6f
--- /dev/null
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -0,0 +1,290 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file gen_insn_selection.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GEN_INSN_SELECTION_HPP__
+#define __GEN_INSN_SELECTION_HPP__
+
+#include "ir/register.hpp"
+#include "ir/instruction.hpp"
+#include "backend/gen_register.hpp"
+#include "backend/gen_encoder.hpp"
+#include "backend/gen_context.hpp"
+#include "backend/gen_reg_allocation.hpp"
+#include "sys/vector.hpp"
+#include "sys/intrusive_list.hpp"
+
+namespace gbe
+{
+  /*! Translate IR type to Gen type */
+  uint32_t getGenType(ir::Type type);
+  /*! Translate Gen type to IR type */
+  ir::Type getIRType(uint32_t genType);
+
+  /*! Translate IR compare to Gen compare */
+  uint32_t getGenCompare(ir::Opcode opcode);
+
+  #define GEN_MAX_LABEL 0xFFFF
+
+  /*! Selection opcodes properly encoded from 0 to n for fast jump table
+   *  generation
+   */
+  enum SelectionOpcode {
+#define DECL_SELECTION_IR(OP, FN) SEL_OP_##OP,
+#include "backend/gen_insn_selection.hxx"
+#undef DECL_SELECTION_IR
+  };
+
+  // Owns and Allocates selection instructions
+  class Selection;
+
+  // List of SelectionInstruction forms a block
+  class SelectionBlock;
+
+  /*! A selection instruction is also almost a Gen instruction but *before* the
+   *  register allocation
+   */
+  class SelectionInstruction : public NonCopyable, public intrusive_list_node
+  {
+  public:
+    /*! Owns the instruction */
+    SelectionBlock *parent;
+    /*! Insert an instruction before this one */
+    void prepend(SelectionInstruction &insn);
+    /*! Append an instruction after this one */
+    void append(SelectionInstruction &insn);
+    /*! Does it read memory? */
+    bool isRead(void) const;
+    /*! Does it write memory? */
+    bool isWrite(void) const;
+    /*! Is it a branch instruction (i.e. does it modify control flow)? */
+    bool isBranch(void) const;
+    /*! Is it a label instruction (i.e. does it change the implicit mask)? */
+    bool isLabel(void) const;
+    /*! Get the destination register */
+    GenRegister &dst(uint32_t dstID) { return regs[dstID]; }
+    /*! Get the source register */
+    GenRegister &src(uint32_t srcID) { return regs[dstNum+srcID]; }
+    /*! Damn C++ */
+    const GenRegister &dst(uint32_t dstID) const { return regs[dstID]; }
+    /*! Damn C++ */
+    const GenRegister &src(uint32_t srcID) const { return regs[dstNum+srcID]; }
+    /*! No more than 9 sources (used by typed writes in SIMD8 mode) */
+    enum { MAX_SRC_NUM = 9 };
+    /*! No more than 16 destinations (15 used by I64DIV/I64REM) */
+    enum { MAX_DST_NUM = 16 };
+    /*! State of the instruction (extra fields needed for the encoding) */
+    GenInstructionState state;
+    union {
+      struct {
+        /*! Store bti for loads/stores and function for math, atomic and compares */
+        uint16_t function:8;
+        /*! elemSize for byte scatters / gathers, elemNum for untyped msg, bti for atomic */
+        uint16_t elem:8;
+      };
+      struct {
+        /*! Number of sources in the tuple */
+        uint16_t width:4;
+        /*! vertical stride (0,1,2,4,8 or 16) */
+        uint16_t vstride:5;
+        /*! horizontal stride (0,1,2,4,8 or 16) */
+        uint16_t hstride:5;
+        /*! offset (0 to 7) */
+        uint16_t offset:5;
+      };
+      struct {
+        uint16_t scratchOffset;
+        uint16_t scratchMsgHeader;
+      };
+      struct {
+        uint16_t bti:8;
+        uint16_t msglen:5;
+        uint16_t is3DWrite:1;
+      };
+      struct {
+        uint16_t rdbti:8;
+        uint16_t sampler:5;
+        uint16_t rdmsglen:3;
+        bool     isLD;  // is this a ld message?
+        bool     isUniform;
+      };
+      uint32_t barrierType;
+      bool longjmp;
+    } extra;
+    /*! Gen opcode */
+    uint8_t opcode;
+    /*! Number of destinations */
+    uint8_t dstNum:5;
+    /*! Number of sources */
+    uint8_t srcNum:4;
+    /*! To store various indices */
+    uint16_t index;
+    /*! For BRC/IF to store the UIP */
+    uint16_t index1;
+    /*! instruction ID used for vector allocation. */
+    uint32_t ID;
+    /*! Variable sized. Destinations and sources go here */
+    GenRegister regs[0];
+    INLINE uint32_t getbti() const {
+      GBE_ASSERT(isRead() || isWrite());
+      switch (opcode) {
+        case SEL_OP_ATOMIC: return extra.elem;
+        case SEL_OP_BYTE_SCATTER:
+        case SEL_OP_WRITE64:
+        case SEL_OP_DWORD_GATHER:
+        case SEL_OP_UNTYPED_WRITE:
+        case SEL_OP_UNTYPED_READ:
+        case SEL_OP_BYTE_GATHER:
+        case SEL_OP_READ64: return extra.function;
+        case SEL_OP_SAMPLE: return extra.rdbti;
+        case SEL_OP_TYPED_WRITE: return extra.bti;
+        default:
+          GBE_ASSERT(0);
+      }
+      return 0;
+    }
+  private:
+    INLINE void setbti(uint32_t bti) {
+      GBE_ASSERT(isRead() || isWrite());
+      switch (opcode) {
+        case SEL_OP_ATOMIC: extra.elem = bti; return;
+        case SEL_OP_BYTE_SCATTER:
+        case SEL_OP_WRITE64:
+        case SEL_OP_UNTYPED_WRITE:
+        case SEL_OP_DWORD_GATHER:
+        case SEL_OP_UNTYPED_READ:
+        case SEL_OP_BYTE_GATHER:
+        case SEL_OP_READ64: extra.function = bti; return;
+        case SEL_OP_SAMPLE: extra.rdbti = bti; return;
+        case SEL_OP_TYPED_WRITE: extra.bti = bti; return;
+        default:
+          GBE_ASSERT(0);
+      }
+    }
+    /*! Only the Selection class can create a SelectionInstruction */
+    SelectionInstruction(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
+    // Allocates (with a linear allocator) and owns SelectionInstruction
+    friend class Selection;
+  };
+
+  /*! Instructions like sends require to make registers contiguous in GRF */
+  class SelectionVector : public NonCopyable, public intrusive_list_node
+  {
+  public:
+    SelectionVector(void);
+    /*! The instruction that requires the vector of registers */
+    SelectionInstruction *insn;
+    /*! Directly points to the selection instruction registers */
+    GenRegister *reg;
+    /*! Number of registers in the vector */
+    uint16_t regNum;
+    /*! Indicate if this is a destination or a source vector */
+    uint16_t isSrc;
+  };
+
+  // Owns the selection block
+  class Selection;
+
+  /*! A selection block is the counterpart of the IR Basic block. It contains
+   *  the instructions generated from an IR basic block
+   */
+  class SelectionBlock : public NonCopyable, public intrusive_list_node
+  {
+  public:
+    SelectionBlock(const ir::BasicBlock *bb);
+    /*! All the emitted instructions in the block */
+    intrusive_list<SelectionInstruction> insnList;
+    /*! The vectors that may be required by some instructions of the block */
+    intrusive_list<SelectionVector> vectorList;
+    /*! Extra registers needed by the block (only live in the block) */
+    gbe::vector<ir::Register> tmp;
+    /*! Associated IR basic block */
+    const ir::BasicBlock *bb;
+    /*! Append a new temporary register */
+    void append(ir::Register reg);
+    /*! Append a new selection vector in the block */
+    void append(SelectionVector *vec);
+    /*! Append a new selection instruction at the end of the block */
+    void append(SelectionInstruction *insn);
+    /*! Append a new selection instruction at the beginning of the block */
+    void prepend(SelectionInstruction *insn);
+    bool isLargeBlock;
+    ir::LabelIndex endifLabel;
+    int endifOffset;
+    bool hasBarrier;
+    bool hasBranch;
+  };
+
+  /*! Owns the selection engine */
+  class GenContext;
+  /*! Selection engine produces the pre-ISA instruction blocks */
+  class Selection
+  {
+  public:
+    /*! Initialize internal structures used for the selection */
+    Selection(GenContext &ctx);
+    /*! Release everything */
+    ~Selection(void);
+    /*! Implements the instruction selection itself */
+    void select(void);
+    /*! Get the number of instructions of the largest block */
+    uint32_t getLargestBlockSize(void) const;
+    /*! Number of register vectors in the selection */
+    uint32_t getVectorNum(void) const;
+    /*! Number of registers (temporaries are created during selection) */
+    uint32_t getRegNum(void) const;
+    /*! Get the family for the given register */
+    ir::RegisterFamily getRegisterFamily(ir::Register reg) const;
+    /*! Get the data for the given register */
+    ir::RegisterData getRegisterData(ir::Register reg) const;
+    /*! Replace a source by the returned temporary register */
+    ir::Register replaceSrc(SelectionInstruction *insn, uint32_t regID, ir::Type type = ir::TYPE_FLOAT, bool needMov = true);
+    /*! Replace a destination to the returned temporary register */
+    ir::Register replaceDst(SelectionInstruction *insn, uint32_t regID, ir::Type type = ir::TYPE_FLOAT, bool needMov = true);
+    /*! spill a register (insert spill/unspill instructions) */
+    bool spillRegs(const SpilledRegs &spilledRegs, uint32_t registerPool);
+    /*! Indicate if a register is scalar or not */
+    bool isScalarReg(const ir::Register &reg) const;
+    /*! Create a new selection instruction */
+    SelectionInstruction *create(SelectionOpcode, uint32_t dstNum, uint32_t srcNum);
+    /*! List of emitted blocks */
+    intrusive_list<SelectionBlock> *blockList;
+    /*! Actual implementation of the register allocator (use Pimpl) */
+    class Opaque;
+    /*! Created and destroyed in cpp */
+    Opaque *opaque;
+    /*! Use custom allocators */
+    GBE_CLASS(Selection);
+  };
+
+  class Selection75: public Selection
+  {
+    public:
+      /*! Initialize internal structures used for the selection */
+      Selection75(GenContext &ctx);
+  };
+
+} /* namespace gbe */
+
+#endif /*  __GEN_INSN_SELECTION_HPP__ */
+
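SelectionInstruction above keeps its operands in the trailing GenRegister regs[0] array, destinations first and sources after them, which is why src(i) reads regs[dstNum + i]; the Selection class over-allocates the object (with a linear allocator) to make room for all operands. A minimal sketch of the same layout trick (MiniInsn and its members are hypothetical; the flexible array member relies on the same GCC/Clang extension the original's regs[0] does):

    #include <cstdint>
    #include <cstdlib>
    #include <new>

    struct MiniInsn {
      uint8_t dstNum, srcNum;
      int regs[];   // flexible array member, like GenRegister regs[0] above

      // Destinations occupy regs[0 .. dstNum-1], sources follow them.
      int &dst(uint32_t i) { return regs[i]; }
      int &src(uint32_t i) { return regs[dstNum + i]; }

      // Over-allocate so the trailing array can hold every operand at once.
      // The caller releases the object with std::free in this sketch.
      static MiniInsn *create(uint8_t dstNum, uint8_t srcNum) {
        void *mem = std::malloc(sizeof(MiniInsn) + (dstNum + srcNum) * sizeof(int));
        MiniInsn *insn = new (mem) MiniInsn;
        insn->dstNum = dstNum;
        insn->srcNum = srcNum;
        return insn;
      }
    };
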
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
new file mode 100644
index 0000000..ddc9d5e
--- /dev/null
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -0,0 +1,86 @@
+DECL_SELECTION_IR(LABEL, LabelInstruction)
+DECL_SELECTION_IR(MOV, UnaryInstruction)
+DECL_SELECTION_IR(MOV_DF, UnaryWithTempInstruction)
+DECL_SELECTION_IR(LOAD_DF_IMM, UnaryWithTempInstruction)
+DECL_SELECTION_IR(LOAD_INT64_IMM, UnaryInstruction)
+DECL_SELECTION_IR(NOT, UnaryInstruction)
+DECL_SELECTION_IR(LZD, UnaryInstruction)
+DECL_SELECTION_IR(RNDZ, UnaryInstruction)
+DECL_SELECTION_IR(RNDE, UnaryInstruction)
+DECL_SELECTION_IR(RNDD, UnaryInstruction)
+DECL_SELECTION_IR(RNDU, UnaryInstruction)
+DECL_SELECTION_IR(FRC, UnaryInstruction)
+DECL_SELECTION_IR(F16TO32, UnaryInstruction)
+DECL_SELECTION_IR(F32TO16, UnaryInstruction)
+DECL_SELECTION_IR(SEL, BinaryInstruction)
+DECL_SELECTION_IR(SEL_INT64, BinaryInstruction)
+DECL_SELECTION_IR(AND, BinaryInstruction)
+DECL_SELECTION_IR(OR, BinaryInstruction)
+DECL_SELECTION_IR(XOR, BinaryInstruction)
+DECL_SELECTION_IR(I64AND, BinaryInstruction)
+DECL_SELECTION_IR(I64OR, BinaryInstruction)
+DECL_SELECTION_IR(I64XOR, BinaryInstruction)
+DECL_SELECTION_IR(SHR, BinaryInstruction)
+DECL_SELECTION_IR(SHL, BinaryInstruction)
+DECL_SELECTION_IR(RSR, BinaryInstruction)
+DECL_SELECTION_IR(RSL, BinaryInstruction)
+DECL_SELECTION_IR(ASR, BinaryInstruction)
+DECL_SELECTION_IR(I64SHR, I64ShiftInstruction)
+DECL_SELECTION_IR(I64SHL, I64ShiftInstruction)
+DECL_SELECTION_IR(I64ASR, I64ShiftInstruction)
+DECL_SELECTION_IR(ADD, BinaryInstruction)
+DECL_SELECTION_IR(I64ADD, BinaryWithTempInstruction)
+DECL_SELECTION_IR(I64SATADD, I64SATADDInstruction)
+DECL_SELECTION_IR(I64SUB, BinaryWithTempInstruction)
+DECL_SELECTION_IR(I64SATSUB, I64SATSUBInstruction)
+DECL_SELECTION_IR(MUL, BinaryInstruction)
+DECL_SELECTION_IR(I64MUL, I64MULInstruction)
+DECL_SELECTION_IR(I64DIV, I64DIVREMInstruction)
+DECL_SELECTION_IR(I64REM, I64DIVREMInstruction)
+DECL_SELECTION_IR(ATOMIC, AtomicInstruction)
+DECL_SELECTION_IR(MACH, BinaryInstruction)
+DECL_SELECTION_IR(CMP, CompareInstruction)
+DECL_SELECTION_IR(I64CMP, I64CompareInstruction)
+DECL_SELECTION_IR(SEL_CMP, CompareInstruction)
+DECL_SELECTION_IR(MAD, TernaryInstruction)
+DECL_SELECTION_IR(JMPI, JumpInstruction)
+DECL_SELECTION_IR(EOT, EotInstruction)
+DECL_SELECTION_IR(INDIRECT_MOVE, IndirectMoveInstruction)
+DECL_SELECTION_IR(NOP, NoOpInstruction)
+DECL_SELECTION_IR(WAIT, WaitInstruction)
+DECL_SELECTION_IR(MATH, MathInstruction)
+DECL_SELECTION_IR(BARRIER, BarrierInstruction)
+DECL_SELECTION_IR(FENCE, FenceInstruction)
+DECL_SELECTION_IR(UNTYPED_READ, UntypedReadInstruction)
+DECL_SELECTION_IR(UNTYPED_WRITE, UntypedWriteInstruction)
+DECL_SELECTION_IR(READ64, Read64Instruction)
+DECL_SELECTION_IR(WRITE64, Write64Instruction)
+DECL_SELECTION_IR(BYTE_GATHER, ByteGatherInstruction)
+DECL_SELECTION_IR(BYTE_SCATTER, ByteScatterInstruction)
+DECL_SELECTION_IR(DWORD_GATHER, DWordGatherInstruction)
+DECL_SELECTION_IR(PACK_BYTE, PackByteInstruction)
+DECL_SELECTION_IR(UNPACK_BYTE, UnpackByteInstruction)
+DECL_SELECTION_IR(SAMPLE, SampleInstruction)
+DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
+DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction)
+DECL_SELECTION_IR(UNSPILL_REG, UnSpillRegInstruction)
+DECL_SELECTION_IR(MUL_HI, BinaryWithTempInstruction)
+DECL_SELECTION_IR(I64_MUL_HI, I64MULHIInstruction)
+DECL_SELECTION_IR(FBH, UnaryInstruction)
+DECL_SELECTION_IR(FBL, UnaryInstruction)
+DECL_SELECTION_IR(HADD, BinaryWithTempInstruction)
+DECL_SELECTION_IR(RHADD, BinaryWithTempInstruction)
+DECL_SELECTION_IR(I64HADD, I64HADDInstruction)
+DECL_SELECTION_IR(I64RHADD, I64RHADDInstruction)
+DECL_SELECTION_IR(UPSAMPLE_SHORT, BinaryInstruction)
+DECL_SELECTION_IR(UPSAMPLE_INT, BinaryInstruction)
+DECL_SELECTION_IR(UPSAMPLE_LONG, BinaryInstruction)
+DECL_SELECTION_IR(CONVI_TO_I64, UnaryWithTempInstruction)
+DECL_SELECTION_IR(CONVI64_TO_I, UnaryInstruction)
+DECL_SELECTION_IR(CONVI64_TO_F, I64ToFloatInstruction)
+DECL_SELECTION_IR(CONVF_TO_I64, FloatToI64Instruction)
+DECL_SELECTION_IR(I64MADSAT, I64MADSATInstruction)
+DECL_SELECTION_IR(BRC, UnaryInstruction)
+DECL_SELECTION_IR(BRD, UnaryInstruction)
+DECL_SELECTION_IR(IF, UnaryInstruction)
+DECL_SELECTION_IR(ENDIF, UnaryInstruction)
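The .hxx above is an X-macro table: each includer defines DECL_SELECTION_IR to whatever it needs before including the file, so the single opcode list can generate the SelectionOpcode enum (as gen_insn_selection.hpp does) and, by re-expansion, other tables as well. A minimal sketch of the technique with a hypothetical two-entry table written inline instead of included:

    // Expansion 1: build an opcode enum from the table,
    // as the SelectionOpcode declaration above does.
    enum MiniOpcode {
    #define DECL_SELECTION_IR(OP, FN) SEL_OP_##OP,
      DECL_SELECTION_IR(LABEL, LabelInstruction)
      DECL_SELECTION_IR(MOV, UnaryInstruction)
    #undef DECL_SELECTION_IR
    };

    // Expansion 2: the very same entries can also be expanded into, e.g.,
    // a name table by redefining the macro before reusing the table.
    static const char *miniOpcodeNames[] = {
    #define DECL_SELECTION_IR(OP, FN) #OP,
      DECL_SELECTION_IR(LABEL, LabelInstruction)
      DECL_SELECTION_IR(MOV, UnaryInstruction)
    #undef DECL_SELECTION_IR
    };
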
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
new file mode 100644
index 0000000..5324587
--- /dev/null
+++ b/backend/src/backend/gen_program.cpp
@@ -0,0 +1,444 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file program.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/DataLayout.h"
+#else
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/DataLayout.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+
+#if LLVM_VERSION_MINOR >= 5
+#include "llvm/Linker/Linker.h"
+#else
+#include "llvm/Linker.h"
+#endif
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/IRReader/IRReader.h"
+
+#include "backend/program.h"
+#include "backend/gen_program.h"
+#include "backend/gen_program.hpp"
+#include "backend/gen_context.hpp"
+#include "backend/gen75_context.hpp"
+#include "backend/gen_defs.hpp"
+#include "backend/gen/gen_mesa_disasm.h"
+#include "backend/gen_reg_allocation.hpp"
+#include "ir/unit.hpp"
+#include "llvm/llvm_to_gen.hpp"
+#include "llvm/llvm_gen_backend.hpp"
+
+#include <clang/CodeGen/CodeGenAction.h>
+
+#include <cstring>
+#include <sstream>
+#include <memory>
+#include <iostream>
+#include <fstream>
+#include <mutex>
+#include <unistd.h>
+
+namespace gbe {
+
+  GenKernel::GenKernel(const std::string &name, uint32_t deviceID) :
+    Kernel(name), deviceID(deviceID), insns(NULL), insnNum(0)
+  {}
+  GenKernel::~GenKernel(void) { GBE_SAFE_DELETE_ARRAY(insns); }
+  const char *GenKernel::getCode(void) const { return (const char*) insns; }
+  const void GenKernel::setCode(const char * ins, size_t size) {
+    insns = (GenInstruction *)ins;
+    insnNum = size / sizeof(GenInstruction);
+  }
+  size_t GenKernel::getCodeSize(void) const { return insnNum * sizeof(GenInstruction); }
+
+  void GenKernel::printStatus(int indent, std::ostream& outs) {
+#ifdef GBE_COMPILER_AVAILABLE
+    Kernel::printStatus(indent, outs);
+
+    FILE *f = fopen("/dev/null", "w");
+    char *buf = new char[4096];
+    setbuffer(f, buf, 4096);
+    GenCompactInstruction * pCom = NULL;
+    GenNativeInstruction nativeInsn;
+
+    for (uint32_t i = 0; i < insnNum;) {
+      pCom = (GenCompactInstruction*)(insns+i);
+      if(pCom->bits1.cmpt_control == 1) {
+        decompactInstruction(pCom, &nativeInsn);
+        gen_disasm(f, &nativeInsn, deviceID, 1);
+        i++;
+      } else {
+        gen_disasm(f, insns+i, deviceID, 0);
+        i = i + 2;
+      }
+      outs << buf;
+      fflush(f);
+      setbuffer(f, NULL, 0);
+      setbuffer(f, buf, 4096);
+    }
+
+    setbuffer(f, NULL, 0);
+    delete [] buf;
+    fclose(f);
+#endif
+  }
+
+  void GenProgram::CleanLlvmResource(void){
+#ifdef GBE_COMPILER_AVAILABLE
+    if(module){
+      delete (llvm::Module*)module;
+      module = NULL;
+    }
+
+    if(llvm_ctx){
+      delete (llvm::LLVMContext*)llvm_ctx;
+      llvm_ctx = NULL;
+    }
+#endif
+  }
+
+  /*! We must avoid spilling at all costs with Gen */
+  static const struct CodeGenStrategy {
+    uint32_t simdWidth;
+    uint32_t reservedSpillRegs;
+    bool limitRegisterPressure;
+  } codeGenStrategy[] = {
+    {16, 0, false},
+    {16, 10, false},
+    {8, 0, false},
+    {8, 8, false},
+    {8, 16, false},
+  };
+
+  Kernel *GenProgram::compileKernel(const ir::Unit &unit, const std::string &name, bool relaxMath) {
+#ifdef GBE_COMPILER_AVAILABLE
+    // Be careful when the simdWidth is forced by the programmer: we can detect
+    // this when the function already provides the simd width we need to use
+    // (i.e. a non-zero value).
+    const ir::Function *fn = unit.getFunction(name);
+    uint32_t codeGenNum = sizeof(codeGenStrategy) / sizeof(codeGenStrategy[0]);
+    uint32_t codeGen = 0;
+    GenContext *ctx = NULL;
+    if (fn->getSimdWidth() == 8) {
+      codeGen = 2;
+    } else if (fn->getSimdWidth() == 16) {
+      codeGenNum = 2;
+    } else if (fn->getSimdWidth() == 0) {
+      codeGen = 0;
+    } else
+      GBE_ASSERT(0);
+    Kernel *kernel = NULL;
+
+    // Stop when compilation is successful
+    if (IS_IVYBRIDGE(deviceID)) {
+      ctx = GBE_NEW(GenContext, unit, name, deviceID, relaxMath);
+    } else if (IS_HASWELL(deviceID)) {
+      ctx = GBE_NEW(Gen75Context, unit, name, deviceID, relaxMath);
+    }
+    GBE_ASSERTM(ctx != NULL, "Fail to create the gen context\n");
+
+    for (; codeGen < codeGenNum; ++codeGen) {
+      const uint32_t simdWidth = codeGenStrategy[codeGen].simdWidth;
+      const bool limitRegisterPressure = codeGenStrategy[codeGen].limitRegisterPressure;
+      const uint32_t reservedSpillRegs = codeGenStrategy[codeGen].reservedSpillRegs;
+
+      // Force the SIMD width now and try to compile
+      unit.getFunction(name)->setSimdWidth(simdWidth);
+      ctx->startNewCG(simdWidth, reservedSpillRegs, limitRegisterPressure);
+      kernel = ctx->compileKernel();
+      if (kernel != NULL) {
+        GBE_ASSERT(ctx->getErrCode() == NO_ERROR);
+        break;
+      }
+      fn->getImageSet()->clearInfo();
+      // If we get an out-of-range if/endif error, we need to set the context
+      // to if/endif fix mode and restart the previous compilation.
+      if ( ctx->getErrCode() == OUT_OF_RANGE_IF_ENDIF && !ctx->getIFENDIFFix() ) {
+        ctx->setIFENDIFFix(true);
+        codeGen--;
+      } else
+        GBE_ASSERT(!(ctx->getErrCode() == OUT_OF_RANGE_IF_ENDIF && ctx->getIFENDIFFix()));
+    }
+
+    GBE_ASSERTM(kernel != NULL, "Fail to compile kernel, may need to increase reserved registers for spilling.");
+    return kernel;
+#else
+    return NULL;
+#endif
+  }
+
+#define BINARY_HEADER_LENGTH 8
+#define IS_GEN_BINARY(binary) (*binary == '\0' && *(binary+1) == 'G'&& *(binary+2) == 'E' &&*(binary+3) == 'N' &&*(binary+4) == 'C')
+#define FILL_GEN_BINARY(binary) do{*binary = '\0'; *(binary+1) = 'G'; *(binary+2) = 'E'; *(binary+3) = 'N'; *(binary+4) = 'C';}while(0)
+#define FILL_DEVICE_ID(binary, src_hw_info) do {*(binary+5) = src_hw_info[0]; *(binary+6) = src_hw_info[1]; *(binary+7) = src_hw_info[2];}while(0)
+#define DEVICE_MATCH(typeA, src_hw_info) ((IS_IVYBRIDGE(typeA) && !strcmp(src_hw_info, "IVB")) ||  \
+                                      (IS_IVYBRIDGE(typeA) && !strcmp(src_hw_info, "BYT")) ||  \
+                                      (IS_BAYTRAIL_T(typeA) && !strcmp(src_hw_info, "BYT")) ||  \
+                                      (IS_HASWELL(typeA) && !strcmp(src_hw_info, "HSW")) )
+
+  static gbe_program genProgramNewFromBinary(uint32_t deviceID, const char *binary, size_t size) {
+    using namespace gbe;
+    std::string binary_content;
+    // the header length is 8 bytes: 1 byte is the binary type, 4 bytes are the bitcode header, 3 bytes are hw info.
+    char src_hw_info[4]="";
+    src_hw_info[0] = *(binary+5);
+    src_hw_info[1] = *(binary+6);
+    src_hw_info[2] = *(binary+7);
+
+    // check whether this is a gen binary ('\0GENC')
+    if(!IS_GEN_BINARY(binary)){
+        return NULL;
+    }
+    // check whether the current device ID matches the binary file's.
+    if(!DEVICE_MATCH(deviceID, src_hw_info)){
+      return NULL;
+    }
+
+    binary_content.assign(binary+BINARY_HEADER_LENGTH, size-BINARY_HEADER_LENGTH);
+    GenProgram *program = GBE_NEW(GenProgram, deviceID);
+    std::istringstream ifs(binary_content, std::ostringstream::binary);
+
+    if (!program->deserializeFromBin(ifs)) {
+      delete program;
+      return NULL;
+    }
+
+    //program->printStatus(0, std::cout);
+    return reinterpret_cast<gbe_program>(program);
+  }
+
+  static gbe_program genProgramNewFromLLVMBinary(uint32_t deviceID, const char *binary, size_t size) {
+#ifdef GBE_COMPILER_AVAILABLE
+    using namespace gbe;
+    std::string binary_content;
+    //the first byte stands for binary_type.
+    binary_content.assign(binary+1, size-1);
+    llvm::StringRef llvm_bin_str(binary_content);
+    llvm::LLVMContext& c = llvm::getGlobalContext();
+    llvm::SMDiagnostic Err;
+    llvm::MemoryBuffer* memory_buffer = llvm::MemoryBuffer::getMemBuffer(llvm_bin_str, "llvm_bin_str");
+    acquireLLVMContextLock();
+    llvm::Module* module = llvm::ParseIR(memory_buffer, Err, c);
+    releaseLLVMContextLock();
+    if(module == NULL){
+      GBE_ASSERT(0);
+    }
+
+    GenProgram *program = GBE_NEW(GenProgram, deviceID, module);
+
+    //program->printStatus(0, std::cout);
+    return reinterpret_cast<gbe_program>(program);
+#else
+      return NULL;
+#endif
+  }
+
+  static size_t genProgramSerializeToBinary(gbe_program program, char **binary, int binary_type) {
+    using namespace gbe;
+    size_t sz;
+    std::ostringstream oss;
+    GenProgram *prog = (GenProgram*)program;
+
+    //0 means GEN binary, 1 means LLVM bitcode compiled object, 2 means LLVM bitcode library
+    if(binary_type == 0){
+      if ((sz = prog->serializeToBin(oss)) == 0) {
+        *binary = NULL;
+        return 0;
+      }
+
+      // add a header to differentiate it from an llvm bitcode binary.
+      // the header length is 8 bytes: 1 byte is the binary type, 4 bytes are the bitcode header, 3 bytes are hw info.
+      *binary = (char *)malloc(sizeof(char) * (sz+BINARY_HEADER_LENGTH) );
+      memset(*binary, 0, sizeof(char) * (sz+BINARY_HEADER_LENGTH) );
+      FILL_GEN_BINARY(*binary);
+      char src_hw_info[4]="";
+      if(IS_IVYBRIDGE(prog->deviceID)){
+        src_hw_info[0]='I';
+        src_hw_info[1]='V';
+        src_hw_info[2]='B';
+        if(IS_BAYTRAIL_T(prog->deviceID)){
+          src_hw_info[0]='B';
+          src_hw_info[1]='Y';
+          src_hw_info[2]='T';
+        }
+      }else if(IS_HASWELL(prog->deviceID)){
+        src_hw_info[0]='H';
+        src_hw_info[1]='S';
+        src_hw_info[2]='W';
+      }
+      FILL_DEVICE_ID(*binary, src_hw_info);
+      memcpy(*binary+BINARY_HEADER_LENGTH, oss.str().c_str(), sz*sizeof(char));
+      return sz+BINARY_HEADER_LENGTH;
+    }else{
+#ifdef GBE_COMPILER_AVAILABLE
+      std::string str;
+      llvm::raw_string_ostream OS(str);
+      llvm::WriteBitcodeToFile((llvm::Module*)prog->module, OS);
+      std::string& bin_str = OS.str();
+      int llsz = bin_str.size();
+      *binary = (char *)malloc(sizeof(char) * (llsz+1) );
+      *(*binary) = binary_type;
+      memcpy(*binary+1, bin_str.c_str(), llsz);
+      return llsz+1;
+#else
+      return 0;
+#endif
+    }
+  }
+
+  static gbe_program genProgramNewFromLLVM(uint32_t deviceID,
+                                           const char *fileName,
+                                           const void* module,
+                                           const void* llvm_ctx,
+                                           size_t stringSize,
+                                           char *err,
+                                           size_t *errSize,
+                                           int optLevel)
+  {
+    using namespace gbe;
+    GenProgram *program = GBE_NEW(GenProgram, deviceID, module, llvm_ctx);
+#ifdef GBE_COMPILER_AVAILABLE
+    std::string error;
+    // Try to compile the program
+    if (program->buildFromLLVMFile(fileName, module, error, optLevel) == false) {
+      if (err != NULL && errSize != NULL && stringSize > 0u) {
+        const size_t msgSize = std::min(error.size(), stringSize-1u);
+        std::memcpy(err, error.c_str(), msgSize);
+        *errSize = error.size();
+      }
+      GBE_DELETE(program);
+      return NULL;
+    }
+#endif
+    // Everything ran fine
+    return (gbe_program) program;
+  }
+
+  static gbe_program genProgramNewGenProgram(uint32_t deviceID, const void* module,
+                                             const void* llvm_ctx)  {
+    using namespace gbe;
+    GenProgram *program = GBE_NEW(GenProgram, deviceID, module, llvm_ctx);
+    // Everything ran fine
+    return (gbe_program) program;
+  }
+
+  static void genProgramLinkFromLLVM(gbe_program           dst_program,
+                                     gbe_program           src_program,
+                                     size_t                stringSize,
+                                     char *                err,
+                                     size_t *              errSize)
+  {
+#ifdef GBE_COMPILER_AVAILABLE
+    using namespace gbe;
+    std::string errMsg;
+    if(((GenProgram*)dst_program)->module == NULL){
+      ((GenProgram*)dst_program)->module = llvm::CloneModule((llvm::Module*)((GenProgram*)src_program)->module);
+      if (errSize)
+        *errSize = 0;
+    }else{
+      // set the global variables and functions to link-once linkage to avoid redefinition errors.
+      llvm::Module* src = (llvm::Module*)((GenProgram*)src_program)->module;
+      for (llvm::Module::global_iterator I = src->global_begin(), E = src->global_end(); I != E; ++I) {
+        I->setLinkage(llvm::GlobalValue::LinkOnceAnyLinkage);
+      }
+
+      for (llvm::Module::iterator I = src->begin(), E = src->end(); I != E; ++I) {
+        llvm::Function *F = llvm::dyn_cast<llvm::Function>(I);
+        if (F && isKernelFunction(*F)) continue;
+        I->setLinkage(llvm::GlobalValue::LinkOnceAnyLinkage);
+      }
+      llvm::Module* dst = (llvm::Module*)((GenProgram*)dst_program)->module;
+      llvm::Linker::LinkModules( dst,
+                                 src,
+                                 llvm::Linker::PreserveSource,
+                                 &errMsg);
+      if (!errMsg.empty()) {
+        if (err != NULL && errSize != NULL && stringSize > 0u) {
+          // Copy at most stringSize-1 characters and NUL-terminate the message.
+          const size_t msgSize = std::min(errMsg.size(), stringSize-1u);
+          std::memcpy(err, errMsg.c_str(), msgSize);
+          err[msgSize] = '\0';
+          *errSize = errMsg.size();
+        }
+      }
+      }
+    }
+    // Everything ran fine
+#endif
+  }
+
+  static void genProgramBuildFromLLVM(gbe_program program,
+                                      size_t stringSize,
+                                      char *err,
+                                      size_t *errSize,
+                                      const char *          options)
+  {
+#ifdef GBE_COMPILER_AVAILABLE
+    using namespace gbe;
+    std::string error;
+
+    int optLevel = 1;
+
+    if(options) {
+      char *p;
+      p = strstr(const_cast<char *>(options), "-cl-opt-disable");
+      if (p)
+        optLevel = 0;
+    }
+
+    GenProgram* p = (GenProgram*) program;
+    // Try to compile the program
+    acquireLLVMContextLock();
+    llvm::Module* module = (llvm::Module*)p->module;
+
+    if (p->buildFromLLVMFile(NULL, module, error, optLevel) == false) {
+      if (err != NULL && errSize != NULL && stringSize > 0u) {
+        const size_t msgSize = std::min(error.size(), stringSize-1u);
+        std::memcpy(err, error.c_str(), msgSize);
+        *errSize = error.size();
+      }
+      GBE_DELETE(p);
+    }
+    releaseLLVMContextLock();
+#endif
+  }
+
+} /* namespace gbe */
+
+void genSetupCallBacks(void)
+{
+  gbe_program_new_from_binary = gbe::genProgramNewFromBinary;
+  gbe_program_new_from_llvm_binary = gbe::genProgramNewFromLLVMBinary;
+  gbe_program_serialize_to_binary = gbe::genProgramSerializeToBinary;
+  gbe_program_new_from_llvm = gbe::genProgramNewFromLLVM;
+  gbe_program_new_gen_program = gbe::genProgramNewGenProgram;
+  gbe_program_link_from_llvm = gbe::genProgramLinkFromLLVM;
+  gbe_program_build_from_llvm = gbe::genProgramBuildFromLLVM;
+}
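For reference, the GEN binary produced by genProgramSerializeToBinary above carries an 8-byte header: byte 0 is the '\0' binary-type marker, bytes 1-4 spell "GENC", and bytes 5-7 hold the three-letter hardware tag ("IVB", "BYT" or "HSW") that genProgramNewFromBinary checks against the current device. A minimal sketch of that check (looksLikeGenBinary is a hypothetical helper, not part of the library):

    #include <cstddef>
    #include <cstring>
    #include <string>

    // Recognize the 8-byte header written by FILL_GEN_BINARY/FILL_DEVICE_ID:
    // '\0' + "GENC" + a 3-byte hardware tag.
    static bool looksLikeGenBinary(const char *binary, size_t size,
                                   std::string &hwTag) {
      if (size < 8)  // BINARY_HEADER_LENGTH
        return false;
      if (binary[0] != '\0' || std::memcmp(binary + 1, "GENC", 4) != 0)
        return false;
      hwTag.assign(binary + 5, 3);  // e.g. "IVB", "BYT" or "HSW"
      return true;
    }
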
diff --git a/backend/src/backend/gen_program.h b/backend/src/backend/gen_program.h
new file mode 100644
index 0000000..8d37a70
--- /dev/null
+++ b/backend/src/backend/gen_program.h
@@ -0,0 +1,38 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file program.h
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ * C-like interface for the gen kernels and programs
+ */
+
+#ifndef __GBE_GEN_PROGRAM_H__
+#define __GBE_GEN_PROGRAM_H__
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <semaphore.h>
+
+/*! This will make the compiler output Gen ISA code */
+extern void genSetupCallBacks(void);
+
+#endif /* __GBE_GEN_PROGRAM_H__ */
+
diff --git a/backend/src/backend/gen_program.hpp b/backend/src/backend/gen_program.hpp
new file mode 100644
index 0000000..1b5136e
--- /dev/null
+++ b/backend/src/backend/gen_program.hpp
@@ -0,0 +1,86 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file program.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_GEN_PROGRAM_HPP__
+#define __GBE_GEN_PROGRAM_HPP__
+
+#include "backend/program.h"
+#include "backend/program.hpp"
+#include "backend/gen_defs.hpp"
+
+// Gen ISA instruction
+struct GenInstruction;
+namespace gbe
+{
+  /*! Describe a compiled kernel */
+  class GenKernel : public Kernel
+  {
+  public:
+    /*! Create an empty kernel with the given name */
+    GenKernel(const std::string &name, uint32_t deviceID);
+    /*! Destroy it */
+    virtual ~GenKernel(void);
+    /*! Implements base class */
+    virtual const char *getCode(void) const;
+    /*! Set the instruction stream (to be implemented) */
+    virtual const void setCode(const char *, size_t size);
+    /*! Implements get the code size */
+    virtual size_t getCodeSize(void) const;
+    /*! Implements printStatus*/
+    virtual void printStatus(int indent, std::ostream& outs);
+    uint32_t deviceID;      //!< Current device ID
+    GenInstruction *insns; //!< Instruction stream
+    uint32_t insnNum;      //!< Number of instructions
+    GBE_CLASS(GenKernel);  //!< Use custom allocators
+  };
+
+  /*! Describe a compiled program */
+  class GenProgram : public Program
+  {
+  public:
+    /*! Create an empty program */
+    GenProgram(uint32_t deviceID, const void* mod = NULL, const void* ctx = NULL) : deviceID(deviceID),module((void*)mod), llvm_ctx((void*)ctx) {}
+    /*! Current device ID*/
+    uint32_t deviceID;
+    /*! Destroy the program */
+    virtual ~GenProgram(void) {};
+    /*! Clean LLVM resource */
+    virtual void CleanLlvmResource(void);
+    /*! Implements base class */
+    virtual Kernel *compileKernel(const ir::Unit &unit, const std::string &name, bool relaxMath);
+    /*! Allocate an empty kernel. */
+    virtual Kernel *allocateKernel(const std::string &name) {
+      return GBE_NEW(GenKernel, name, deviceID);
+    }
+    void* module;
+    void* llvm_ctx;
+    /*! Use custom allocators */
+    GBE_CLASS(GenProgram);
+  };
+  /*! decompact GEN ASM if it is in compacted format */
+  extern void decompactInstruction(union GenCompactInstruction *p, union GenNativeInstruction *pOut);
+} /* namespace gbe */
+
+#endif /* __GBE_GEN_PROGRAM_HPP__ */
+
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
new file mode 100644
index 0000000..b7fbc93
--- /dev/null
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -0,0 +1,1218 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file gen_reg_allocation.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "ir/profile.hpp"
+#include "ir/function.hpp"
+#include "backend/gen_insn_selection.hpp"
+#include "backend/gen_reg_allocation.hpp"
+#include "backend/gen_register.hpp"
+#include "backend/program.hpp"
+#include "sys/exception.hpp"
+#include "sys/cvar.hpp"
+#include <algorithm>
+#include <climits>
+#include <iostream>
+#include <iomanip>
+
+
+namespace gbe
+{
+  /////////////////////////////////////////////////////////////////////////////
+  // Register allocator internal implementation
+  /////////////////////////////////////////////////////////////////////////////
+
+  /*! Provides the location of a register in a vector */
+  typedef std::pair<SelectionVector*, uint32_t> VectorLocation;
+  /*! Interval as used in linear scan allocator. Basically, stores the first and
+   *  the last instruction where the register is alive
+   */
+  struct GenRegInterval {
+    INLINE GenRegInterval(ir::Register reg) :
+      reg(reg), minID(INT_MAX), maxID(-INT_MAX) {}
+    ir::Register reg;     //!< (virtual) register of the interval
+    int32_t minID, maxID; //!< Starting and ending points
+  };
+
+  typedef struct GenRegIntervalKey {
+    GenRegIntervalKey(uint16_t reg, int32_t maxID) {
+      key = ((uint64_t)maxID << 16) | reg;
+    }
+    const ir::Register getReg() const {
+      return (ir::Register)(key & 0xFFFF);
+    }
+    const int32_t getMaxID() const {
+      return key >> 16;
+    }
+    uint64_t key;
+  } GenRegIntervalKey;
+
+  struct spillCmp {
+    bool operator () (const GenRegIntervalKey &lhs, const GenRegIntervalKey &rhs) const
+    { return lhs.key > rhs.key; }
+  };
+
+  typedef set <GenRegIntervalKey, spillCmp> SpillSet;
+
+  class SpillCandidateSet : public SpillSet
+  {
+  public:
+    std::set<GenRegIntervalKey, spillCmp>::iterator find(GenRegInterval interval) {
+      GenRegIntervalKey key(interval.reg, interval.maxID);
+      return SpillSet::find(key);
+    }
+    void insert(GenRegInterval interval) {
+      GenRegIntervalKey key(interval.reg, interval.maxID);
+      SpillSet::insert(key);
+    }
+    void erase(GenRegInterval interval) {
+      GenRegIntervalKey key(interval.reg, interval.maxID);
+      SpillSet::erase(key);
+    }
+  };
+
+  /*! Implements the register allocation */
+  class GenRegAllocator::Opaque
+  {
+  public:
+    /*! Initialize the register allocator */
+    Opaque(GenContext &ctx);
+    /*! Release all taken resources */
+    ~Opaque(void);
+    /*! Perform the register allocation. Return true if success */
+    bool allocate(Selection &selection);
+    /*! Return the Gen register from the selection register */
+    GenRegister genReg(const GenRegister &reg);
+    /*! Output the register allocation */
+    void outputAllocation(void);
+    INLINE void getRegAttrib(ir::Register reg, uint32_t &regSize, ir::RegisterFamily *regFamily = NULL) const {
+      // Note that byte vector registers use two bytes per byte (and can be
+      // interleaved)
+      static const size_t familyVectorSize[] = {2,2,2,4,8};
+      static const size_t familyScalarSize[] = {2,2,2,4,8};
+      using namespace ir;
+      const bool isScalar = ctx.sel->isScalarReg(reg);
+      const RegisterData regData = ctx.sel->getRegisterData(reg);
+      const RegisterFamily family = regData.family;
+      const uint32_t typeSize = isScalar ? familyScalarSize[family] : familyVectorSize[family];
+      regSize = isScalar ? typeSize : ctx.getSimdWidth() * typeSize;
+      if (regFamily != NULL)
+        *regFamily = family;
+    }
+  private:
+    /*! Expire one GRF interval. Return true if one was successfully expired */
+    bool expireGRF(const GenRegInterval &limit);
+    /*! Expire a flag register. Return true if one was successfully expired */
+    bool expireFlag(const GenRegInterval &limit);
+    /*! Allocate the virtual boolean (== flags) registers */
+    void allocateFlags(Selection &selection);
+    /*! validated flags which contain a valid value in the physical flag register */
+    set<uint16_t> validatedFlags;
+    /*! validated temp flag register: indicates which virtual flag register the physical flag 0,1 currently contains. */
+    uint16_t validTempFlagReg;
+    /*! validate flag for the current flag user instruction */
+    void validateFlag(Selection &selection, SelectionInstruction &insn);
+    /*! Allocate the GRF registers */
+    bool allocateGRFs(Selection &selection);
+    /*! Create gen registers for all preallocated curbe registers. */
+    void allocatePayloadRegs(void);
+    /*! Create a Gen register from a register set in the payload */
+    void allocatePayloadReg(ir::Register, uint32_t offset, uint32_t subOffset = 0);
+    /*! Create the intervals for each register */
+    /*! Allocate the vectors detected in the instruction selection pass */
+    void allocateVector(Selection &selection);
+    /*! Allocate the given interval. Return true if success */
+    bool createGenReg(const GenRegInterval &interval);
+    /*! Indicate if the registers are already allocated in vectors */
+    bool isAllocated(const SelectionVector *vector) const;
+    /*! Reallocate registers if needed to make the registers in the vector
+     *  contiguous in memory
+     */
+    void coalesce(Selection &selection, SelectionVector *vector);
+    /*! The context owns the register allocator */
+    GenContext &ctx;
+    /*! Map virtual registers to offset in the (physical) register file */
+    map<ir::Register, uint32_t> RA;
+    /*! Map offset to virtual registers. */
+    map<uint32_t, ir::Register> offsetReg;
+    /*! Provides the position of each register in a vector */
+    map<ir::Register, VectorLocation> vectorMap;
+    /*! All vectors used in the selection */
+    vector<SelectionVector*> vectors;
+    /*! The set of booleans that will go to GRF (cannot be kept into flags) */
+    set<ir::Register> grfBooleans;
+    /*! The set of booleans that can be held in flags and thus do not need a GRF allocation */
+    set<ir::Register> flagBooleans;
+    /*! All the register intervals */
+    vector<GenRegInterval> intervals;
+    /*! All the boolean register intervals on the corresponding BB*/
+    typedef map<ir::Register, GenRegInterval> RegIntervalMap;
+    set<SelectionBlock *> flag0ReservedBlocks;
+    map<SelectionBlock *, RegIntervalMap *> boolIntervalsMap;
+    /*! Intervals sorting based on starting point positions */
+    vector<GenRegInterval*> starting;
+    /*! Intervals sorting based on ending point positions */
+    vector<GenRegInterval*> ending;
+    /*! registers that are spilled */
+    SpilledRegs spilledRegs;
+    /*! register which could be spilled.*/
+    SpillCandidateSet spillCandidate;
+    /* reserved registers for register spill/reload */
+    uint32_t reservedReg;
+    /*! Current vector to expire */
+    uint32_t expiringID;
+    INLINE void insertNewReg(ir::Register reg, uint32_t grfOffset, bool isVector = false);
+    INLINE bool expireReg(ir::Register reg);
+    INLINE bool spillAtInterval(GenRegInterval interval, int size, uint32_t alignment);
+    INLINE uint32_t allocateReg(GenRegInterval interval, uint32_t size, uint32_t alignment);
+    INLINE bool spillReg(GenRegInterval interval, bool isAllocated = false);
+    INLINE bool spillReg(ir::Register reg, bool isAllocated = false);
+    INLINE bool vectorCanSpill(SelectionVector *vector);
+    INLINE void allocateScratchForSpilled();
+
+    /*! replace specified source/dst register with temporary register and update interval */
+    INLINE ir::Register replaceReg(Selection &sel, SelectionInstruction *insn,
+                                   uint32_t regID, bool isSrc,
+                                   ir::Type type = ir::TYPE_FLOAT, bool needMov = true) {
+      ir::Register reg;
+      if (isSrc)
+        reg = sel.replaceSrc(insn, regID, type, needMov);
+      else
+        reg = sel.replaceDst(insn, regID, type, needMov);
+      intervals.push_back(reg);
+      intervals[reg].minID = insn->ID;
+      intervals[reg].maxID = insn->ID;
+      return reg;
+    }
+    /*! Use custom allocator */
+    GBE_CLASS(Opaque);
+  };
+
+
+  GenRegAllocator::Opaque::Opaque(GenContext &ctx) : ctx(ctx) {}
+  GenRegAllocator::Opaque::~Opaque(void) {}
+
+  void GenRegAllocator::Opaque::allocatePayloadReg(ir::Register reg,
+                                                   uint32_t offset,
+                                                   uint32_t subOffset)
+  {
+    using namespace ir;
+    assert(offset >= GEN_REG_SIZE);
+    offset += subOffset;
+    RA.insert(std::make_pair(reg, offset));
+    GBE_ASSERT(reg != ocl::blockip || (offset % GEN_REG_SIZE == 0));
+    this->intervals[reg].minID = 0;
+    this->intervals[reg].maxID = 0;
+  }
+
+  INLINE void GenRegAllocator::Opaque::allocatePayloadRegs(void) {
+    using namespace ir;
+    for(auto &it : this->ctx.curbeRegs)
+      allocatePayloadReg(it.first, it.second);
+
+    // Allocate all pushed registers (i.e. structure kernel arguments)
+    const Function &fn = ctx.getFunction();
+    GBE_ASSERT(fn.getProfile() == PROFILE_OCL);
+    const Function::PushMap &pushMap = fn.getPushMap();
+    for (auto rit = pushMap.rbegin(); rit != pushMap.rend(); ++rit) {
+      const uint32_t argID = rit->second.argID;
+      const FunctionArgument arg = fn.getArg(argID);
+
+      const uint32_t subOffset = rit->second.offset;
+      const Register reg = rit->second.getRegister();
+      auto it = this->ctx.curbeRegs.find(arg.reg);
+      assert(it != ctx.curbeRegs.end());
+      allocatePayloadReg(reg, it->second, subOffset);
+      ctx.splitBlock(it->second, subOffset);
+    }
+  }
+
+  bool GenRegAllocator::Opaque::createGenReg(const GenRegInterval &interval) {
+    using namespace ir;
+    const ir::Register reg = interval.reg;
+    if (RA.contains(reg) == true)
+      return true; // already allocated
+    uint32_t regSize;
+    ir::RegisterFamily family;
+    getRegAttrib(reg, regSize, &family);
+    uint32_t grfOffset = allocateReg(interval, regSize, regSize);
+    if (grfOffset == 0) {
+      return false;
+    }
+    insertNewReg(reg, grfOffset);
+    return true;
+  }
+
+  bool GenRegAllocator::Opaque::isAllocated(const SelectionVector *vector) const {
+    const ir::Register first = vector->reg[0].reg();
+    const auto it = vectorMap.find(first);
+
+    // If the first register is not allocated we are done
+    if (it == vectorMap.end())
+      return false;
+
+    // If there are more left registers than in the found vector, there are
+    // still registers to allocate
+    const SelectionVector *other = it->second.first;
+    const uint32_t otherFirst = it->second.second;
+    const uint32_t leftNum = other->regNum - otherFirst;
+    if (leftNum < vector->regNum)
+      return false;
+
+    // Now check that all the registers in the already allocated vector match
+    // the current vector
+    for (uint32_t regID = 1; regID < vector->regNum; ++regID) {
+       const ir::Register from = vector->reg[regID].reg();
+       const ir::Register to = other->reg[regID + otherFirst].reg();
+       if (from != to)
+         return false;
+    }
+    return true;
+  }
+
+  void GenRegAllocator::Opaque::coalesce(Selection &selection, SelectionVector *vector) {
+    for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
+      const ir::Register reg = vector->reg[regID].reg();
+      const auto it = this->vectorMap.find(reg);
+      // case 1: the register is not already in a vector, so it can stay in this
+      // vector. Note that local IDs are *non-scalar* special registers but will
+      // require a MOV anyway since they are pre-allocated in the CURBE.
+      // If an element has a very long interval, we don't want to put it into a
+      // vector as it will add more pressure to the register allocation.
+      if (it == vectorMap.end() &&
+          ctx.sel->isScalarReg(reg) == false &&
+          ctx.isSpecialReg(reg) == false &&
+          (intervals[reg].maxID - intervals[reg].minID) < 2048)
+      {
+        const VectorLocation location = std::make_pair(vector, regID);
+        this->vectorMap.insert(std::make_pair(reg, location));
+      }
+      // case 2: the register is already in another vector, so we need to move
+      // it to a temporary register.
+      // TODO: we can do better than that if we analyze the liveness of the
+      // already allocated registers in the vector. If there is no interference
+      // and the order is maintained, we can reuse the previous vector and avoid
+      // the MOVs
+      else {
+        ir::Register tmp;
+        ir::Type type = getIRType(vector->reg[regID].type);
+        tmp = this->replaceReg(selection, vector->insn, regID, vector->isSrc, type);
+        const VectorLocation location = std::make_pair(vector, regID);
+        this->vectorMap.insert(std::make_pair(tmp, location));
+      }
+    }
+  }
+
+  /*! Sort vectors in decreasing order of register count */
+  inline bool cmp(const SelectionVector *v0, const SelectionVector *v1) {
+    return v0->regNum > v1->regNum;
+  }
+
+  void GenRegAllocator::Opaque::allocateVector(Selection &selection) {
+    const uint32_t vectorNum = selection.getVectorNum();
+    this->vectors.resize(vectorNum);
+
+    // First we find and store all vectors
+    uint32_t vectorID = 0;
+    for (auto &block : *selection.blockList)
+      for (auto &v : block.vectorList)
+        this->vectors[vectorID++] = &v;
+    GBE_ASSERT(vectorID == vectorNum);
+
+    // Heuristic (really simple...): sort them by the number of registers they
+    // contain
+    std::sort(this->vectors.begin(), this->vectors.end(), cmp);
+
+    // Insert MOVs when this is required
+    for (vectorID = 0; vectorID < vectorNum; ++vectorID) {
+      SelectionVector *vector = this->vectors[vectorID];
+      if (this->isAllocated(vector))
+        continue;
+      this->coalesce(selection, vector);
+    }
+  }
+
+  template <bool sortStartingPoint>
+  inline bool cmp(const GenRegInterval *i0, const GenRegInterval *i1) {
+    return sortStartingPoint ? i0->minID < i1->minID : i0->maxID < i1->maxID;
+  }
+
+  bool GenRegAllocator::Opaque::expireGRF(const GenRegInterval &limit) {
+    bool ret = false;
+    while (this->expiringID != ending.size()) {
+      const GenRegInterval *toExpire = this->ending[this->expiringID];
+      const ir::Register reg = toExpire->reg;
+
+      // Dead code produced by the insn selection -> we skip it
+      if (toExpire->minID > toExpire->maxID) {
+        this->expiringID++;
+        continue;
+      }
+
+      // Ignore registers that were already spilled
+      if(spilledRegs.find(reg) != spilledRegs.end()) {
+        this->expiringID++;
+        continue;
+      }
+
+      if (toExpire->maxID >= limit.minID)
+        break;
+
+      if (expireReg(reg))
+        ret = true;
+      this->expiringID++;
+    }
+
+    // Return whether we were able to expire anything
+    return ret;
+  }
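+
+  // Illustrative sketch (guarded out, not from the original sources): the loop
+  // above is the classic linear-scan expiry step. 'Interval' below is a
+  // hypothetical stand-in for GenRegInterval, and 'ending' is assumed to be
+  // sorted by increasing maxID, as in expireGRF.
+#if 0
+  struct Interval { int32_t minID, maxID; };
+  static size_t expireBefore(const vector<Interval> &ending,
+                             size_t expiringID, const Interval &current) {
+    // Every interval that ends strictly before 'current' starts can release its
+    // registers; stop at the first interval that is still live.
+    while (expiringID < ending.size() &&
+           ending[expiringID].maxID < current.minID)
+      ++expiringID;
+    return expiringID;
+  }
+#endif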
+
+
+  #define IS_IMPLICITLY_MOD_FLAG(insn) (insn.state.modFlag == 1 &&      \
+                                         (insn.opcode == SEL_OP_MOV ||  \
+                                          insn.opcode == SEL_OP_AND  || \
+                                          insn.opcode == SEL_OP_OR  ||  \
+                                          insn.opcode == SEL_OP_XOR))
+
+  #define IS_SCALAR_FLAG(insn) selection.isScalarReg(ir::Register(insn.state.flagIndex))
+  #define GET_FLAG_REG(insn) GenRegister::uwxgrf(IS_SCALAR_FLAG(insn) ? 1 : 8,\
+                                                 ir::Register(insn.state.flagIndex));
+  #define IS_TEMP_FLAG(insn) (insn.state.flag == 0 && insn.state.subFlag == 1)
+  // The flag here is a virtual flag; this function materializes ("validates")
+  // the virtual flag into a physical flag register. It is used for both
+  // temporary and non-temporary flag registers.
+  // We track the last validated temporary register; if it is the same as the
+  // current one, we can avoid the revalidation.
+  void GenRegAllocator::Opaque::validateFlag(Selection &selection,
+                                             SelectionInstruction &insn) {
+    GBE_ASSERT(insn.state.physicalFlag == 1);
+    if (!IS_TEMP_FLAG(insn) && validatedFlags.find(insn.state.flagIndex) != validatedFlags.end())
+      return;
+    else if (IS_TEMP_FLAG(insn) && validTempFlagReg == insn.state.flagIndex)
+      return;
+    SelectionInstruction *cmp0 = selection.create(SEL_OP_CMP, 1, 2);
+    cmp0->state = GenInstructionState(ctx.getSimdWidth());
+    cmp0->state.flag = insn.state.flag;
+    cmp0->state.subFlag = insn.state.subFlag;
+    if (IS_SCALAR_FLAG(insn))
+      cmp0->state.noMask = 1;
+    cmp0->src(0) = GET_FLAG_REG(insn);
+    cmp0->src(1) = GenRegister::immuw(0);
+    cmp0->dst(0) = GenRegister::retype(GenRegister::null(), GEN_TYPE_UW);
+    cmp0->extra.function = GEN_CONDITIONAL_NEQ;
+    insn.prepend(*cmp0);
+    if (!IS_TEMP_FLAG(insn))
+      validatedFlags.insert(insn.state.flagIndex);
+    else {
+      if (insn.state.modFlag == 0)
+        validTempFlagReg = insn.state.flagIndex;
+      else
+        validTempFlagReg = 0;
+    }
+  }
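+
+  // Illustrative note (not from the original sources): "validating" a virtual
+  // flag means emitting a CMP of the boolean GRF against zero so that the
+  // physical flag bits are regenerated, roughly the following pattern
+  // (pseudo-assembly, register names are illustrative):
+  //   cmp.ne.f<flag>.<subFlag>  null:uw, bool_grf:uw, 0:uw
+  // The caching above skips this CMP when the flag was already validated in the
+  // current block (non-temporary flags) or when the temporary flag still holds
+  // the same flagIndex.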
+
+  
+  void GenRegAllocator::Opaque::allocateFlags(Selection &selection) {
+    // Previously, we had a global flag allocation implementation.
+    // After some analysis, I found that global flag allocation is not
+    // the best solution here.
+    // For cross-block references of a bool value, we have to combine it with
+    // the current emask anyway, so there is no obvious advantage in allocating
+    // a dedicated physical flag register for such cross-block usage.
+    // We just need to allocate physical flags within each BB. We need to handle
+    // the following cases:
+    //
+    // 1. The bool's liveness never extends beyond this BB, and the bool is only
+    //    used as a dst register or a pred register. This bool value can be
+    //    allocated in a physical flag only if there are enough physical flags.
+    //    We already identified those bools at the instruction selection stage
+    //    and put them in the flagBooleans set.
+    // 2. The bool is defined in another BB and used in this BB; then we need
+    //    to prepend an instruction at the position where we use it.
+    // 3. The bool is defined in this BB but is also used as some instruction's
+    //    source register rather than the pred register. We have to keep the normal
+    //    GRF (UW8/UW16) register for this bool. For some CMP instructions, we need
+    //    to append a SEL instruction to convert the flag to the GRF register.
+    // 4. Even for a spilled flag, if there is only one spilled flag, we will also
+    //    try to reuse the temporary flag register later. This requires that all
+    //    instructions get their flag at the instruction selection stage and do not
+    //    use the physical flag number directly at the gen_context stage; otherwise
+    //    the algorithm here may break.
+    // We track all the validated bool values to avoid any redundant validation
+    // of the same flag. If there are not enough physical flags, we have to spill
+    // a previously allocated physical flag; the spilling policy is to spill the
+    // allocated flag whose interval ends last.
+
+    // We have three flags available for booleans: f0.0, f1.0 and f1.1
+    for (auto &block : *selection.blockList) {
+      // Store the registers allocated in the map
+      map<ir::Register, uint32_t> allocatedFlags;
+      map<const GenRegInterval*, uint32_t> allocatedFlagIntervals;
+
+      const uint32_t flagNum = flag0ReservedBlocks.contains(&block) ?  2 : 3;
+      uint32_t freeFlags[] = {2, 3, 0};
+      uint32_t freeNum = flagNum;
+      if (boolIntervalsMap.find(&block) == boolIntervalsMap.end())
+        continue;
+      const auto boolsMap = boolIntervalsMap[&block];
+      vector<const GenRegInterval*> flagStarting;
+      vector<const GenRegInterval*> flagEnding;
+      GBE_ASSERT(boolsMap->size() > 0);
+      uint32_t regNum = boolsMap->size();
+      flagStarting.resize(regNum);
+      flagEnding.resize(regNum);
+      uint32_t id = 0;
+      for (auto &interval : *boolsMap) {
+        flagStarting[id] = flagEnding[id] = &interval.second;
+        id++;
+      }
+      std::sort(flagStarting.begin(), flagStarting.end(), cmp<true>);
+      std::sort(flagEnding.begin(), flagEnding.end(), cmp<false>);
+
+      uint32_t endID = 0; // interval to expire
+      for (uint32_t startID = 0; startID < regNum; ++startID) {
+        const GenRegInterval *interval = flagStarting[startID];
+        const ir::Register reg = interval->reg;
+        GBE_ASSERT(ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL);
+        if (freeNum != 0) {
+          allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum]));
+          allocatedFlagIntervals.insert(std::make_pair(interval, freeFlags[freeNum]));
+        } else {
+        // Try to expire one register
+        while (endID != flagEnding.size()) {
+          const GenRegInterval *toExpire = flagEnding[endID];
+          // Dead code produced by the insn selection -> we skip it
+          if (toExpire->minID > toExpire->maxID) {
+            endID++;
+            continue;
+          }
+          // We cannot expire this interval and the next ones
+          if (toExpire->maxID >= interval->minID)
+            break;
+          // We reuse a flag from a previous interval (the oldest one)
+          auto it = allocatedFlags.find(toExpire->reg);
+          if (it == allocatedFlags.end()) {
+            endID++;
+            continue;
+          }
+          freeFlags[freeNum++] = it->second;
+          endID++;
+          break;
+        }
+        if (freeNum != 0) {
+          allocatedFlags.insert(std::make_pair(reg, freeFlags[--freeNum]));
+          allocatedFlagIntervals.insert(std::make_pair(interval, freeFlags[freeNum]));
+        }
+        else {
+          // FIXME we may sort the allocated flags before doing the spilling in the future.
+          int32_t spill = -1;
+          const GenRegInterval *spillInterval = NULL;
+          int32_t maxID = 0;
+          for (auto &it : allocatedFlagIntervals) {
+            if (it.first->maxID <= interval->minID)
+              continue;
+            if (it.first->maxID > maxID && it.second != 0) {
+              maxID = it.first->maxID;
+              spill = it.second;
+              spillInterval = it.first;
+            }
+          }
+          if (spill != -1) {
+            allocatedFlags.insert(std::make_pair(reg, spill));
+            allocatedFlagIntervals.insert(std::make_pair(interval, spill));
+            allocatedFlags.erase(spillInterval->reg);
+            allocatedFlagIntervals.erase(spillInterval);
+            // We spill this flag boolean register, so erase it from the flag boolean set.
+            if (flagBooleans.contains(spillInterval->reg))
+              flagBooleans.erase(spillInterval->reg);
+          } else {
+            GBE_ASSERT(0);
+          }
+        }
+        }
+      }
+      delete boolsMap;
+
+      // Now, we traverse all the selection instructions and we patch them to make
+      // them use flag registers
+      validTempFlagReg = 0;
+      validatedFlags.clear();
+      for (auto &insn : block.insnList) {
+        // Patch the predicate now. Note that only compares actually modify it (it
+        // is called a "conditional modifier"). The other instructions just read
+        // it
+        if (insn.state.physicalFlag == 0) {
+          auto it = allocatedFlags.find(ir::Register(insn.state.flagIndex));
+          if (it != allocatedFlags.end()) {
+            insn.state.physicalFlag = 1;
+            insn.state.flag = it->second / 2;
+            insn.state.subFlag = it->second & 1;
+
+            // modFlag is for the LOADI/MOV/AND/OR/XOR instructions which will modify a
+            // flag register. We set the condition for them to save one instruction if possible.
+            if (IS_IMPLICITLY_MOD_FLAG(insn)) {
+              // If this is a modFlag on a scalar bool, we need to remove it
+              // from the allocated flags map. Then later, the user can
+              // validate the flag from the scalar value correctly.
+              if (IS_SCALAR_FLAG(insn)) {
+                allocatedFlags.erase(ir::Register(insn.state.flagIndex));
+                continue;
+              }
+              insn.extra.function = GEN_CONDITIONAL_NEQ;
+            }
+            // If this is an external bool, we need to validate it if it is not validated yet.
+            if ((insn.state.externFlag &&
+                 insn.state.predicate != GEN_PREDICATE_NONE))
+              validateFlag(selection, insn);
+          } else {
+            insn.state.physicalFlag = 1;
+            insn.state.flag = 0;
+            insn.state.subFlag = 1;
+
+            // If this is for MOV/AND/OR/... we don't need to waste an extra instruction
+            // to generate the flag here; just continue to the next instruction. The
+            // validTempFlagReg will not be destroyed.
+            if (IS_IMPLICITLY_MOD_FLAG(insn))
+              continue;
+            // This bool doesn't have a dedicated flag, so we use the temporary flag here.
+            // Each time, we need to validate it from the GRF register.
+            if (insn.state.predicate != GEN_PREDICATE_NONE)
+              validateFlag(selection, insn);
+          }
+          // This is a CMP for a pure flag boolean, so we don't need to write the result
+          // to the GRF. Later, we will not allocate a GRF for it.
+          if (insn.opcode == SEL_OP_CMP &&
+              (flagBooleans.contains(insn.dst(0).reg()) ||
+               GenRegister::isNull(insn.dst(0)))) {
+            // set a temporary register to avoid switch in this block.
+            bool isSrc = false;
+            bool needMov = false;
+            this->replaceReg(selection, &insn, 0, isSrc, ir::TYPE_FLOAT, needMov);
+          }
+          // If the instruction requires generating the flag value into a register
+          // (CMP for long/int/float...), and it's not a pure flag boolean, we need
+          // to use a SEL instruction to generate the flag value into the UW8
+          // register.
+          if (insn.state.flagGen == 1 &&
+              !flagBooleans.contains((ir::Register)(insn.state.flagIndex))) {
+            SelectionInstruction *sel0 = selection.create(SEL_OP_SEL, 1, 2);
+            uint32_t simdWidth;
+            simdWidth = IS_SCALAR_FLAG(insn) ? 1 : ctx.getSimdWidth();
+
+            sel0->state = GenInstructionState(simdWidth);
+            if (IS_SCALAR_FLAG(insn))
+              sel0->state.noMask = 1;
+            sel0->state.flag = insn.state.flag;
+            sel0->state.subFlag = insn.state.subFlag;
+            sel0->state.predicate = GEN_PREDICATE_NORMAL;
+            sel0->src(0) = GenRegister::uw1grf(ir::ocl::one);
+            sel0->src(1) = GenRegister::uw1grf(ir::ocl::zero);
+            sel0->dst(0) = GET_FLAG_REG(insn);
+            insn.append(*sel0);
+            // We use the zero and one registers after the liveness analysis, so we
+            // have to update the liveness data manually here.
+            GenRegInterval &interval0 = intervals[ir::ocl::zero];
+            GenRegInterval &interval1 = intervals[ir::ocl::one];
+            interval0.minID = std::min(interval0.minID, (int32_t)insn.ID);
+            interval0.maxID = std::max(interval0.maxID, (int32_t)insn.ID);
+            interval1.minID = std::min(interval1.minID, (int32_t)insn.ID);
+            interval1.maxID = std::max(interval1.maxID, (int32_t)insn.ID);
+          }
+        } else {
+          // If the instruction uses the temporary flag register manually,
+          // we should invalidate the temp flag reg here.
+          if (insn.state.flag == 0 && insn.state.subFlag == 1)
+            validTempFlagReg = 0;
+        }
+      }
+    }
+  }
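+
+  // Illustrative sketch (guarded out, not from the original sources): an
+  // allocated flag value v above is encoded as physical register f(v/2).(v&1).
+  // With freeFlags = {2, 3, 0} the booleans use f1.0, f1.1 and f0.0 (f0.0 only
+  // when flag0 is not reserved for the block), while f0.1 stays free as the
+  // temporary flag (insn.state.flag == 0 && insn.state.subFlag == 1).
+#if 0
+  static void decodeFlag(uint32_t v, uint32_t &flag, uint32_t &subFlag) {
+    flag = v / 2;    // 0,1 -> f0 ; 2,3 -> f1
+    subFlag = v & 1; // even -> .0 ; odd -> .1
+  }
+#endif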
+
+  IVAR(OCL_SIMD16_SPILL_THRESHOLD, 0, 16, 256);
+  bool GenRegAllocator::Opaque::allocateGRFs(Selection &selection) {
+    // Perform the linear scan allocator
+    ctx.errCode = REGISTER_ALLOCATION_FAIL;
+    const uint32_t regNum = ctx.sel->getRegNum();
+    for (uint32_t startID = 0; startID < regNum; ++startID) {
+      const GenRegInterval &interval = *this->starting[startID];
+      const ir::Register reg = interval.reg;
+      if (interval.maxID == -INT_MAX)
+        continue; // Unused register
+      if (RA.contains(reg))
+        continue; // already allocated
+
+      if (flagBooleans.contains(reg))
+        continue;
+
+      // Case 1: the register belongs to a vector, allocate all the registers in
+      // one piece
+      auto it = vectorMap.find(reg);
+      if (it != vectorMap.end()) {
+        const SelectionVector *vector = it->second.first;
+        // All the regs in the SelectionVector are spilled
+        if(spilledRegs.find(vector->reg[0].reg())
+           != spilledRegs.end())
+          continue;
+
+        uint32_t alignment;
+        uint32_t size = 0;
+        for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
+          getRegAttrib(vector->reg[regID].reg(), alignment, NULL);
+          size += alignment;
+        }
+        // FIXME this is a workaround for a scheduling limitation, which requires 2*GEN_REG_SIZE alignment under SIMD16.
+        const uint32_t maxAlignment = ctx.getSimdWidth()/8*GEN_REG_SIZE;
+        const uint32_t grfOffset = allocateReg(interval, size, maxAlignment);
+        if(grfOffset == 0) {
+          for(int i = vector->regNum-1; i >= 0; i--) {
+            if (!spillReg(vector->reg[i].reg()))
+              return false;
+          }
+          continue;
+        }
+        uint32_t subOffset = 0;
+        for (uint32_t regID = 0; regID < vector->regNum; ++regID) {
+          const ir::Register reg = vector->reg[regID].reg();
+          GBE_ASSERT(RA.contains(reg) == false);
+          getRegAttrib(reg, alignment, NULL);
+          // Check that all sub-registers are aligned correctly
+          GBE_ASSERT((grfOffset + subOffset) % alignment == 0 || (grfOffset + subOffset) % GEN_REG_SIZE == 0);
+          insertNewReg(reg, grfOffset + subOffset, true);
+          ctx.splitBlock(grfOffset, subOffset);  //splitBlock will not split if regID == 0
+          subOffset += alignment;
+        }
+      }
+      // Case 2: This is a regular scalar register, allocate it alone
+      else if (this->createGenReg(interval) == false) {
+        if (!spillReg(interval))
+          return false;
+      }
+    }
+    if (!spilledRegs.empty()) {
+      GBE_ASSERT(reservedReg != 0);
+      if (ctx.getSimdWidth() == 16) {
+        if (spilledRegs.size() > (unsigned int)OCL_SIMD16_SPILL_THRESHOLD) {
+          ctx.errCode = REGISTER_SPILL_EXCEED_THRESHOLD;
+          return false;
+        }
+      }
+      allocateScratchForSpilled();
+      bool success = selection.spillRegs(spilledRegs, reservedReg);
+      if (!success) {
+        ctx.errCode = REGISTER_SPILL_FAIL;
+        return false;
+      }
+    }
+    ctx.errCode = NO_ERROR;
+    return true;
+  }
+
+  INLINE void GenRegAllocator::Opaque::allocateScratchForSpilled()
+  {
+    const uint32_t regNum = spilledRegs.size();
+    this->starting.resize(regNum);
+    this->ending.resize(regNum);
+    uint32_t regID = 0;
+    for(auto it = spilledRegs.begin(); it != spilledRegs.end(); ++it) {
+      this->starting[regID] = this->ending[regID] = &intervals[it->first];
+      regID++;
+    }
+    std::sort(this->starting.begin(), this->starting.end(), cmp<true>);
+    std::sort(this->ending.begin(), this->ending.end(), cmp<false>);
+    int toExpire = 0;
+    for(uint32_t i = 0; i < regNum; i++) {
+      const GenRegInterval * cur = starting[i];
+      const GenRegInterval * exp = ending[toExpire];
+      if (exp->maxID < cur->minID) {
+        auto it = spilledRegs.find(exp->reg);
+        GBE_ASSERT(it != spilledRegs.end());
+        if(it->second.addr != -1) {
+          ctx.deallocateScratchMem(it->second.addr);
+        }
+        toExpire++;
+      }
+      auto it = spilledRegs.find(cur->reg);
+      GBE_ASSERT(it != spilledRegs.end());
+      if(cur->minID == cur->maxID) {
+        it->second.addr = -1;
+        continue;
+      }
+
+      ir::RegisterFamily family = ctx.sel->getRegisterFamily(cur->reg);
+      it->second.addr = ctx.allocateScratchMem(getFamilySize(family)
+                                             * ctx.getSimdWidth());
+      }
+  }
+
+  INLINE bool GenRegAllocator::Opaque::expireReg(ir::Register reg)
+  {
+    auto it = RA.find(reg);
+    if (flagBooleans.contains(reg))
+      return false;
+    GBE_ASSERT(it != RA.end());
+    // offset less than 32 means it is not managed by our reg allocator.
+    if (it->second < 32)
+      return false;
+
+    ctx.deallocate(it->second);
+    if (reservedReg != 0
+        && (spillCandidate.find(intervals[reg]) != spillCandidate.end())) {
+        spillCandidate.erase(intervals[reg]);
+        /* offset --> reg map should keep updated. */
+        offsetReg.erase(it->second);
+    }
+
+    return true;
+  }
+
+  // Insert a new register with its allocated offset; put it into the RA map and
+  // into the spill candidate set if it could be spilled.
+  INLINE void GenRegAllocator::Opaque::insertNewReg(ir::Register reg, uint32_t grfOffset, bool isVector)
+  {
+     RA.insert(std::make_pair(reg, grfOffset));
+
+     if (reservedReg != 0) {
+
+       uint32_t regSize;
+       ir::RegisterFamily family;
+       getRegAttrib(reg, regSize, &family);
+       // In SIMD16 mode, we may introduce some SIMD8 registers in the instruction selection
+       // stage. Spilling those SIMD8 temporary registers would introduce unnecessary
+       // complexity, so we simply avoid spilling those temporary registers here.
+       if (ctx.getSimdWidth() == 16 && reg.value() >= ctx.getFunction().getRegisterFile().regNum())
+         return;
+
+       if ((regSize == ctx.getSimdWidth()/8 * GEN_REG_SIZE && family == ir::FAMILY_DWORD)
+          || (regSize == 2 * ctx.getSimdWidth()/8 * GEN_REG_SIZE && family == ir::FAMILY_QWORD)) {
+         GBE_ASSERT(offsetReg.find(grfOffset) == offsetReg.end());
+         offsetReg.insert(std::make_pair(grfOffset, reg));
+         spillCandidate.insert(intervals[reg]);
+       }
+     }
+  }
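+
+  // Illustrative sketch (guarded out, not from the original sources): the
+  // spill-candidate test above only tracks registers of a "full" size. Assuming
+  // GEN_REG_SIZE is the 32-byte GRF size, at SIMD16 a DWORD register spans
+  // 2 GRFs and a QWORD register 4 GRFs; only those sizes become candidates.
+#if 0
+  static bool isFullSizeCandidate(uint32_t regSize, uint32_t simdWidth, bool isQword) {
+    const uint32_t full = simdWidth / 8 * GEN_REG_SIZE; // one GRF per 8 DWORD lanes
+    return regSize == (isQword ? 2 * full : full);
+  }
+#endif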
+
+  INLINE bool GenRegAllocator::Opaque::spillReg(ir::Register reg,
+                                                bool isAllocated) {
+    return spillReg(intervals[reg], isAllocated);
+  }
+
+  INLINE bool GenRegAllocator::Opaque::spillReg(GenRegInterval interval,
+                                                bool isAllocated) {
+    if (reservedReg == 0)
+      return false;
+
+    if (interval.reg.value() >= ctx.getFunction().getRegisterFile().regNum() &&
+        ctx.getSimdWidth() == 16)
+      return false;
+
+    ir::RegisterFamily family = ctx.sel->getRegisterFamily(interval.reg);
+    // we currently only support DWORD/QWORD spill
+    if(family != ir::FAMILY_DWORD && family != ir::FAMILY_QWORD)
+      return false;
+
+    SpillRegTag spillTag;
+    spillTag.isTmpReg = interval.maxID == interval.minID;
+    spillTag.addr = -1;
+
+    if (isAllocated) {
+      // If this register is allocated, we need to expire it and erase it
+      // from the RA map.
+      bool success = expireReg(interval.reg);
+      GBE_ASSERT(success);
+      success = success; // keep 'success' referenced when the assert is compiled out
+      RA.erase(interval.reg);
+    }
+    spilledRegs.insert(std::make_pair(interval.reg, spillTag));
+    return true;
+  }
+
+  // Check whether an allocated vector can be spilled out.
+  // If part of a vector has already expired, the vector is currently unspillable.
+  // FIXME we may need to fix those unspillable vectors in the future.
+  INLINE bool GenRegAllocator::Opaque::vectorCanSpill(SelectionVector *vector) {
+    for(uint32_t id = 0; id < vector->regNum; id++)
+      if (spillCandidate.find(intervals[(ir::Register)(vector->reg[id].value.reg)])
+          == spillCandidate.end())
+        return false;
+    return true;
+  }
+
+  INLINE bool GenRegAllocator::Opaque::spillAtInterval(GenRegInterval interval,
+                                                       int size,
+                                                       uint32_t alignment) {
+    if (reservedReg == 0)
+      return false;
+    auto it = spillCandidate.begin();
+    // If there is no spill candidate, or the current register is spillable and its
+    // endpoint is after every spill candidate's endpoint, we return false and the
+    // caller will spill the current register.
+    // In SIMD16 mode, we always try to spill here rather than return to the caller,
+    // because the caller may have a vector to allocate and some elements may be
+    // temporary registers which cannot be spilled.
+    if (it == spillCandidate.end()
+        || (ctx.getSimdWidth() == 8 && (it->getMaxID() <= interval.maxID
+            && alignment == ctx.getSimdWidth()/8 * GEN_REG_SIZE)))
+      return false;
+
+    ir::Register reg = it->getReg();
+    set<ir::Register> spillSet;
+    int32_t savedSize = size;
+    while(size > 0) {
+      auto vectorIt = vectorMap.find(reg);
+      bool isVector = vectorIt != vectorMap.end();
+      bool needRestart = false;
+      ir::RegisterFamily family = ctx.sel->getRegisterFamily(reg);
+      if (isVector
+          && (vectorCanSpill(vectorIt->second.first))) {
+        const SelectionVector *vector = vectorIt->second.first;
+        for (uint32_t id = 0; id < vector->regNum; id++) {
+          GBE_ASSERT(spilledRegs.find(vector->reg[id].reg())
+                     == spilledRegs.end());
+          spillSet.insert(vector->reg[id].reg());
+          reg = vector->reg[id].reg();
+          family = ctx.sel->getRegisterFamily(reg);
+          size -= family == ir::FAMILY_QWORD ? 2 * GEN_REG_SIZE * ctx.getSimdWidth()/8
+                                             : GEN_REG_SIZE * ctx.getSimdWidth()/8;
+        }
+      } else if (!isVector) {
+        spillSet.insert(reg);
+        size -= family == ir::FAMILY_QWORD ? 2 * GEN_REG_SIZE * ctx.getSimdWidth()/8
+                                           : GEN_REG_SIZE * ctx.getSimdWidth()/8;
+      } else
+        needRestart = true; // is a vector which could not be spilled.
+
+      if (size <= 0)
+        break;
+      if (!needRestart) {
+        uint32_t offset = RA.find(reg)->second;
+        uint32_t nextOffset = (family == ir::FAMILY_QWORD) ? (offset + 2 * GEN_REG_SIZE * ctx.getSimdWidth() / 8)
+                                                           : (offset + GEN_REG_SIZE * ctx.getSimdWidth() / 8);
+        auto nextRegIt = offsetReg.find(nextOffset);
+        if (nextRegIt != offsetReg.end())
+          reg = nextRegIt->second;
+        else
+          needRestart = true;
+      }
+
+      if (needRestart) {
+#if 0
+        // FIXME, we should enable this code block in the future.
+        // If the spill set is not empty and we need a restart, we can
+        // simply return and try to allocate the registers first, as some
+        // vectors which have expired elements may be marked as
+        // unspillable vectors.
+        if (spillSet.size() > 0)
+          break;
+#endif
+        it++;
+        // The next register is not a spill candidate;
+        // let's move to the next candidate and start over.
+        if (it == spillCandidate.end())
+          return false;
+        reg = it->getReg();
+        size = savedSize;
+        spillSet.clear();
+      }
+    }
+
+    for(auto spillreg : spillSet)
+      spillReg(spillreg, true);
+    return true;
+  }
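+
+  // Illustrative sketch (guarded out, not from the original sources):
+  // spillAtInterval gathers candidates whose allocations are contiguous in the
+  // GRF file until enough bytes are freed. A hedged sketch of the "walk to the
+  // next offset" step, assuming an offset-to-register map like offsetReg above:
+#if 0
+  static bool nextContiguousReg(const map<uint32_t, ir::Register> &offsetReg,
+                                uint32_t curOffset, uint32_t curSize,
+                                ir::Register &next) {
+    auto it = offsetReg.find(curOffset + curSize); // register right after this one
+    if (it == offsetReg.end()) return false;       // hole: restart from another candidate
+    next = it->second;
+    return true;
+  }
+#endif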
+
+  INLINE uint32_t GenRegAllocator::Opaque::allocateReg(GenRegInterval interval,
+                                                       uint32_t size,
+                                                       uint32_t alignment) {
+    uint32_t grfOffset;
+    static uint32_t tick = 0;
+    // Calling expireGRF too frequently makes the post-register-allocation
+    // scheduling very hard, as it causes a very high register conflict rate.
+    // The tradeoff here is to reduce that frequency. If we are spilling, there
+    // is no need to reduce the frequency, as register pressure is then the most
+    // important factor.
+    if (tick % 12 == 0 || ctx.reservedSpillRegs != 0)
+      this->expireGRF(interval);
+    tick++;
+    // A scalar byte register may be used as a destination register whose source
+    // is a scalar DWORD. In that case, the byte register must get a 4-byte
+    // aligned register offset.
+    alignment = (alignment + 3) & ~3;
+    while ((grfOffset = ctx.allocate(size, alignment)) == 0) {
+      const bool success = this->expireGRF(interval);
+      if (success == false) {
+        if (spillAtInterval(interval, size, alignment) == false)
+          return 0;
+      }
+    }
+    return grfOffset;
+  }
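+
+  // Illustrative sketch (guarded out, not from the original sources): the
+  // alignment fix-up above, (alignment + 3) & ~3, rounds the requested alignment
+  // up to the next multiple of 4 bytes, e.g. 1 -> 4, 2 -> 4, 4 -> 4, 32 -> 32,
+  // so a scalar byte destination fed by a scalar DWORD source still gets a
+  // 4-byte aligned offset.
+#if 0
+  static uint32_t roundAlignmentTo4(uint32_t alignment) {
+    return (alignment + 3) & ~3u;
+  }
+#endif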
+
+  INLINE bool GenRegAllocator::Opaque::allocate(Selection &selection) {
+    using namespace ir;
+    if (ctx.reservedSpillRegs != 0) {
+      reservedReg = ctx.allocate(ctx.reservedSpillRegs * GEN_REG_SIZE, GEN_REG_SIZE);
+      reservedReg /= GEN_REG_SIZE;
+    } else {
+      reservedReg = 0;
+    }
+    // schedulePreRegAllocation(ctx, selection);
+
+    // Now start the linear scan allocation
+    for (uint32_t regID = 0; regID < ctx.sel->getRegNum(); ++regID)
+      this->intervals.push_back(ir::Register(regID));
+
+    // Allocate the special registers (only those which are actually used)
+    this->allocatePayloadRegs();
+
+    // Group and barrier IDs are always allocated by the hardware in r0
+    RA.insert(std::make_pair(ocl::groupid0,  1*sizeof(float))); // r0.1
+    RA.insert(std::make_pair(ocl::groupid1,  6*sizeof(float))); // r0.6
+    RA.insert(std::make_pair(ocl::groupid2,  7*sizeof(float))); // r0.7
+    RA.insert(std::make_pair(ocl::barrierid, 2*sizeof(float))); // r0.2
+
+    // block IP used to handle the mask in SW is always allocated
+
+    // Compute the intervals
+    int32_t insnID = 0;
+    for (auto &block : *selection.blockList) {
+      int32_t lastID = insnID;
+      int32_t firstID = insnID;
+      // Update the intervals of each used register. Note that we do not
+      // register allocate R0, so we skip all sub-registers in r0
+      RegIntervalMap *boolsMap = new RegIntervalMap;
+      if (block.isLargeBlock)
+        flag0ReservedBlocks.insert(&block);
+      for (auto &insn : block.insnList) {
+        const uint32_t srcNum = insn.srcNum, dstNum = insn.dstNum;
+        insn.ID  = insnID;
+        for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+          const GenRegister &selReg = insn.src(srcID);
+          const ir::Register reg = selReg.reg();
+          if (selReg.file != GEN_GENERAL_REGISTER_FILE ||
+              reg == ir::ocl::barrierid ||
+              reg == ir::ocl::groupid0  ||
+              reg == ir::ocl::groupid1  ||
+              reg == ir::ocl::groupid2)
+            continue;
+          this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnID);
+          this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID);
+        }
+        for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+          const GenRegister &selReg = insn.dst(dstID);
+          const ir::Register reg = selReg.reg();
+          if (selReg.file != GEN_GENERAL_REGISTER_FILE ||
+              reg == ir::ocl::barrierid ||
+              reg == ir::ocl::groupid0 ||
+              reg == ir::ocl::groupid1 ||
+              reg == ir::ocl::groupid2)
+            continue;
+          this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnID);
+          this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID);
+        }
+
+        // OK, a flag is used as a predicate or a conditional modifier
+        if (insn.state.physicalFlag == 0) {
+          const ir::Register reg = ir::Register(insn.state.flagIndex);
+          this->intervals[reg].minID = std::min(this->intervals[reg].minID, insnID);
+          this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, insnID);
+          // Check whether this is a pure flag boolean candidate.
+          if (insn.state.grfFlag == 0)
+            flagBooleans.insert(reg);
+          GBE_ASSERT(ctx.sel->getRegisterFamily(reg) == ir::FAMILY_BOOL);
+          // update the bool register's per-BB's interval data
+          if (boolsMap->find(reg) == boolsMap->end()) {
+            GenRegInterval boolInterval(reg);
+            boolsMap->insert(std::make_pair(reg, boolInterval));
+          }
+          boolsMap->find(reg)->second.minID = std::min(boolsMap->find(reg)->second.minID, insnID);
+          boolsMap->find(reg)->second.maxID = std::max(boolsMap->find(reg)->second.maxID, insnID);
+          if (&insn == block.insnList.back() &&
+              insn.opcode == SEL_OP_JMPI &&
+              insn.state.predicate != GEN_PREDICATE_NONE) {
+            // If this is the last instruction and it is a predicated JMPI,
+            // we must extend its liveness before any other instruction,
+            // as we need to allocate f0 to it and keep f0 unchanged during
+            // the block. The root cause is that this instruction is outside
+            // the if/endif region, so we have to borrow f0 to get correct
+            // bits for all channels.
+            boolsMap->find(reg)->second.minID = 0;
+            if (flag0ReservedBlocks.contains(&block))
+              flag0ReservedBlocks.erase(&block);
+          }
+        } else {
+          // Make sure that the instruction selection stage didn't use physical flags incorrectly.
+          GBE_ASSERT ((insn.opcode == SEL_OP_LABEL ||
+                       insn.opcode == SEL_OP_IF ||
+                       insn.opcode == SEL_OP_JMPI ||
+                       insn.state.predicate == GEN_PREDICATE_NONE ||
+                       (block.hasBarrier && insn.opcode == SEL_OP_MOV) ||
+                       (insn.state.flag == 0 && insn.state.subFlag == 1)));
+        }
+        lastID = insnID;
+        insnID++;
+      }
+
+      // All registers alive at the beginning of the block must update their intervals.
+      const ir::BasicBlock *bb = block.bb;
+      for (auto reg : ctx.getLiveIn(bb))
+        this->intervals[reg].minID = std::min(this->intervals[reg].minID, firstID);
+
+      // All registers alive at the end of the block must have their intervals
+      // updated as well
+      for (auto reg : ctx.getLiveOut(bb))
+        this->intervals[reg].maxID = std::max(this->intervals[reg].maxID, lastID);
+
+      if (boolsMap->size() > 0)
+        boolIntervalsMap.insert(std::make_pair(&block, boolsMap));
+      else
+        delete boolsMap;
+    }
+
+    this->intervals[ocl::retVal].minID = INT_MAX;
+    this->intervals[ocl::retVal].maxID = -INT_MAX;
+
+    // Allocate all the vectors first since they need to be contiguous
+    this->allocateVector(selection);
+
+    // First we try to put all boolean registers into flags
+    this->allocateFlags(selection);
+
+    // Sort both intervals in starting point and ending point increasing orders
+    const uint32_t regNum = ctx.sel->getRegNum();
+    this->starting.resize(regNum);
+    this->ending.resize(regNum);
+    for (uint32_t regID = 0; regID < regNum; ++regID)
+      this->starting[regID] = this->ending[regID] = &intervals[regID];
+    std::sort(this->starting.begin(), this->starting.end(), cmp<true>);
+    std::sort(this->ending.begin(), this->ending.end(), cmp<false>);
+
+    // Skip over the intervals of the registers that were never used
+    this->expiringID = 0;
+    while (this->expiringID < regNum) {
+      const GenRegInterval *interval = ending[this->expiringID];
+      if (interval->maxID == -INT_MAX)
+        this->expiringID++;
+      else
+        break;
+    }
+
+    // Allocate all the GRFs now (regular register and boolean that are not in
+    // flag registers)
+    return this->allocateGRFs(selection);
+  }
+
+  INLINE void GenRegAllocator::Opaque::outputAllocation(void) {
+    using namespace std;
+    cout << "## register allocation ##" << endl;
+    for(auto &i : RA) {
+        ir::Register vReg = (ir::Register)i.first;
+        ir::RegisterFamily family;
+        uint32_t regSize;
+        getRegAttrib(vReg, regSize, &family);
+        int offst = (int)i.second;// / sizeof(float);
+        int reg = offst / 32;
+        int subreg = (offst % 32) / regSize;
+        cout << "%" << setiosflags(ios::left) << setw(8) << vReg
+             << "g" << setiosflags(ios::left) << setw(3) << reg << "."
+             << setiosflags(ios::left) << setw(3) << subreg << ir::getFamilyName(family)
+             << "  " << setw(-3) << regSize  << "B\t"
+             << "[  " << setw(8) << this->intervals[(uint)vReg].minID
+             << " -> " << setw(8) << this->intervals[(uint)vReg].maxID
+             << "]" << endl;
+    }
+    if (!spilledRegs.empty())
+      cout << "## spilled registers: " << spilledRegs.size() << endl;
+    for(auto it = spilledRegs.begin(); it != spilledRegs.end(); it++) {
+      ir::Register vReg = it->first;
+      ir::RegisterFamily family;
+      uint32_t regSize;
+      getRegAttrib(vReg, regSize, &family);
+      cout << "%" << setiosflags(ios::left) << setw(8) << vReg
+           << "@" << setw(8) << it->second.addr
+           << "  " << ir::getFamilyName(family)
+           <<  "  " << setw(-3) << regSize << "B\t"
+           << "[  " << setw(8) << this->intervals[(uint)vReg].minID
+           << " -> " << setw(8) << this->intervals[(uint)vReg].maxID
+           << "]" << endl;
+    }
+    cout << endl;
+  }
+
+  INLINE GenRegister setGenReg(const GenRegister &src, uint32_t grfOffset) {
+    GenRegister dst;
+    dst = src;
+    dst.physical = 1;
+    dst.nr = grfOffset / GEN_REG_SIZE;
+    dst.subnr = grfOffset % GEN_REG_SIZE;
+    return dst;
+  }
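+
+  // Illustrative sketch (guarded out, not from the original sources): setGenReg
+  // decomposes a byte offset in the register file into a GRF number and a
+  // sub-register byte offset. With GEN_REG_SIZE == 32 (the usual Gen GRF size),
+  // offset 70 maps to g2.6.
+#if 0
+  static void splitGrfOffset(uint32_t grfOffset, uint32_t &nr, uint32_t &subnr) {
+    nr    = grfOffset / GEN_REG_SIZE; // 70 / 32 == 2
+    subnr = grfOffset % GEN_REG_SIZE; // 70 % 32 == 6
+  }
+#endif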
+
+  INLINE GenRegister GenRegAllocator::Opaque::genReg(const GenRegister &reg) {
+    if (reg.file == GEN_GENERAL_REGISTER_FILE) {
+      if(reg.physical == 1) {
+        return reg;
+      }
+      GBE_ASSERT(RA.contains(reg.reg()) != false);
+      const uint32_t grfOffset = RA.find(reg.reg())->second;
+      const uint32_t suboffset = reg.subphysical ? reg.subnr : 0;
+      const GenRegister dst = setGenReg(reg, grfOffset + suboffset);
+      if (reg.quarter != 0)
+        return GenRegister::Qn(dst, reg.quarter);
+      else
+        return dst;
+    }
+    else
+      return reg;
+  }
+
+  /////////////////////////////////////////////////////////////////////////////
+  // Register allocator public implementation
+  /////////////////////////////////////////////////////////////////////////////
+
+  GenRegAllocator::GenRegAllocator(GenContext &ctx) {
+    this->opaque = GBE_NEW(GenRegAllocator::Opaque, ctx);
+  }
+
+  GenRegAllocator::~GenRegAllocator(void) {
+    GBE_DELETE(this->opaque);
+  }
+
+  bool GenRegAllocator::allocate(Selection &selection) {
+    return this->opaque->allocate(selection);
+  }
+
+  GenRegister GenRegAllocator::genReg(const GenRegister &reg) {
+    return this->opaque->genReg(reg);
+  }
+
+  void GenRegAllocator::outputAllocation(void) {
+    this->opaque->outputAllocation();
+  }
+
+  uint32_t GenRegAllocator::getRegSize(ir::Register reg) {
+     uint32_t regSize; 
+     this->opaque->getRegAttrib(reg, regSize); 
+     return regSize;
+  }
+
+} /* namespace gbe */
+
diff --git a/backend/src/backend/gen_reg_allocation.hpp b/backend/src/backend/gen_reg_allocation.hpp
new file mode 100644
index 0000000..e41f503
--- /dev/null
+++ b/backend/src/backend/gen_reg_allocation.hpp
@@ -0,0 +1,73 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file gen_reg_allocation.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_GEN_REG_ALLOCATION_HPP__
+#define __GBE_GEN_REG_ALLOCATION_HPP__
+
+#include "ir/register.hpp"
+#include "backend/gen_register.hpp"
+
+namespace gbe
+{
+  class Selection;      // Pre-register allocation code generation
+  class GenRegister;    // Pre-register allocation Gen register
+  struct GenRegInterval; // Liveness interval for each register
+  class GenContext;     // Gen specific context
+
+  typedef struct SpillRegTag {
+    bool isTmpReg;
+    int32_t addr;
+  } SpillRegTag;
+
+  typedef map<ir::Register, SpillRegTag> SpilledRegs;
+
+  /*! Register allocate (i.e. virtual to physical register mapping) */
+  class GenRegAllocator
+  {
+  public:
+    /*! Initialize the register allocator */
+    GenRegAllocator(GenContext &ctx);
+    /*! Release all taken resources */
+    ~GenRegAllocator(void);
+    /*! Perform the register allocation */
+    bool allocate(Selection &selection);
+    /*! Virtual to physical translation */
+    GenRegister genReg(const GenRegister &reg);
+    /*! Output the register allocation */
+    void outputAllocation(void);
+    /*! Get register actual size in byte. */
+    uint32_t getRegSize(ir::Register reg);
+  private:
+    /*! Actual implementation of the register allocator (use Pimpl) */
+    class Opaque;
+    /*! Created and destroyed in cpp */
+    Opaque *opaque;
+    /*! Use custom allocator */
+    GBE_CLASS(GenRegAllocator);
+  };
+
+} /* namespace gbe */
+
+#endif /* __GBE_GEN_REG_ALLOCATION_HPP__ */
+
diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp
new file mode 100644
index 0000000..da58c06
--- /dev/null
+++ b/backend/src/backend/gen_register.hpp
@@ -0,0 +1,1060 @@
+/*
+ * Copyright 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith at tungstengraphics.com>
+  */
+/**
+ * \file gen_register.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GEN_REGISTER_HPP__
+#define __GEN_REGISTER_HPP__
+
+#include "backend/gen_defs.hpp"
+#include "ir/register.hpp"
+#include "sys/platform.hpp"
+
+namespace gbe
+{
+
+  /*! Type size in bytes for each Gen type */
+  INLINE int typeSize(uint32_t type) {
+    switch(type) {
+      case GEN_TYPE_DF:
+      case GEN_TYPE_UL:
+      case GEN_TYPE_L:
+        return 8;
+      case GEN_TYPE_UD:
+      case GEN_TYPE_D:
+      case GEN_TYPE_F:
+        return 4;
+      case GEN_TYPE_UW:
+      case GEN_TYPE_W:
+        return 2;
+      case GEN_TYPE_UB:
+      case GEN_TYPE_B:
+        return 1;
+      default:
+        assert(0);
+        return 0;
+    }
+  }
+
+  /*! Convert an hstride encoding to a number of elements */
+  INLINE uint32_t stride(uint32_t stride) {
+    switch (stride) {
+      case 0: return 0;
+      case 1: return 1;
+      case 2: return 2;
+      case 3: return 4;
+      case 4: return 8;
+      case 5: return 16;
+      default: assert(0); return 0;
+    }
+  }
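+
+  // Illustrative sketch (guarded out, not from the original sources): typeSize()
+  // gives the bytes per element and stride() decodes the hstride encoding into an
+  // element step, so a region of GEN_TYPE_F elements with hstride encoding 1
+  // advances by typeSize(GEN_TYPE_F) * stride(1) == 4 bytes per channel.
+#if 0
+  static uint32_t byteStep(uint32_t type, uint32_t hstrideEncoding) {
+    return typeSize(type) * stride(hstrideEncoding);
+  }
+#endif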
+
+  /*! Encode the instruction state. Note that the flag register can be either
+   *  physical (i.e. a real Gen flag) or a virtual boolean register. The flag
+   *  register allocation will turn all virtual boolean registers into flag
+   *  registers
+   */
+  class GenInstructionState
+  {
+  public:
+    INLINE GenInstructionState(uint32_t simdWidth = 8) {
+      this->execWidth = simdWidth;
+      this->quarterControl = GEN_COMPRESSION_Q1;
+      this->nibControl = 0;
+      this->accWrEnable = 0;
+      this->noMask = 0;
+      this->flag = 0;
+      this->subFlag = 0;
+      this->grfFlag = 1;
+      this->externFlag = 0;
+      this->modFlag = 0;
+      this->flagGen = 0;
+      this->predicate = GEN_PREDICATE_NONE;
+      this->inversePredicate = 0;
+      this->physicalFlag = 1;
+      this->flagIndex = 0;
+      this->saturate = GEN_MATH_SATURATE_NONE;
+    }
+    uint32_t physicalFlag:1; //!< Physical or virtual flag register
+    uint32_t flag:1;         //!< Only if physical flag,
+    uint32_t subFlag:1;      //!< Only if physical flag
+    uint32_t flagIndex:16;   //!< Only if virtual flag (index of the register)
+    uint32_t grfFlag:1;      //!< Only if virtual flag, 0 means we do not need to allocate GRF.
+    uint32_t externFlag:1;   //!< Only if virtual flag, 1 means this flag is from external BB.
+    uint32_t modFlag:1;      //!< Only if virtual flag, 1 means will modify flag.
+    uint32_t flagGen:1;      //!< Only if virtual flag, 1 means the gen_context stage may need to
+                             //!< generate the flag.
+    uint32_t execWidth:5;
+    uint32_t quarterControl:1;
+    uint32_t nibControl:1;
+    uint32_t accWrEnable:1;
+    uint32_t noMask:1;
+    uint32_t predicate:4;
+    uint32_t inversePredicate:1;
+    uint32_t saturate:1;
+    void chooseNib(int nib) {
+      switch (nib) {
+        case 0:
+          quarterControl = 0;
+          nibControl = 0;
+          break;
+        case 1:
+          quarterControl = 0;
+          nibControl = 1;
+          break;
+        case 2:
+          quarterControl = 1;
+          nibControl = 0;
+          break;
+        case 3:
+          quarterControl = 1;
+          nibControl = 1;
+          break;
+        default:
+          NOT_IMPLEMENTED;
+      }
+    }
+    void useFlag(int nr, int subnr) {
+      flag = nr;
+      subFlag = subnr;
+      physicalFlag = 1;
+    }
+  };
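+
+  // Illustrative usage sketch (guarded out, not from the original sources):
+  // build a SIMD16 state predicated on physical flag f0.1, the temporary flag
+  // used by the register allocator; GEN_PREDICATE_NORMAL comes from gen_defs.hpp.
+#if 0
+  static GenInstructionState predicatedSimd16State(void) {
+    GenInstructionState state(16);          // execWidth = 16
+    state.useFlag(0, 1);                    // flag = 0, subFlag = 1 (f0.1)
+    state.predicate = GEN_PREDICATE_NORMAL; // read the flag as a predicate
+    return state;
+  }
+#endif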
+
+  /*! This is a book-keeping structure used to encode both virtual and physical
+   *  registers
+   */
+  class GenRegister
+  {
+  public:
+    /*! Empty constructor */
+    INLINE GenRegister(void) {}
+
+    /*! General constructor */
+    INLINE GenRegister(uint32_t file,
+                       ir::Register reg,
+                       uint32_t type,
+                       uint32_t vstride,
+                       uint32_t width,
+                       uint32_t hstride)
+    {
+      this->type = type;
+      this->file = file;
+      this->physical = 0;
+      this->subphysical = 0;
+      this->value.reg = reg;
+      this->negation = 0;
+      this->absolute = 0;
+      this->vstride = vstride;
+      this->width = width;
+      this->hstride = hstride;
+      this->quarter = 0;
+      this->nr = this->subnr = 0;
+      this->address_mode = GEN_ADDRESS_DIRECT;
+    }
+
+    /*! For specific physical registers only */
+    INLINE GenRegister(uint32_t file,
+                       uint32_t nr,
+                       uint32_t subnr,
+                       uint32_t type,
+                       uint32_t vstride,
+                       uint32_t width,
+                       uint32_t hstride)
+    {
+      this->type = type;
+      this->file = file;
+      this->nr = nr;
+      this->physical = 1;
+      this->subphysical = 1;
+      this->subnr = subnr * typeSize(type);
+      this->negation = 0;
+      this->absolute = 0;
+      this->vstride = vstride;
+      this->width = width;
+      this->hstride = hstride;
+      this->quarter = 0;
+      this->address_mode = GEN_ADDRESS_DIRECT;
+    }
+
+    /*! Return the IR virtual register */
+    INLINE ir::Register reg(void) const { return ir::Register(value.reg); }
+
+    /*! For immediates or virtual register */
+    union {
+      double df;
+      float f;
+      int32_t d;
+      uint32_t ud;
+      uint16_t reg;
+      int64_t i64;
+    } value;
+
+    uint32_t nr:8;         //!< Just for some physical registers (acc, null)
+    uint32_t subnr:8;      //!< Idem
+    uint32_t physical:1;   //!< 1 if physical, 0 otherwise
+    uint32_t subphysical:1;//!< 1 if subnr is physical, 0 otherwise
+    uint32_t type:4;       //!< Gen type
+    uint32_t file:2;       //!< Register file
+    uint32_t negation:1;   //!< For source
+    uint32_t absolute:1;   //!< For source
+    uint32_t vstride:4;    //!< Vertical stride
+    uint32_t width:3;        //!< Width
+    uint32_t hstride:2;      //!< Horizontal stride
+    uint32_t quarter:1;      //!< To choose which part we want (Q1 / Q2)
+    uint32_t address_mode:1; //!< direct or indirect
+
+    static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) {
+      GenRegister r = reg;
+      r.nr += nr;
+      r.subnr += subnr;
+      return r;
+    }
+
+    // split a DWORD register into unpacked Byte or Short register
+    static INLINE GenRegister splitReg(GenRegister reg, uint32_t count, uint32_t sub_part) {
+      GenRegister r = reg;
+      GBE_ASSERT(count == 4 || count == 2);
+      GBE_ASSERT(reg.type == GEN_TYPE_UD || reg.type == GEN_TYPE_D);
+
+      if(reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+        GBE_ASSERT(reg.hstride == GEN_HORIZONTAL_STRIDE_1);
+        r.hstride = count == 4 ? GEN_HORIZONTAL_STRIDE_4 : GEN_HORIZONTAL_STRIDE_2;
+      }
+      if(count == 4) {
+        r.type = reg.type == GEN_TYPE_UD ? GEN_TYPE_UB : GEN_TYPE_B;
+        r.vstride = GEN_VERTICAL_STRIDE_32;
+      } else {
+        r.type = reg.type == GEN_TYPE_UD ? GEN_TYPE_UW : GEN_TYPE_W;
+        r.vstride = GEN_VERTICAL_STRIDE_16;
+      }
+
+      r.subnr += sub_part*typeSize(r.type);
+      r.nr += r.subnr / 32;
+      r.subnr %= 32;
+
+      return r;
+    }
+
+    INLINE bool isint64(void) const {
+      if ((type == GEN_TYPE_UL || type == GEN_TYPE_L) && file == GEN_GENERAL_REGISTER_FILE)
+        return true;
+      return false;
+    }
+
+    INLINE bool isimmdf(void) const {
+      if (type == GEN_TYPE_DF && file == GEN_IMMEDIATE_VALUE)
+        return true;
+      return false;
+    }
+
+    INLINE GenRegister top_half(int simdWidth) const {
+      GBE_ASSERT(isint64());
+      GenRegister reg = retype(*this, type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D);
+
+      if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+        reg.subnr += simdWidth * typeSize(reg.type) * hstride_size(reg);
+        reg.nr += reg.subnr / 32;
+        reg.subnr %= 32;
+      } else {
+        reg.subnr += typeSize(reg.type);
+        reg.nr += reg.subnr/32;
+        reg.subnr %= 32;
+      }
+      return reg;
+    }
+
+    INLINE GenRegister bottom_half(void) const {
+      GBE_ASSERT(isint64());
+      GenRegister r = retype(*this, type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D);
+      return r;
+    }
+
+    INLINE bool is_signed_int(void) const {
+      if ((type == GEN_TYPE_B || type == GEN_TYPE_W || type == GEN_TYPE_D || type == GEN_TYPE_L) && file == GEN_GENERAL_REGISTER_FILE)
+        return true;
+      return false;
+    }
+
+    INLINE bool isdf(void) const {
+      if (type == GEN_TYPE_DF && file == GEN_GENERAL_REGISTER_FILE)
+        return true;
+      return false;
+    }
+
+    INLINE int flag_nr(void) const {
+      assert(file == GEN_ARCHITECTURE_REGISTER_FILE);
+      assert(nr >= GEN_ARF_FLAG && nr < GEN_ARF_FLAG + 2);
+      return nr & 15;
+    }
+
+    INLINE int flag_subnr(void) const {
+      return subnr / typeSize(type);
+    }
+
+    static INLINE GenRegister h2(GenRegister reg) {
+      GenRegister r = reg;
+      if(r.hstride != GEN_HORIZONTAL_STRIDE_0)
+        r.hstride = GEN_HORIZONTAL_STRIDE_2;
+      return r;
+    }
+
+    static INLINE GenRegister QnVirtual(GenRegister reg, uint32_t quarter) {
+      GBE_ASSERT(reg.physical == 0);
+      if (reg.hstride == GEN_HORIZONTAL_STRIDE_0) // scalar register
+        return reg;
+      else {
+        reg.quarter = quarter;
+        return reg;
+      }
+    }
+
+    static INLINE GenRegister QnPhysical(GenRegister reg, uint32_t quarter) {
+      GBE_ASSERT(reg.physical);
+      if (reg.hstride == GEN_HORIZONTAL_STRIDE_0) // scalar register
+        return reg;
+      else {
+        const uint32_t typeSz = typeSize(reg.type);
+        const uint32_t horizontal = stride(reg.hstride);
+        const uint32_t grfOffset = reg.nr*GEN_REG_SIZE + reg.subnr;
+        const uint32_t nextOffset = grfOffset + 8*quarter*horizontal*typeSz;
+        reg.nr = nextOffset / GEN_REG_SIZE;
+        reg.subnr = (nextOffset % GEN_REG_SIZE);
+        return reg;
+      }
+    }
+
+    static INLINE GenRegister Qn(GenRegister reg, uint32_t quarter) {
+      if (reg.physical)
+        return QnPhysical(reg, quarter);
+      else
+        return QnVirtual(reg, quarter);
+    }
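+
+    // Illustrative note (not part of the original source): for a physical float
+    // register with hstride 1, Qn(reg, 1) advances the offset by 8 * 1 * 4 = 32
+    // bytes, i.e. one full GRF (assuming GEN_REG_SIZE == 32), so the next
+    // quarter addresses the next 8 lanes.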
+
+    static INLINE GenRegister vec16(uint32_t file, ir::Register reg) {
+      return GenRegister(file,
+                         reg,
+                         GEN_TYPE_F,
+                         GEN_VERTICAL_STRIDE_8,
+                         GEN_WIDTH_8,
+                         GEN_HORIZONTAL_STRIDE_1);
+    }
+
+    static INLINE GenRegister vec8(uint32_t file, ir::Register reg) {
+      return GenRegister(file,
+                         reg,
+                         GEN_TYPE_F,
+                         GEN_VERTICAL_STRIDE_8,
+                         GEN_WIDTH_8,
+                         GEN_HORIZONTAL_STRIDE_1);
+    }
+
+    static INLINE GenRegister vec4(uint32_t file, ir::Register reg) {
+      return GenRegister(file,
+                         reg,
+                         GEN_TYPE_F,
+                         GEN_VERTICAL_STRIDE_4,
+                         GEN_WIDTH_4,
+                         GEN_HORIZONTAL_STRIDE_1);
+    }
+
+    static INLINE GenRegister vec2(uint32_t file, ir::Register reg) {
+      return GenRegister(file,
+                         reg,
+                         GEN_TYPE_F,
+                         GEN_VERTICAL_STRIDE_2,
+                         GEN_WIDTH_2,
+                         GEN_HORIZONTAL_STRIDE_1);
+    }
+
+    static INLINE GenRegister vec1(uint32_t file, ir::Register reg) {
+      return GenRegister(file,
+                         reg,
+                         GEN_TYPE_F,
+                         GEN_VERTICAL_STRIDE_0,
+                         GEN_WIDTH_1,
+                         GEN_HORIZONTAL_STRIDE_0);
+    }
+
+    static INLINE GenRegister retype(GenRegister reg, uint32_t type) {
+      reg.type = type;
+      return reg;
+    }
+
+    static INLINE GenRegister df16(uint32_t file, ir::Register reg) {
+      return retype(vec16(file, reg), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df8(uint32_t file, ir::Register reg) {
+      return retype(vec8(file, reg), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df1(uint32_t file, ir::Register reg) {
+      return retype(vec1(file, reg), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister ud16(uint32_t file, ir::Register reg) {
+      return retype(vec16(file, reg), GEN_TYPE_UD);
+    }
+
+    static INLINE GenRegister ud8(uint32_t file, ir::Register reg) {
+      return retype(vec8(file, reg), GEN_TYPE_UD);
+    }
+
+    static INLINE GenRegister ud1(uint32_t file, ir::Register reg) {
+      return retype(vec1(file, reg), GEN_TYPE_UD);
+    }
+
+    static INLINE GenRegister d8(uint32_t file, ir::Register reg) {
+      return retype(vec8(file, reg), GEN_TYPE_D);
+    }
+
+    static INLINE GenRegister uw16(uint32_t file, ir::Register reg) {
+      return retype(vec16(file, reg), GEN_TYPE_UW);
+    }
+
+    static INLINE GenRegister uw8(uint32_t file, ir::Register reg) {
+      return retype(vec8(file, reg), GEN_TYPE_UW);
+    }
+
+    static INLINE GenRegister uw1(uint32_t file, ir::Register reg) {
+      return retype(vec1(file, reg), GEN_TYPE_UW);
+    }
+
+    static INLINE GenRegister ub16(uint32_t file, ir::Register reg) {
+      return GenRegister(file,
+                         reg,
+                         GEN_TYPE_UB,
+                         GEN_VERTICAL_STRIDE_16,
+                         GEN_WIDTH_8,
+                         GEN_HORIZONTAL_STRIDE_2);
+    }
+
+    static INLINE GenRegister ub8(uint32_t file, ir::Register reg) {
+      return GenRegister(file,
+                         reg,
+                         GEN_TYPE_UB,
+                         GEN_VERTICAL_STRIDE_16,
+                         GEN_WIDTH_8,
+                         GEN_HORIZONTAL_STRIDE_2);
+    }
+
+    static INLINE GenRegister ub1(uint32_t file, ir::Register reg) {
+      return retype(vec1(file, reg), GEN_TYPE_UB);
+    }
+
+    static INLINE GenRegister unpacked_uw(ir::Register reg, bool uniform = false) {
+        return GenRegister(GEN_GENERAL_REGISTER_FILE,
+                           reg,
+                           GEN_TYPE_UW,
+                           uniform ? GEN_VERTICAL_STRIDE_0 : GEN_VERTICAL_STRIDE_16,
+                           uniform ? GEN_WIDTH_1 : GEN_WIDTH_8,
+                           uniform ? GEN_HORIZONTAL_STRIDE_0 : GEN_HORIZONTAL_STRIDE_2);
+    }
+
+    static INLINE GenRegister unpacked_ub(ir::Register reg, bool uniform = false) {
+      return GenRegister(GEN_GENERAL_REGISTER_FILE,
+                         reg,
+                         GEN_TYPE_UB,
+                         uniform ? GEN_VERTICAL_STRIDE_0 : GEN_VERTICAL_STRIDE_32,
+                         uniform ? GEN_WIDTH_1 : GEN_WIDTH_8,
+                         uniform ? GEN_HORIZONTAL_STRIDE_0 : GEN_HORIZONTAL_STRIDE_4);
+    }
+
+    static INLINE GenRegister imm(uint32_t type) {
+      return GenRegister(GEN_IMMEDIATE_VALUE,
+                         0,
+                         0,
+                         type,
+                         GEN_VERTICAL_STRIDE_0,
+                         GEN_WIDTH_1,
+                         GEN_HORIZONTAL_STRIDE_0);
+    }
+
+    static INLINE GenRegister immint64(int64_t i) {
+      GenRegister immediate = imm(GEN_TYPE_L);
+      immediate.value.i64 = i;
+      return immediate;
+    }
+
+    static INLINE GenRegister immdf(double df) {
+      GenRegister immediate = imm(GEN_TYPE_DF);
+      immediate.value.df = df;
+      return immediate;
+    }
+
+    static INLINE GenRegister immf(float f) {
+      GenRegister immediate = imm(GEN_TYPE_F);
+      immediate.value.f = f;
+      return immediate;
+    }
+
+    static INLINE GenRegister immd(int d) {
+      GenRegister immediate = imm(GEN_TYPE_D);
+      immediate.value.d = d;
+      return immediate;
+    }
+
+    static INLINE GenRegister immud(uint32_t ud) {
+      GenRegister immediate = imm(GEN_TYPE_UD);
+      immediate.value.ud = ud;
+      return immediate;
+    }
+
+    static INLINE GenRegister immuw(uint16_t uw) {
+      GenRegister immediate = imm(GEN_TYPE_UW);
+      immediate.value.ud = uw;
+      return immediate;
+    }
+
+    static INLINE GenRegister immw(int16_t w) {
+      GenRegister immediate = imm(GEN_TYPE_W);
+      immediate.value.d = w;
+      return immediate;
+    }
+
+    static INLINE GenRegister immv(uint32_t v) {
+      GenRegister immediate = imm(GEN_TYPE_V);
+      immediate.vstride = GEN_VERTICAL_STRIDE_0;
+      immediate.width = GEN_WIDTH_8;
+      immediate.hstride = GEN_HORIZONTAL_STRIDE_1;
+      immediate.value.ud = v;
+      return immediate;
+    }
+
+    static INLINE GenRegister immvf(uint32_t v) {
+      GenRegister immediate = imm(GEN_TYPE_VF);
+      immediate.vstride = GEN_VERTICAL_STRIDE_0;
+      immediate.width = GEN_WIDTH_4;
+      immediate.hstride = GEN_HORIZONTAL_STRIDE_1;
+      immediate.value.ud = v;
+      return immediate;
+    }
+
+    static INLINE GenRegister immvf4(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3) {
+      GenRegister immediate = imm(GEN_TYPE_VF);
+      immediate.vstride = GEN_VERTICAL_STRIDE_0;
+      immediate.width = GEN_WIDTH_4;
+      immediate.hstride = GEN_HORIZONTAL_STRIDE_1;
+      immediate.value.ud = ((v0 << 0) | (v1 << 8) | (v2 << 16) | (v3 << 24));
+      return immediate;
+    }
+
+    static INLINE GenRegister f1grf(ir::Register reg) {
+      return vec1(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister f2grf(ir::Register reg) {
+      return vec2(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister f4grf(ir::Register reg) {
+      return vec4(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister f8grf(ir::Register reg) {
+      return vec8(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister f16grf(ir::Register reg) {
+      return vec16(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister df1grf(ir::Register reg) {
+      return df1(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister df8grf(ir::Register reg) {
+      return df8(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister df16grf(ir::Register reg) {
+      return df16(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister ud16grf(ir::Register reg) {
+      return ud16(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister ud8grf(ir::Register reg) {
+      return ud8(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister ud1grf(ir::Register reg) {
+      return ud1(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister uw1grf(ir::Register reg) {
+      return uw1(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister uw8grf(ir::Register reg) {
+      return uw8(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister uw16grf(ir::Register reg) {
+      return uw16(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister ub1grf(ir::Register reg) {
+      return ub1(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister ub8grf(ir::Register reg) {
+      return ub8(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister ub16grf(ir::Register reg) {
+      return ub16(GEN_GENERAL_REGISTER_FILE, reg);
+    }
+
+    static INLINE GenRegister null(void) {
+      return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+                         GEN_ARF_NULL,
+                         0,
+                         GEN_TYPE_F,
+                         GEN_VERTICAL_STRIDE_8,
+                         GEN_WIDTH_8,
+                         GEN_HORIZONTAL_STRIDE_1);
+    }
+
+    static INLINE GenRegister nullud(void) {
+      return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+                         GEN_ARF_NULL,
+                         0,
+                         GEN_TYPE_UD,
+                         GEN_VERTICAL_STRIDE_8,
+                         GEN_WIDTH_8,
+                         GEN_HORIZONTAL_STRIDE_1);
+    }
+
+
+    static INLINE bool isNull(GenRegister reg) {
+      return (reg.file == GEN_ARCHITECTURE_REGISTER_FILE
+              && reg.nr == GEN_ARF_NULL);
+    }
+
+    static INLINE GenRegister vec1(GenRegister reg) {
+      reg.width = GEN_WIDTH_1;
+      reg.hstride = GEN_HORIZONTAL_STRIDE_0;
+      reg.vstride = GEN_VERTICAL_STRIDE_0;
+      return reg;
+    }
+
+    static INLINE GenRegister acc(void) {
+      return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+                         GEN_ARF_ACCUMULATOR,
+                         0,
+                         GEN_TYPE_F,
+                         GEN_VERTICAL_STRIDE_8,
+                         GEN_WIDTH_8,
+                         GEN_HORIZONTAL_STRIDE_1);
+    }
+
+    static INLINE GenRegister ip(void) {
+      return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+                         GEN_ARF_IP,
+                         0,
+                         GEN_TYPE_D,
+                         GEN_VERTICAL_STRIDE_4,
+                         GEN_WIDTH_1,
+                         GEN_HORIZONTAL_STRIDE_0);
+    }
+
+    static INLINE GenRegister notification1(void) {
+      return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+                         GEN_ARF_NOTIFICATION_COUNT,
+                         0,
+                         GEN_TYPE_UD,
+                         GEN_VERTICAL_STRIDE_0,
+                         GEN_WIDTH_1,
+                         GEN_HORIZONTAL_STRIDE_0);
+    }
+
+    static INLINE GenRegister flag(uint32_t nr, uint32_t subnr) {
+      return GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+                         GEN_ARF_FLAG | nr,
+                         subnr,
+                         GEN_TYPE_UW,
+                         GEN_VERTICAL_STRIDE_0,
+                         GEN_WIDTH_1,
+                         GEN_HORIZONTAL_STRIDE_0);
+    }
+
+    static INLINE GenRegister next(GenRegister reg) {
+      if (reg.physical)
+        reg.nr++;
+      else
+        reg.quarter++;
+      return reg;
+    }
+
+    /*! Build an indirectly addressed source */
+    static INLINE GenRegister indirect(uint32_t type, uint32_t subnr, uint32_t width) {
+      GenRegister reg;
+      reg.type = type;
+      reg.file = GEN_GENERAL_REGISTER_FILE;
+      reg.address_mode = GEN_ADDRESS_REGISTER_INDIRECT_REGISTER;
+      reg.width = width;
+      reg.subnr = subnr;
+      reg.nr = 0;
+      reg.negation = 0;
+      reg.absolute = 0;
+      reg.vstride = 0;
+      reg.hstride = 0;
+      return reg;
+    }
+
+    static INLINE GenRegister vec16(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return GenRegister(file,
+                         nr,
+                         subnr,
+                         GEN_TYPE_F,
+                         GEN_VERTICAL_STRIDE_8,
+                         GEN_WIDTH_8,
+                         GEN_HORIZONTAL_STRIDE_1);
+    }
+
+    static INLINE GenRegister vec8(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return GenRegister(file,
+                         nr,
+                         subnr,
+                         GEN_TYPE_F,
+                         GEN_VERTICAL_STRIDE_8,
+                         GEN_WIDTH_8,
+                         GEN_HORIZONTAL_STRIDE_1);
+    }
+
+    static INLINE GenRegister vec4(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return GenRegister(file,
+                         nr,
+                         subnr,
+                         GEN_TYPE_F,
+                         GEN_VERTICAL_STRIDE_4,
+                         GEN_WIDTH_4,
+                         GEN_HORIZONTAL_STRIDE_1);
+    }
+
+    static INLINE GenRegister vec2(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return GenRegister(file,
+                    nr,
+                    subnr,
+                    GEN_TYPE_F,
+                    GEN_VERTICAL_STRIDE_2,
+                    GEN_WIDTH_2,
+                    GEN_HORIZONTAL_STRIDE_1);
+    }
+
+    static INLINE GenRegister vec1(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return GenRegister(file,
+                    nr,
+                    subnr,
+                    GEN_TYPE_F,
+                    GEN_VERTICAL_STRIDE_0,
+                    GEN_WIDTH_1,
+                    GEN_HORIZONTAL_STRIDE_0);
+    }
+
+    static INLINE int hstride_size(GenRegister reg) {
+      switch (reg.hstride) {
+        case GEN_HORIZONTAL_STRIDE_0: return 0;
+        case GEN_HORIZONTAL_STRIDE_1: return 1;
+        case GEN_HORIZONTAL_STRIDE_2: return 2;
+        case GEN_HORIZONTAL_STRIDE_4: return 4;
+        default: NOT_IMPLEMENTED; return 0;
+      }
+    }
+
+    static INLINE GenRegister suboffset(GenRegister reg, uint32_t delta) {
+      if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) {
+        reg.subnr += delta * typeSize(reg.type) * hstride_size(reg);
+        reg.nr += reg.subnr / 32;
+        reg.subnr %= 32;
+      }
+      return reg;
+    }
+
+    static INLINE GenRegister df16(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec16(file, nr, subnr), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df8(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec8(file, nr, subnr), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister df1(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec1(file, nr, subnr), GEN_TYPE_DF);
+    }
+
+    static INLINE GenRegister ud16(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec16(file, nr, subnr), GEN_TYPE_UD);
+    }
+
+    static INLINE GenRegister ud8(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec8(file, nr, subnr), GEN_TYPE_UD);
+    }
+
+    static INLINE GenRegister ud1(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec1(file, nr, subnr), GEN_TYPE_UD);
+    }
+
+    static INLINE GenRegister d8(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return retype(vec8(file, nr, subnr), GEN_TYPE_D);
+    }
+
+    static INLINE GenRegister uw16(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return suboffset(retype(vec16(file, nr, 0), GEN_TYPE_UW), subnr);
+    }
+
+    static INLINE GenRegister uw8(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return suboffset(retype(vec8(file, nr, 0), GEN_TYPE_UW), subnr);
+    }
+
+    static INLINE GenRegister uw1(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return suboffset(retype(vec1(file, nr, 0), GEN_TYPE_UW), subnr);
+    }
+
+    static INLINE GenRegister ub16(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return GenRegister(file,
+                         nr,
+                         subnr,
+                         GEN_TYPE_UB,
+                         GEN_VERTICAL_STRIDE_16,
+                         GEN_WIDTH_8,
+                         GEN_HORIZONTAL_STRIDE_2);
+    }
+
+    static INLINE GenRegister ub8(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return GenRegister(file,
+                         nr,
+                         subnr,
+                         GEN_TYPE_UB,
+                         GEN_VERTICAL_STRIDE_16,
+                         GEN_WIDTH_8,
+                         GEN_HORIZONTAL_STRIDE_2);
+    }
+
+    static INLINE GenRegister ub1(uint32_t file, uint32_t nr, uint32_t subnr) {
+      return suboffset(retype(vec1(file, nr, 0), GEN_TYPE_UB), subnr);
+    }
+
+    static INLINE GenRegister f1grf(uint32_t nr, uint32_t subnr) {
+      return vec1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister f2grf(uint32_t nr, uint32_t subnr) {
+      return vec2(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister f4grf(uint32_t nr, uint32_t subnr) {
+      return vec4(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister f8grf(uint32_t nr, uint32_t subnr) {
+      return vec8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister f16grf(uint32_t nr, uint32_t subnr) {
+      return vec16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister df16grf(uint32_t nr, uint32_t subnr) {
+      return df16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister df8grf(uint32_t nr, uint32_t subnr) {
+      return df8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister df1grf(uint32_t nr, uint32_t subnr) {
+      return df1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister ud16grf(uint32_t nr, uint32_t subnr) {
+      return ud16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister ud8grf(uint32_t nr, uint32_t subnr) {
+      return ud8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister ud1grf(uint32_t nr, uint32_t subnr) {
+      return ud1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister ud1arf(uint32_t nr, uint32_t subnr) {
+      return ud1(GEN_ARCHITECTURE_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister uw1grf(uint32_t nr, uint32_t subnr) {
+      return uw1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister uw8grf(uint32_t nr, uint32_t subnr) {
+      return uw8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister uw16grf(uint32_t nr, uint32_t subnr) {
+      return uw16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister ub1grf(uint32_t nr, uint32_t subnr) {
+      return ub1(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister ub8grf(uint32_t nr, uint32_t subnr) {
+      return ub8(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister ub16grf(uint32_t nr, uint32_t subnr) {
+      return ub16(GEN_GENERAL_REGISTER_FILE, nr, subnr);
+    }
+
+    static INLINE GenRegister unpacked_uw(uint32_t nr, uint32_t subnr) {
+      return GenRegister(GEN_GENERAL_REGISTER_FILE,
+                         nr,
+                         subnr,
+                         GEN_TYPE_UW,
+                         GEN_VERTICAL_STRIDE_16,
+                         GEN_WIDTH_8,
+                         GEN_HORIZONTAL_STRIDE_2);
+    }
+
+    static INLINE GenRegister packed_ud(uint32_t nr, uint32_t subnr) {
+      return GenRegister(GEN_GENERAL_REGISTER_FILE,
+                         nr,
+                         subnr,
+                         GEN_TYPE_UD,
+                         GEN_VERTICAL_STRIDE_8,
+                         GEN_WIDTH_4,
+                         GEN_HORIZONTAL_STRIDE_1);
+    }
+
+    static INLINE GenRegister unpacked_ud(uint32_t nr, uint32_t subnr) {
+      return GenRegister(GEN_GENERAL_REGISTER_FILE,
+                         nr,
+                         subnr,
+                         GEN_TYPE_UD,
+                         GEN_VERTICAL_STRIDE_8,
+                         GEN_WIDTH_4,
+                         GEN_HORIZONTAL_STRIDE_2);
+    }
+
+    static INLINE GenRegister mask(uint32_t subnr) {
+      return uw1(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_MASK, subnr);
+    }
+
+    static INLINE GenRegister addr1(uint32_t subnr) {
+      return uw1(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_ADDRESS, subnr);
+    }
+
+    static INLINE GenRegister addr8(uint32_t subnr) {
+      return uw8(GEN_ARCHITECTURE_REGISTER_FILE, GEN_ARF_ADDRESS, subnr);
+    }
+
+    static INLINE GenRegister negate(GenRegister reg) {
+      if (reg.file != GEN_IMMEDIATE_VALUE)
+        reg.negation ^= 1;
+      else {
+        if (reg.type == GEN_TYPE_F)
+          reg.value.f = -reg.value.f;
+        else if (reg.type == GEN_TYPE_UD)
+          reg.value.ud = -reg.value.ud;
+        else if (reg.type == GEN_TYPE_D)
+          reg.value.d = -reg.value.d;
+        else if (reg.type == GEN_TYPE_UW) {
+          const uint16_t uw = reg.value.ud & 0xffff;
+          reg = GenRegister::immuw(-uw);
+        } else if (reg.type == GEN_TYPE_W) {
+          const uint16_t uw = reg.value.ud & 0xffff;
+          reg = GenRegister::immw(-(int16_t)uw);
+        } else
+          NOT_SUPPORTED;
+      }
+      return reg;
+    }
+
+    static INLINE GenRegister abs(GenRegister reg) {
+      reg.absolute = 1;
+      reg.negation = 0;
+      return reg;
+    }
+
+    /*! Generate register encoding with run-time simdWidth */
+#define DECL_REG_ENCODER(NAME, SIMD16, SIMD8, SIMD1) \
+    template <typename... Args> \
+    static INLINE GenRegister NAME(uint32_t simdWidth, Args... values) { \
+      if (simdWidth == 16) \
+        return SIMD16(values...); \
+      else if (simdWidth == 8) \
+        return SIMD8(values...); \
+      else if (simdWidth == 1) \
+        return SIMD1(values...); \
+      else { \
+        NOT_IMPLEMENTED; \
+        return SIMD1(values...); \
+      } \
+    }
+    DECL_REG_ENCODER(dfxgrf, df16grf, df8grf, df1grf);
+    DECL_REG_ENCODER(fxgrf, f16grf, f8grf, f1grf);
+    DECL_REG_ENCODER(uwxgrf, uw16grf, uw8grf, uw1grf);
+    DECL_REG_ENCODER(udxgrf, ud16grf, ud8grf, ud1grf);
+#undef DECL_REG_ENCODER
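+
+    // Illustrative sketch (not part of the original source): these encoders let
+    // callers pick the register layout from the run-time SIMD width, e.g.
+    //   GenRegister dst = GenRegister::udxgrf(simdWidth, reg);
+    // resolves to ud16grf / ud8grf / ud1grf for widths 16 / 8 / 1 respectively.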
+  };
+} /* namespace gbe */
+
+#endif /* __GEN_REGISTER_HPP__ */
+
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
new file mode 100644
index 0000000..787d111
--- /dev/null
+++ b/backend/src/backend/program.cpp
@@ -0,0 +1,1317 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file callback interface for the compiler
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "program.h"
+#include "program.hpp"
+#include "gen_program.h"
+#include "sys/platform.hpp"
+#include "sys/cvar.hpp"
+#include "ir/liveness.hpp"
+#include "ir/value.hpp"
+#include "ir/unit.hpp"
+#include "ir/printf.hpp"
+#include "llvm/llvm_to_gen.hpp"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Threading.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IR/LLVMContext.h"
+#include <cstring>
+#include <algorithm>
+#include <fstream>
+#include <dlfcn.h>
+#include <sstream>
+#include <iostream>
+#include <unistd.h>
+#include <mutex>
+
+/* Not defined for LLVM 3.0 */
+#if !defined(LLVM_VERSION_MAJOR)
+#define LLVM_VERSION_MAJOR 3
+#endif /* !defined(LLVM_VERSION_MAJOR) */
+
+/* Not defined for LLVM 3.0 */
+#if !defined(LLVM_VERSION_MINOR)
+#define LLVM_VERSION_MINOR 0
+#endif /* !defined(LLVM_VERSION_MINOR) */
+
+#include <clang/CodeGen/CodeGenAction.h>
+#include <clang/Frontend/CompilerInstance.h>
+#include <clang/Frontend/CompilerInvocation.h>
+#if LLVM_VERSION_MINOR <= 1
+#include <clang/Frontend/DiagnosticOptions.h>
+#else
+#include <clang/Basic/DiagnosticOptions.h>
+#endif  /* LLVM_VERSION_MINOR <= 1 */
+#include <clang/Frontend/TextDiagnosticPrinter.h>
+#include <clang/Basic/TargetInfo.h>
+#include <clang/Basic/TargetOptions.h>
+#include <llvm/ADT/IntrusiveRefCntPtr.h>
+#if LLVM_VERSION_MINOR <= 2
+#include <llvm/Module.h>
+#else
+#include <llvm/IR/Module.h>
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include <llvm/Bitcode/ReaderWriter.h>
+#include <llvm/Support/raw_ostream.h>
+#include "src/GBEConfig.h"
+
+namespace gbe {
+
+  Kernel::Kernel(const std::string &name) :
+    name(name), args(NULL), argNum(0), curbeSize(0), stackSize(0), useSLM(false),
+        slmSize(0), ctx(NULL), samplerSet(NULL), imageSet(NULL), printfSet(NULL) {}
+  Kernel::~Kernel(void) {
+    if(ctx) GBE_DELETE(ctx);
+    if(samplerSet) GBE_DELETE(samplerSet);
+    if(imageSet) GBE_DELETE(imageSet);
+    if(printfSet) GBE_DELETE(printfSet);
+    GBE_SAFE_DELETE_ARRAY(args);
+  }
+  int32_t Kernel::getCurbeOffset(gbe_curbe_type type, uint32_t subType) const {
+    const PatchInfo patch(type, subType);
+    const auto it = std::lower_bound(patches.begin(), patches.end(), patch);
+    if (it == patches.end()) return -1; // nothing found
+    if (patch < *it) return -1; // they are not equal
+    return it->offset; // we found it!
+  }
+
+  Program::Program(void) : constantSet(NULL) {}
+  Program::~Program(void) {
+    for (auto &kernel : kernels) GBE_DELETE(kernel.second);
+    if (constantSet) delete constantSet;
+  }
+
+#ifdef GBE_COMPILER_AVAILABLE
+  BVAR(OCL_OUTPUT_GEN_IR, false);
+
+  bool Program::buildFromLLVMFile(const char *fileName, const void* module, std::string &error, int optLevel) {
+    ir::Unit *unit = new ir::Unit();
+    llvm::Module * cloned_module = NULL;
+    if(module){
+      cloned_module = llvm::CloneModule((llvm::Module*)module);
+    }
+    if (llvmToGen(*unit, fileName, module, optLevel) == false) {
+      if (fileName)
+        error = std::string(fileName) + " not found";
+      delete unit;
+      return false;
+    }
+    // If the unit is not valid, some construct introduced by the passes may not be
+    // supported by the backend; use optLevel 0 to try again.
+    if(!unit->getValid()) {
+      delete unit;   //clear unit
+      unit = new ir::Unit();
+      if(cloned_module){
+        llvmToGen(*unit, fileName, cloned_module, 0);  // assume the file exists and llvmToGen will not fail this time
+      }else{
+        llvmToGen(*unit, fileName, module, 0);  // assume the file exists and llvmToGen will not fail this time
+      }
+    }
+    assert(unit->getValid());
+    this->buildFromUnit(*unit, error);
+    delete unit;
+    if(cloned_module){
+      delete (llvm::Module*) cloned_module;
+    }
+    return true;
+  }
+
+  BVAR(OCL_STRICT_CONFORMANCE, false);
+
+  bool Program::buildFromUnit(const ir::Unit &unit, std::string &error) {
+    constantSet = new ir::ConstantSet(unit.getConstantSet());
+    const auto &set = unit.getFunctionSet();
+    const uint32_t kernelNum = set.size();
+    if (OCL_OUTPUT_GEN_IR) std::cout << unit;
+    if (kernelNum == 0) return true;
+    for (const auto &pair : set) {
+      const std::string &name = pair.first;
+      Kernel *kernel = this->compileKernel(unit, name, !OCL_STRICT_CONFORMANCE);
+      kernel->setSamplerSet(pair.second->getSamplerSet());
+      kernel->setImageSet(pair.second->getImageSet());
+      kernel->setPrintfSet(pair.second->getPrintfSet());
+      kernel->setCompileWorkGroupSize(pair.second->getCompileWorkGroupSize());
+      kernel->setFunctionAttributes(pair.second->getFunctionAttributes());
+      kernels.insert(std::make_pair(name, kernel));
+    }
+    return true;
+  }
+#endif
+
+#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
+#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
+
+  size_t Program::serializeToBin(std::ostream& outs) {
+    size_t ret_size = 0;
+    size_t ker_num = kernels.size();
+    int has_constset = 0;
+
+    OUT_UPDATE_SZ(magic_begin);
+
+    if (constantSet) {
+      has_constset = 1;
+      OUT_UPDATE_SZ(has_constset);
+      size_t sz = constantSet->serializeToBin(outs);
+      if (!sz)
+        return 0;
+
+      ret_size += sz;
+    } else {
+      OUT_UPDATE_SZ(has_constset);
+    }
+
+    OUT_UPDATE_SZ(ker_num);
+    for (auto ker : kernels) {
+      size_t sz = ker.second->serializeToBin(outs);
+      if (!sz)
+        return 0;
+
+      ret_size += sz;
+    }
+
+    OUT_UPDATE_SZ(magic_end);
+
+    OUT_UPDATE_SZ(ret_size);
+    return ret_size;
+  }
+
+  size_t Program::deserializeFromBin(std::istream& ins) {
+    size_t total_size = 0;
+    int has_constset = 0;
+    size_t ker_num;
+    uint32_t magic;
+
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_begin)
+      return 0;
+
+    IN_UPDATE_SZ(has_constset);
+    if(has_constset) {
+      constantSet = new ir::ConstantSet;
+      size_t sz = constantSet->deserializeFromBin(ins);
+
+      if (sz == 0) {
+        return 0;
+      }
+
+      total_size += sz;
+    }
+
+    IN_UPDATE_SZ(ker_num);
+
+    for (size_t i = 0; i < ker_num; i++) {
+      size_t ker_serial_sz;
+      std::string ker_name; // Just an empty name here.
+      Kernel* ker = allocateKernel(ker_name);
+
+      if(!(ker_serial_sz = ker->deserializeFromBin(ins)))
+        return 0;
+
+      kernels.insert(std::make_pair(ker->getName(), ker));
+      total_size += ker_serial_sz;
+    }
+
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_end)
+      return 0;
+
+    size_t total_bytes;
+    IN_UPDATE_SZ(total_bytes);
+    if (total_bytes + sizeof(total_size) != total_size)
+      return 0;
+
+    return total_size;
+  }
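+
+  // Illustrative note (not part of the original source): serializeToBin and
+  // deserializeFromBin are symmetric, so a program written to a std::stringstream
+  // can be read back with its constant set and kernels intact; both calls return
+  // 0 on failure and the serialized size on success.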
+
+  size_t Kernel::serializeToBin(std::ostream& outs) {
+    unsigned int i;
+    size_t ret_size = 0;
+    int has_samplerset = 0;
+    int has_imageset = 0;
+
+    OUT_UPDATE_SZ(magic_begin);
+
+    OUT_UPDATE_SZ(name.size());
+    outs.write(name.c_str(), name.size());
+    ret_size += sizeof(char)*name.size();
+
+    OUT_UPDATE_SZ(argNum);
+    for (i = 0; i < argNum; i++) {
+      KernelArgument& arg = args[i];
+      OUT_UPDATE_SZ(arg.type);
+      OUT_UPDATE_SZ(arg.size);
+      OUT_UPDATE_SZ(arg.align);
+      OUT_UPDATE_SZ(arg.bti);
+    }
+
+    OUT_UPDATE_SZ(patches.size());
+    for (auto patch : patches) {
+      unsigned int tmp;
+      tmp = patch.type;
+      OUT_UPDATE_SZ(tmp);
+      tmp = patch.subType;
+      OUT_UPDATE_SZ(tmp);
+      tmp = patch.offset;
+      OUT_UPDATE_SZ(tmp);
+    }
+
+    OUT_UPDATE_SZ(curbeSize);
+    OUT_UPDATE_SZ(simdWidth);
+    OUT_UPDATE_SZ(stackSize);
+    OUT_UPDATE_SZ(scratchSize);
+    OUT_UPDATE_SZ(useSLM);
+    OUT_UPDATE_SZ(slmSize);
+    OUT_UPDATE_SZ(compileWgSize[0]);
+    OUT_UPDATE_SZ(compileWgSize[1]);
+    OUT_UPDATE_SZ(compileWgSize[2]);
+    /* samplers. */
+    if (!samplerSet->empty()) {   //samplerSet is always valid, allocated in Function::Function
+      has_samplerset = 1;
+      OUT_UPDATE_SZ(has_samplerset);
+      size_t sz = samplerSet->serializeToBin(outs);
+      if (!sz)
+        return 0;
+
+      ret_size += sz;
+    } else {
+      OUT_UPDATE_SZ(has_samplerset);
+    }
+
+    /* images. */
+    if (!imageSet->empty()) {   //imageSet is always valid, allocated in Function::Function
+      has_imageset = 1;
+      OUT_UPDATE_SZ(has_imageset);
+      size_t sz = imageSet->serializeToBin(outs);
+      if (!sz)
+        return 0;
+
+      ret_size += sz;
+    } else {
+      OUT_UPDATE_SZ(has_imageset);
+    }
+
+    /* Code. */
+    const char * code = getCode();
+    OUT_UPDATE_SZ(getCodeSize());
+    outs.write(code, getCodeSize()*sizeof(char));
+    ret_size += getCodeSize()*sizeof(char);
+
+    OUT_UPDATE_SZ(magic_end);
+
+    OUT_UPDATE_SZ(ret_size);
+    return ret_size;
+  }
+
+  size_t Kernel::deserializeFromBin(std::istream& ins) {
+    size_t total_size = 0;
+    int has_samplerset = 0;
+    int has_imageset = 0;
+    size_t code_size = 0;
+    uint32_t magic = 0;
+    size_t patch_num = 0;
+
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_begin)
+      return 0;
+
+    size_t name_len;
+    IN_UPDATE_SZ(name_len);
+    char* c_name = new char[name_len+1];
+    ins.read(c_name, name_len*sizeof(char));
+    total_size += sizeof(char)*name_len;
+    c_name[name_len] = 0;
+    name = c_name;
+    delete[] c_name;
+
+    IN_UPDATE_SZ(argNum);
+    args = GBE_NEW_ARRAY_NO_ARG(KernelArgument, argNum);
+    for (uint32_t i = 0; i < argNum; i++) {
+      KernelArgument& arg = args[i];
+      IN_UPDATE_SZ(arg.type);
+      IN_UPDATE_SZ(arg.size);
+      IN_UPDATE_SZ(arg.align);
+      IN_UPDATE_SZ(arg.bti);
+    }
+
+    IN_UPDATE_SZ(patch_num);
+    for (uint32_t i = 0; i < patch_num; i++) {
+      unsigned int tmp;
+      PatchInfo patch;
+      IN_UPDATE_SZ(tmp);
+      patch.type = tmp;
+      IN_UPDATE_SZ(tmp);
+      patch.subType = tmp;
+      IN_UPDATE_SZ(tmp);
+      patch.offset = tmp;
+
+      patches.push_back(patch);
+    }
+
+    IN_UPDATE_SZ(curbeSize);
+    IN_UPDATE_SZ(simdWidth);
+    IN_UPDATE_SZ(stackSize);
+    IN_UPDATE_SZ(scratchSize);
+    IN_UPDATE_SZ(useSLM);
+    IN_UPDATE_SZ(slmSize);
+    IN_UPDATE_SZ(compileWgSize[0]);
+    IN_UPDATE_SZ(compileWgSize[1]);
+    IN_UPDATE_SZ(compileWgSize[2]);
+
+    IN_UPDATE_SZ(has_samplerset);
+    if (has_samplerset) {
+      samplerSet = GBE_NEW(ir::SamplerSet);
+      size_t sz = samplerSet->deserializeFromBin(ins);
+      if (sz == 0) {
+        return 0;
+      }
+
+      total_size += sz;
+    }
+    else
+      samplerSet = NULL;
+
+    IN_UPDATE_SZ(has_imageset);
+    if (has_imageset) {
+      imageSet = GBE_NEW(ir::ImageSet);
+      size_t sz = imageSet->deserializeFromBin(ins);
+      if (sz == 0) {
+        return 0;
+      }
+
+      total_size += sz;
+    }
+    else
+      imageSet = NULL;
+
+    IN_UPDATE_SZ(code_size);
+    if (code_size) {
+      char* code = GBE_NEW_ARRAY_NO_ARG(char, code_size);
+      ins.read(code, code_size*sizeof(char));
+      total_size += sizeof(char)*code_size;
+      setCode(code, code_size);
+    }
+
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_end)
+      return 0;
+
+    size_t total_bytes;
+    IN_UPDATE_SZ(total_bytes);
+    if (total_bytes + sizeof(total_size) != total_size)
+      return 0;
+
+    return total_size;
+  }
+
+#undef OUT_UPDATE_SZ
+#undef IN_UPDATE_SZ
+
+  void Program::printStatus(int indent, std::ostream& outs) {
+    using namespace std;
+    string spaces = indent_to_str(indent);
+
+    outs << spaces << "=============== Begin Program ===============" << "\n";
+
+    if (constantSet) {
+      constantSet->printStatus(indent + 4, outs);
+    }
+
+    for (auto ker : kernels) {
+      ker.second->printStatus(indent + 4, outs);
+    }
+
+    outs << spaces << "================ End Program ================" << "\n";
+  }
+
+  void Kernel::printStatus(int indent, std::ostream& outs) {
+    using namespace std;
+    string spaces = indent_to_str(indent);
+    string spaces_nl = indent_to_str(indent + 4);
+    int num;
+
+    outs << spaces << "+++++++++++ Begin Kernel +++++++++++" << "\n";
+    outs << spaces_nl << "Kernel Name: " << name << "\n";
+    outs << spaces_nl << "  curbeSize: " << curbeSize << "\n";
+    outs << spaces_nl << "  simdWidth: " << simdWidth << "\n";
+    outs << spaces_nl << "  stackSize: " << stackSize << "\n";
+    outs << spaces_nl << "  scratchSize: " << scratchSize << "\n";
+    outs << spaces_nl << "  useSLM: " << useSLM << "\n";
+    outs << spaces_nl << "  slmSize: " << slmSize << "\n";
+    outs << spaces_nl << "  compileWgSize: " << compileWgSize[0] << compileWgSize[1] << compileWgSize[2] << "\n";
+
+    outs << spaces_nl << "  Argument Number is " << argNum << "\n";
+    for (uint32_t i = 0; i < argNum; i++) {
+      KernelArgument& arg = args[i];
+      outs << spaces_nl << "  Arg " << i << ":\n";
+      outs << spaces_nl << "      type value: "<< arg.type << "\n";
+      outs << spaces_nl << "      size: "<< arg.size << "\n";
+      outs << spaces_nl << "      align: "<< arg.align << "\n";
+      outs << spaces_nl << "      bti: "<< arg.bti << "\n";
+    }
+
+    outs << spaces_nl << "  Patches Number is " << patches.size() << "\n";
+    num = 0;
+    for (auto patch : patches) {
+      num++;
+      outs << spaces_nl << "  patch " << num << ":\n";
+      outs << spaces_nl << "      type value: "<< patch.type << "\n";
+      outs << spaces_nl << "      subtype value: "<< patch.subType << "\n";
+      outs << spaces_nl << "      offset: "<< patch.offset << "\n";
+    }
+
+    if (samplerSet) {
+      samplerSet->printStatus(indent + 4, outs);
+    }
+
+    if (imageSet) {
+      imageSet->printStatus(indent + 4, outs);
+    }
+
+    outs << spaces << "++++++++++++ End Kernel ++++++++++++" << "\n";
+  }
+
+  /*********************** End of Program class member function *************************/
+
+#define REDEF_MATH_FUNC(x) "#ifdef "#x"\n#undef "#x"\n#endif\n#define "#x" __gen_ocl_internal_fastpath_"#x"\n"
+  std::string ocl_mathfunc_fastpath_str =
+    REDEF_MATH_FUNC(acosh)
+    REDEF_MATH_FUNC(asinh)
+    REDEF_MATH_FUNC(atanh)
+    REDEF_MATH_FUNC(cbrt)
+    REDEF_MATH_FUNC(cos)
+    REDEF_MATH_FUNC(cosh)
+    REDEF_MATH_FUNC(cospi)
+    REDEF_MATH_FUNC(exp)
+    REDEF_MATH_FUNC(exp10)
+    REDEF_MATH_FUNC(expm1)
+    REDEF_MATH_FUNC(fmod)
+    REDEF_MATH_FUNC(hypot)
+    REDEF_MATH_FUNC(ilogb)
+    REDEF_MATH_FUNC(ldexp)
+    REDEF_MATH_FUNC(log)
+    REDEF_MATH_FUNC(log2)
+    REDEF_MATH_FUNC(log10)
+    REDEF_MATH_FUNC(log1p)
+    REDEF_MATH_FUNC(logb)
+    REDEF_MATH_FUNC(remainder)
+    REDEF_MATH_FUNC(rootn)
+    REDEF_MATH_FUNC(sin)
+    REDEF_MATH_FUNC(sincos)
+    REDEF_MATH_FUNC(sinh)
+    REDEF_MATH_FUNC(sinpi)
+    REDEF_MATH_FUNC(tan)
+    REDEF_MATH_FUNC(tanh)
+    "\n"
+  ;
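+
+  // For illustration (not part of the original source): REDEF_MATH_FUNC(cos) expands to
+  //   "#ifdef cos\n#undef cos\n#endif\n#define cos __gen_ocl_internal_fastpath_cos\n"
+  // so, when strict conformance is off, calls to cos() in the user kernel are
+  // redirected to the faster __gen_ocl_internal_fastpath_cos implementation.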
+
+  static void programDelete(gbe_program gbeProgram) {
+    gbe::Program *program = (gbe::Program*)(gbeProgram);
+    GBE_SAFE_DELETE(program);
+  }
+
+  static void programCleanLlvmResource(gbe_program gbeProgram) {
+    gbe::Program *program = (gbe::Program*)(gbeProgram);
+    program->CleanLlvmResource();
+  }
+
+#ifdef GBE_COMPILER_AVAILABLE
+  BVAR(OCL_OUTPUT_BUILD_LOG, false);
+  SVAR(OCL_PCH_PATH, PCH_OBJECT_DIR);
+  SVAR(OCL_PCM_PATH, PCM_OBJECT_DIR);
+
+  static bool buildModuleFromSource(const char* input, llvm::Module** out_module, llvm::LLVMContext* llvm_ctx, std::string options,
+                                    size_t stringSize, char *err, size_t *errSize) {
+    // Arguments to pass to the clang frontend
+    vector<const char *> args;
+    bool bFastMath = false;
+
+    vector<std::string> useless; // hold the substrings so the c_str() pointers pushed into args stay valid
+    size_t start = 0, end = 0;
+    /* FIXME
+       Options not supported by clang:
+       -cl-denorms-are-zero, -cl-strict-aliasing
+       -cl-no-signed-zeros, -cl-fp32-correctly-rounded-divide-sqrt
+       For all supported options, refer to clang/include/clang/Driver/Options.inc
+    */
+    //Handle -cl-opt-disable in llvmToGen, skip here
+    const std::string unsupportedOptions("-cl-denorms-are-zero, -cl-strict-aliasing, -cl-opt-disable,"
+                                         "-cl-no-signed-zeros, -cl-fp32-correctly-rounded-divide-sqrt");
+    bool useDefaultCLCVersion = true;
+    while (end != std::string::npos) {
+      end = options.find(' ', start);
+      std::string str = options.substr(start, end - start);
+      start = end + 1;
+      if(str.size() == 0)
+        continue;
+      if(str == "-cl-fast-relaxed-math") bFastMath = true;
+      if(unsupportedOptions.find(str) != std::string::npos)
+        continue;
+      if(str.find("-cl-std=") != std::string::npos) {
+        useDefaultCLCVersion = false;
+        if (str == "-cl-std=CL1.1")
+          args.push_back("-D__OPENCL_C_VERSION__=110");
+        else if (str == "-cl-std=CL1.2")
+          args.push_back("-D__OPENCL_C_VERSION__=120");
+        else {
+          if (err && stringSize > 0 && errSize)
+            *errSize = snprintf(err, stringSize, "Invalid build option: %s\n", str.c_str());
+          return false;
+        }
+      }
+      useless.push_back(str);
+      args.push_back(str.c_str());
+    }
+    if (useDefaultCLCVersion) {
+      args.push_back("-D__OPENCL_C_VERSION__=120");
+      args.push_back("-cl-std=CL1.2");
+    }
+    args.push_back("-mllvm");
+    args.push_back("-inline-threshold=200000");
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+    args.push_back("-DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND");
+#endif
+    args.push_back("-emit-llvm");
+    // FIXME we haven't implemented those builtin functions,
+    // so disable them for now.
+    args.push_back("-fno-builtin");
+    args.push_back("-disable-llvm-optzns");
+    if(bFastMath)
+      args.push_back("-D __FAST_RELAXED_MATH__=1");
+#if LLVM_VERSION_MINOR <= 2
+    args.push_back("-triple");
+    args.push_back("nvptx");
+#else
+    args.push_back("-x");
+    args.push_back("cl");
+    args.push_back("-triple");
+    args.push_back("spir");
+#endif /* LLVM_VERSION_MINOR <= 2 */
+    args.push_back(input);
+
+    // The compiler invocation needs a DiagnosticsEngine so it can report problems
+    std::string ErrorString;
+    llvm::raw_string_ostream ErrorInfo(ErrorString);
+    llvm::IntrusiveRefCntPtr<clang::DiagnosticOptions> DiagOpts = new clang::DiagnosticOptions();
+    DiagOpts->ShowCarets = false;
+    DiagOpts->ShowPresumedLoc = true;
+#if LLVM_VERSION_MINOR <= 1
+    args.push_back("-triple");
+    args.push_back("ptx32");
+
+    clang::TextDiagnosticPrinter *DiagClient =
+                             new clang::TextDiagnosticPrinter(ErrorInfo, *DiagOpts);
+    llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> DiagID(new clang::DiagnosticIDs());
+    clang::DiagnosticsEngine Diags(DiagID, DiagClient);
+#else
+    args.push_back("-ffp-contract=off");
+
+    clang::TextDiagnosticPrinter *DiagClient =
+                             new clang::TextDiagnosticPrinter(ErrorInfo, &*DiagOpts);
+    llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> DiagID(new clang::DiagnosticIDs());
+    clang::DiagnosticsEngine Diags(DiagID, &*DiagOpts, DiagClient);
+#endif /* LLVM_VERSION_MINOR <= 1 */
+    // Create the compiler invocation
+    std::unique_ptr<clang::CompilerInvocation> CI(new clang::CompilerInvocation);
+    clang::CompilerInvocation::CreateFromArgs(*CI,
+                                              &args[0],
+                                              &args[0] + args.size(),
+                                              Diags);
+
+    // Create the compiler instance
+    clang::CompilerInstance Clang;
+    Clang.setInvocation(CI.release());
+    // Get ready to report problems
+#if LLVM_VERSION_MINOR <= 2
+    Clang.createDiagnostics(args.size(), &args[0]);
+#else
+    Clang.createDiagnostics(DiagClient, false);
+#endif /* LLVM_VERSION_MINOR <= 2 */
+
+    Clang.getDiagnosticOpts().ShowCarets = false;
+    if (!Clang.hasDiagnostics())
+      return false;
+
+    // Set Language
+    clang::LangOptions & lang_opts = Clang.getLangOpts();
+    lang_opts.OpenCL = 1;
+
+    clang::PreprocessorOptions& prep_opt = Clang.getPreprocessorOpts();
+    prep_opt.DisablePCHValidation = 1;
+
+    //llvm flags need command line parsing to take effect
+    if (!Clang.getFrontendOpts().LLVMArgs.empty()) {
+      unsigned NumArgs = Clang.getFrontendOpts().LLVMArgs.size();
+      const char **Args = new const char*[NumArgs + 2];
+      Args[0] = "clang (LLVM option parsing)";
+      for (unsigned i = 0; i != NumArgs; ++i){
+        Args[i + 1] = Clang.getFrontendOpts().LLVMArgs[i].c_str();
+      }
+      Args[NumArgs + 1] = 0;
+      llvm::cl::ParseCommandLineOptions(NumArgs + 1, Args);
+      delete [] Args;
+    }
+
+    // Create an action and make the compiler instance carry it out
+    std::unique_ptr<clang::CodeGenAction> Act(new clang::EmitLLVMOnlyAction(llvm_ctx));
+
+    std::string dirs = OCL_PCM_PATH;
+    std::string pcmFileName;
+    std::istringstream idirs(dirs);
+    bool findPcm = false;
+
+    while (getline(idirs, pcmFileName, ':')) {
+      if(access(pcmFileName.c_str(), R_OK) == 0) {
+        findPcm |= true;
+        break;
+      }
+    }
+
+    GBE_ASSERT(findPcm && "Could not find pre compiled module library.\n");
+
+    Clang.getCodeGenOpts().LinkBitcodeFile = pcmFileName;
+    auto retVal = Clang.ExecuteAction(*Act);
+
+    if (err != NULL) {
+      GBE_ASSERT(errSize != NULL);
+      *errSize = ErrorString.copy(err, stringSize - 1, 0);
+    }
+
+    if (err == NULL || OCL_OUTPUT_BUILD_LOG) {
+      // flush the error messages to the errs() if there is no
+      // error string buffer.
+      llvm::errs() << ErrorString;
+    }
+    ErrorString.clear();
+    if (!retVal)
+      return false;
+
+    llvm::Module *module = Act->takeModule();
+
+    *out_module = module;
+    return true;
+  }
+
+  extern std::string ocl_stdlib_str;
+
+  BVAR(OCL_USE_PCH, true);
+  static void processSourceAndOption(const char *source,
+                                     const char *options,
+                                     const char *temp_header_path,
+                                     std::string& clOpt,
+                                     std::string& clName,
+                                     int& optLevel)
+  {
+    char clStr[] = "/tmp/XXXXXX.cl";
+    int clFd = mkstemps(clStr, 3);
+    clName = std::string(clStr);
+
+    FILE *clFile = fdopen(clFd, "w");
+    FATAL_IF(clFile == NULL, "Failed to open temporary file");
+
+    bool usePCH = OCL_USE_PCH;
+    bool findPCH = false;
+
+    /* Because our header file is so big, we want to avoid recompiling it from
+       scratch. We use Clang's PCH support to save the large compilation time.
+       We build the PCH with the most general build options, so if the user
+       passes new build options here, the PCH may fail Clang's compatibility
+       validation. Clang performs three kinds of compatibility check: Language
+       Options, Target Options and Preprocessing Options. Other kinds of
+       options, such as the CodeGen options, do not affect the AST, so there is
+       no need to check them.
+
+       According to the OpenCL 1.1 spec, the CL build options:
+       -D name=definition
+       Compatible as long as the definition is not used in our header.
+
+       -cl-single-precision-constant
+       -cl-denorms-are-zero
+       -cl-std=
+       Language options, really affect compatibility.
+
+       -cl-opt-disable
+       -cl-mad-enable
+       -cl-no-signed-zeros
+       -cl-unsafe-math-optimizations
+       -cl-finite-math-only
+       -cl-fast-relaxed-math
+       CodeGen options, do not affect compatibility.
+
+       -Werror
+       -w
+       Our header should not block compilation because of warnings.
+
+       So we simply disable Clang's PCH validation and do the check ourselves. */
+
+    /* We always add -cl-kernel-arg-info to the options. This option just generates the argument
+       information for the backend; it has no other side effects and no performance cost. */
+    if (!options || !strstr(const_cast<char *>(options), "-cl-kernel-arg-info"))
+      clOpt += "-cl-kernel-arg-info ";
+
+    if (options) {
+      char *p;
+      /* FIXME: Even though we can disable the PCH validity check and load the PCH
+         successfully, these language options and pre-defined macros will still send
+         diagnostics to Clang's diagnostics engine and cause Clang to report errors.
+         We filter them all out here to avoid that. */
+      const char * incompatible_opts[] = {
+          "-cl-single-precision-constant",
+//        "-cl-denorms-are-zero",
+          "-cl-fast-relaxed-math",
+          "-cl-std=CL1.1"
+      };
+      const char * incompatible_defs[] = {
+          "GET_FLOAT_WORD",
+          "__NV_CL_C_VERSION",
+          "GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND"
+      };
+
+      for (unsigned int i = 0; i < sizeof(incompatible_opts)/sizeof(char *); i++ ) {
+        p = strstr(const_cast<char *>(options), incompatible_opts[i]);
+        if (p) {
+          usePCH = false;
+          break;
+        }
+      }
+
+      if (usePCH) {
+        for (unsigned int i = 0; i < sizeof(incompatible_defs)/sizeof(char *); i++ ) {
+          p = strstr(const_cast<char *>(options), incompatible_defs[i]);
+          if (p) {
+            usePCH = false;
+            break;
+          }
+        }
+      }
+
+      p = strstr(const_cast<char *>(options), "-cl-opt-disable");
+      if (p)
+        optLevel = 0;
+      // XXX Enabling cl_khr_fp64 may expose potential bugs.
+      // We may need to revisit this later when we want to support fp64 completely.
+      // For now, as we don't actually support fp64, just disable it by default.
+#if 0
+      #define ENABLE_CL_KHR_FP64_STR "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+      if (!strstr(const_cast<char *>(options), "-cl-std=CL1.1"))
+        fwrite(ENABLE_CL_KHR_FP64_STR, strlen(ENABLE_CL_KHR_FP64_STR), 1, clFile);
+#endif
+
+      clOpt += options;
+    }
+
+    std::string dirs = OCL_PCH_PATH;
+    std::istringstream idirs(dirs);
+    std::string pchFileName;
+
+    while (getline(idirs, pchFileName, ':')) {
+      if(access(pchFileName.c_str(), R_OK) == 0) {
+        findPCH = true;
+        break;
+      }
+    }
+
+    if (usePCH && findPCH) {
+      clOpt += " -include-pch ";
+      clOpt += pchFileName;
+      clOpt += " ";
+    } else
+      fwrite(ocl_stdlib_str.c_str(), strlen(ocl_stdlib_str.c_str()), 1, clFile);
+
+    //for clCompilerProgram usage.
+    if(temp_header_path){
+      clOpt += " -I ";
+      clOpt += temp_header_path;
+      clOpt += " ";
+    }
+
+    if (!OCL_STRICT_CONFORMANCE) {
+        fwrite(ocl_mathfunc_fastpath_str.c_str(), strlen(ocl_mathfunc_fastpath_str.c_str()), 1, clFile);
+    }
+
+    // reset the reported line number in case we have inserted something before the kernel source
+    std::string resetFileNum = "#line 1\n";
+    fwrite(resetFileNum.c_str(), strlen(resetFileNum.c_str()), 1, clFile);
+
+    // Write the source to the cl file
+    fwrite(source, strlen(source), 1, clFile);
+    fclose(clFile);
+  }
+
+  static gbe_program programNewFromSource(uint32_t deviceID,
+                                          const char *source,
+                                          size_t stringSize,
+                                          const char *options,
+                                          char *err,
+                                          size_t *errSize)
+  {
+    int optLevel = 1;
+    std::string clOpt;
+    std::string clName;
+    processSourceAndOption(source, options, NULL, clOpt, clName, optLevel);
+
+    gbe_program p;
+    // The module (and the associated action) will be deleted in GenProgram::CleanLlvmResource().
+    llvm::Module * out_module;
+    llvm::LLVMContext* llvm_ctx = new llvm::LLVMContext;
+
+    static std::mutex llvm_mutex;
+    if (!llvm::llvm_is_multithreaded())
+      llvm_mutex.lock();
+
+    if (buildModuleFromSource(clName.c_str(), &out_module, llvm_ctx, clOpt.c_str(),
+                              stringSize, err, errSize)) {
+    // Now build the program from llvm
+      size_t clangErrSize = 0;
+      if (err != NULL) {
+        GBE_ASSERT(errSize != NULL);
+        stringSize -= *errSize;
+        err += *errSize;
+        clangErrSize = *errSize;
+      }
+
+      p = gbe_program_new_from_llvm(deviceID, NULL, out_module, llvm_ctx, stringSize,
+                                    err, errSize, optLevel);
+      if (err != NULL)
+        *errSize += clangErrSize;
+      if (OCL_OUTPUT_BUILD_LOG && options)
+        llvm::errs() << options;
+    } else
+      p = NULL;
+
+    if (!llvm::llvm_is_multithreaded())
+      llvm_mutex.unlock();
+
+    remove(clName.c_str());
+    return p;
+  }
+#endif
+
+#ifdef GBE_COMPILER_AVAILABLE
+
+  static gbe_program programCompileFromSource(uint32_t deviceID,
+                                          const char *source,
+                                          const char *temp_header_path,
+                                          size_t stringSize,
+                                          const char *options,
+                                          char *err,
+                                          size_t *errSize)
+  {
+    int optLevel = 1;
+    std::string clOpt;
+    std::string clName;
+    processSourceAndOption(source, options, temp_header_path, clOpt, clName, optLevel);
+
+    gbe_program p;
+    acquireLLVMContextLock();
+    // FIXME: if a newly allocated context is used to link two modules, there would be a context
+    // mismatch for some functions, so we use the global context for now; switch to a new context later.
+    llvm::Module * out_module;
+    llvm::LLVMContext* llvm_ctx = &llvm::getGlobalContext();
+    if (buildModuleFromSource(clName.c_str(), &out_module, llvm_ctx, clOpt.c_str(),
+                              stringSize, err, errSize)) {
+    // Now build the program from llvm
+      if (err != NULL) {
+        GBE_ASSERT(errSize != NULL);
+        stringSize -= *errSize;
+        err += *errSize;
+      }
+
+      p = gbe_program_new_gen_program(deviceID, out_module, NULL);
+
+      if (OCL_OUTPUT_BUILD_LOG && options)
+        llvm::errs() << options;
+    } else
+      p = NULL;
+    remove(clName.c_str());
+    releaseLLVMContextLock();
+    return p;
+  }
+#endif
+
+#ifdef GBE_COMPILER_AVAILABLE
+  static void programLinkProgram(gbe_program           dst_program,
+                                 gbe_program           src_program,
+                                 size_t                stringSize,
+                                 char *                err,
+                                 size_t *              errSize)
+  {
+    acquireLLVMContextLock();
+
+    gbe_program_link_from_llvm(dst_program, src_program, stringSize, err, errSize);
+
+    releaseLLVMContextLock();
+
+    if (OCL_OUTPUT_BUILD_LOG && err)
+      llvm::errs() << err;
+  }
+#endif
+
+  static size_t programGetGlobalConstantSize(gbe_program gbeProgram) {
+    if (gbeProgram == NULL) return 0;
+    const gbe::Program *program = (const gbe::Program*) gbeProgram;
+    return program->getGlobalConstantSize();
+  }
+
+  static void programGetGlobalConstantData(gbe_program gbeProgram, char *mem) {
+    if (gbeProgram == NULL) return;
+    const gbe::Program *program = (const gbe::Program*) gbeProgram;
+    program->getGlobalConstantData(mem);
+  }
+
+  static uint32_t programGetKernelNum(gbe_program gbeProgram) {
+    if (gbeProgram == NULL) return 0;
+    const gbe::Program *program = (const gbe::Program*) gbeProgram;
+    return program->getKernelNum();
+  }
+
+  static gbe_kernel programGetKernelByName(gbe_program gbeProgram, const char *name) {
+    if (gbeProgram == NULL) return NULL;
+    const gbe::Program *program = (gbe::Program*) gbeProgram;
+    return (gbe_kernel) program->getKernel(std::string(name));
+  }
+
+  static gbe_kernel programGetKernel(const gbe_program gbeProgram, uint32_t ID) {
+    if (gbeProgram == NULL) return NULL;
+    const gbe::Program *program = (gbe::Program*) gbeProgram;
+    return (gbe_kernel) program->getKernel(ID);
+  }
+
+  static const char *kernelGetName(gbe_kernel genKernel) {
+    if (genKernel == NULL) return NULL;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getName();
+  }
+
+  static const char *kernelGetAttributes(gbe_kernel genKernel) {
+    if (genKernel == NULL) return NULL;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getFunctionAttributes();
+  }
+
+  static const char *kernelGetCode(gbe_kernel genKernel) {
+    if (genKernel == NULL) return NULL;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getCode();
+  }
+
+  static size_t kernelGetCodeSize(gbe_kernel genKernel) {
+    if (genKernel == NULL) return 0u;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getCodeSize();
+  }
+
+  static uint32_t kernelGetArgNum(gbe_kernel genKernel) {
+    if (genKernel == NULL) return 0u;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getArgNum();
+  }
+
+  static void *kernelGetArgInfo(gbe_kernel genKernel, uint32_t argID, uint32_t value) {
+    if (genKernel == NULL) return NULL;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    ir::FunctionArgument::InfoFromLLVM* info = kernel->getArgInfo(argID);
+
+    switch (value) {
+      case GBE_GET_ARG_INFO_ADDRSPACE:
+        return (void*)((unsigned long)info->addrSpace);
+      case GBE_GET_ARG_INFO_TYPE:
+        return (void *)(info->typeName.c_str());
+      case GBE_GET_ARG_INFO_ACCESS:
+        return (void *)(info->accessQual.c_str());
+      case GBE_GET_ARG_INFO_TYPEQUAL:
+        return (void *)(info->typeQual.c_str());
+      case GBE_GET_ARG_INFO_NAME:
+        return (void *)(info->argName.c_str());
+      default:
+        assert(0);
+    }
+
+    return NULL;
+  }
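kernelGetArgInfo packs two kinds of results into one void*: the address-space query returns an integer widened to pointer size, while the type/access/qualifier/name queries return pointers to strings owned by the kernel. A minimal caller-side sketch of decoding both cases; the helper below is hypothetical and not part of this file:

    #include <stdio.h>
    #include "backend/program.h"
    /* Hypothetical helper showing how a caller decodes the void* results. */
    static void dumpArgInfo(gbe_kernel k, uint32_t argID) {
      /* Integral result: cast the pointer value back to an integer. */
      unsigned long addrSpace = (unsigned long)
        gbe_kernel_get_arg_info(k, argID, GBE_GET_ARG_INFO_ADDRSPACE);
      /* String results: pointers to strings kept alive by the kernel object. */
      const char *typeName = (const char *) gbe_kernel_get_arg_info(k, argID, GBE_GET_ARG_INFO_TYPE);
      const char *argName  = (const char *) gbe_kernel_get_arg_info(k, argID, GBE_GET_ARG_INFO_NAME);
      printf("arg %u: %s %s (addrspace %lu)\n", argID, typeName, argName, addrSpace);
    }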
+
+  static uint32_t kernelGetArgSize(gbe_kernel genKernel, uint32_t argID) {
+    if (genKernel == NULL) return 0u;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getArgSize(argID);
+  }
+
+  static uint8_t kernelGetArgBTI(gbe_kernel genKernel, uint32_t argID) {
+    if (genKernel == NULL) return 0u;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getArgBTI(argID);
+  }
+
+  static uint32_t kernelGetArgAlign(gbe_kernel genKernel, uint32_t argID) {
+    if (genKernel == NULL) return 0u;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getArgAlign(argID);
+  }
+  static gbe_arg_type kernelGetArgType(gbe_kernel genKernel, uint32_t argID) {
+    if (genKernel == NULL) return GBE_ARG_INVALID;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getArgType(argID);
+  }
+
+  static uint32_t kernelGetSIMDWidth(gbe_kernel genKernel) {
+    if (genKernel == NULL) return GBE_ARG_INVALID;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getSIMDWidth();
+  }
+
+  static int32_t kernelGetCurbeOffset(gbe_kernel genKernel, gbe_curbe_type type, uint32_t subType) {
+    if (genKernel == NULL) return 0;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getCurbeOffset(type, subType);
+  }
+
+  static int32_t kernelGetCurbeSize(gbe_kernel genKernel) {
+    if (genKernel == NULL) return 0;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getCurbeSize();
+  }
+
+  static int32_t kernelGetStackSize(gbe_kernel genKernel) {
+    if (genKernel == NULL) return 0;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getStackSize();
+  }
+
+  static int32_t kernelGetScratchSize(gbe_kernel genKernel) {
+    if (genKernel == NULL) return 0;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getScratchSize();
+  }
+
+  static int32_t kernelUseSLM(gbe_kernel genKernel) {
+    if (genKernel == NULL) return 0;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getUseSLM() ? 1 : 0;
+  }
+
+  static int32_t kernelGetSLMSize(gbe_kernel genKernel) {
+    if (genKernel == NULL) return 0;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) genKernel;
+    return kernel->getSLMSize();
+  }
+
+  static size_t kernelGetSamplerSize(gbe_kernel gbeKernel) {
+    if (gbeKernel == NULL) return 0;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+    return kernel->getSamplerSize();
+  }
+
+  static void kernelGetSamplerData(gbe_kernel gbeKernel, uint32_t *samplers) {
+    if (gbeKernel == NULL) return;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+    kernel->getSamplerData(samplers);
+  }
+
+  static uint32_t kernelGetPrintfNum(void * printf_info) {
+    if (printf_info == NULL) return 0;
+    const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
+    return ps->getPrintfNum();
+  }
+
+  static void* kernelDupPrintfSet(gbe_kernel gbeKernel) {
+    if (gbeKernel == NULL) return NULL;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+    return kernel->dupPrintfSet();
+  }
+
+  static uint8_t kernelGetPrintfBufBTI(void * printf_info) {
+    if (printf_info == NULL) return 0;
+    const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
+    return ps->getBufBTI();
+  }
+
+  static uint8_t kernelGetPrintfIndexBufBTI(void * printf_info) {
+    if (printf_info == NULL) return 0;
+    const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
+    return ps->getIndexBufBTI();
+  }
+
+  static void kernelReleasePrintfSet(void * printf_info) {
+    if (printf_info == NULL) return;
+    ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
+    delete ps;
+  }
+
+  static uint32_t kernelGetPrintfSizeOfSize(void * printf_info) {
+    if (printf_info == NULL) return 0;
+    const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
+    return ps->getPrintfSizeOfSize();
+  }
+
+  static void kernelOutputPrintf(void * printf_info, void* index_addr,
+                                 void* buf_addr, size_t global_wk_sz0,
+                                 size_t global_wk_sz1, size_t global_wk_sz2)
+  {
+    if (printf_info == NULL) return;
+    ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
+    ps->outputPrintf(index_addr, buf_addr, global_wk_sz0,
+                         global_wk_sz1, global_wk_sz2);
+  }
+
+  static void kernelGetCompileWorkGroupSize(gbe_kernel gbeKernel, size_t wg_size[3]) {
+    if (gbeKernel == NULL) return;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+    kernel->getCompileWorkGroupSize(wg_size);
+  }
+
+  static size_t kernelGetImageSize(gbe_kernel gbeKernel) {
+    if (gbeKernel == NULL) return 0;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+    return kernel->getImageSize();
+  }
+
+  static void kernelGetImageData(gbe_kernel gbeKernel, ImageInfo *images) {
+    if (gbeKernel == NULL) return;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+    kernel->getImageData(images);
+  }
+
+  static uint32_t kernelGetRequiredWorkGroupSize(gbe_kernel kernel, uint32_t dim) {
+    return 0u;
+  }
+} /* namespace gbe */
+
+std::mutex llvm_ctx_mutex;
+void acquireLLVMContextLock()
+{
+  llvm_ctx_mutex.lock();
+}
+
+void releaseLLVMContextLock()
+{
+  llvm_ctx_mutex.unlock();
+}
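acquireLLVMContextLock() and releaseLLVMContextLock() bracket every access to the shared global LLVMContext. A small RAII wrapper, shown only as a sketch of the pattern (it is not defined anywhere in this patch), makes it harder to miss the unlock on an early return:

    // Hypothetical RAII guard around the global LLVM context lock.
    struct LLVMContextLockGuard {
      LLVMContextLockGuard()  { acquireLLVMContextLock(); }
      ~LLVMContextLockGuard() { releaseLLVMContextLock(); }
    };
    // Usage: { LLVMContextLockGuard guard; /* touch the global context */ }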
+
+GBE_EXPORT_SYMBOL gbe_program_new_from_source_cb *gbe_program_new_from_source = NULL;
+GBE_EXPORT_SYMBOL gbe_program_compile_from_source_cb *gbe_program_compile_from_source = NULL;
+GBE_EXPORT_SYMBOL gbe_program_link_program_cb *gbe_program_link_program = NULL;
+GBE_EXPORT_SYMBOL gbe_program_new_from_binary_cb *gbe_program_new_from_binary = NULL;
+GBE_EXPORT_SYMBOL gbe_program_new_from_llvm_binary_cb *gbe_program_new_from_llvm_binary = NULL;
+GBE_EXPORT_SYMBOL gbe_program_serialize_to_binary_cb *gbe_program_serialize_to_binary = NULL;
+GBE_EXPORT_SYMBOL gbe_program_new_from_llvm_cb *gbe_program_new_from_llvm = NULL;
+GBE_EXPORT_SYMBOL gbe_program_new_gen_program_cb *gbe_program_new_gen_program = NULL;
+GBE_EXPORT_SYMBOL gbe_program_link_from_llvm_cb *gbe_program_link_from_llvm = NULL;
+GBE_EXPORT_SYMBOL gbe_program_build_from_llvm_cb *gbe_program_build_from_llvm = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_size = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data = NULL;
+GBE_EXPORT_SYMBOL gbe_program_clean_llvm_resource_cb *gbe_program_clean_llvm_resource = NULL;
+GBE_EXPORT_SYMBOL gbe_program_delete_cb *gbe_program_delete = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_kernel_num_cb *gbe_program_get_kernel_num = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_kernel_by_name_cb *gbe_program_get_kernel_by_name = NULL;
+GBE_EXPORT_SYMBOL gbe_program_get_kernel_cb *gbe_program_get_kernel = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_name_cb *gbe_kernel_get_name = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_attributes_cb *gbe_kernel_get_attributes = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_code_cb *gbe_kernel_get_code = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_code_size_cb *gbe_kernel_get_code_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_arg_num_cb *gbe_kernel_get_arg_num = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_arg_info_cb *gbe_kernel_get_arg_info = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_arg_size_cb *gbe_kernel_get_arg_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_arg_bti_cb *gbe_kernel_get_arg_bti = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_arg_type_cb *gbe_kernel_get_arg_type = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_arg_align_cb *gbe_kernel_get_arg_align = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_group_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_slm_size_cb *gbe_kernel_get_slm_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_compile_wg_size_cb *gbe_kernel_get_compile_wg_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data = NULL;
+GBE_EXPORT_SYMBOL gbe_get_printf_num_cb *gbe_get_printf_num = NULL;
+GBE_EXPORT_SYMBOL gbe_dup_printfset_cb *gbe_dup_printfset = NULL;
+GBE_EXPORT_SYMBOL gbe_get_printf_buf_bti_cb *gbe_get_printf_buf_bti = NULL;
+GBE_EXPORT_SYMBOL gbe_get_printf_indexbuf_bti_cb *gbe_get_printf_indexbuf_bti = NULL;
+GBE_EXPORT_SYMBOL gbe_release_printf_info_cb *gbe_release_printf_info = NULL;
+GBE_EXPORT_SYMBOL gbe_get_printf_sizeof_size_cb *gbe_get_printf_sizeof_size = NULL;
+GBE_EXPORT_SYMBOL gbe_output_printf_cb *gbe_output_printf = NULL;
+
+#ifdef GBE_COMPILER_AVAILABLE
+namespace gbe
+{
+  /* Use a pre-main constructor to set up the callbacks */
+  struct CallBackInitializer
+  {
+    CallBackInitializer(void) {
+      gbe_program_new_from_source = gbe::programNewFromSource;
+      gbe_program_compile_from_source = gbe::programCompileFromSource;
+      gbe_program_link_program = gbe::programLinkProgram;
+      gbe_program_get_global_constant_size = gbe::programGetGlobalConstantSize;
+      gbe_program_get_global_constant_data = gbe::programGetGlobalConstantData;
+      gbe_program_clean_llvm_resource = gbe::programCleanLlvmResource;
+      gbe_program_delete = gbe::programDelete;
+      gbe_program_get_kernel_num = gbe::programGetKernelNum;
+      gbe_program_get_kernel_by_name = gbe::programGetKernelByName;
+      gbe_program_get_kernel = gbe::programGetKernel;
+      gbe_kernel_get_name = gbe::kernelGetName;
+      gbe_kernel_get_attributes = gbe::kernelGetAttributes;
+      gbe_kernel_get_code = gbe::kernelGetCode;
+      gbe_kernel_get_code_size = gbe::kernelGetCodeSize;
+      gbe_kernel_get_arg_num = gbe::kernelGetArgNum;
+      gbe_kernel_get_arg_info = gbe::kernelGetArgInfo;
+      gbe_kernel_get_arg_size = gbe::kernelGetArgSize;
+      gbe_kernel_get_arg_bti = gbe::kernelGetArgBTI;
+      gbe_kernel_get_arg_type = gbe::kernelGetArgType;
+      gbe_kernel_get_arg_align = gbe::kernelGetArgAlign;
+      gbe_kernel_get_simd_width = gbe::kernelGetSIMDWidth;
+      gbe_kernel_get_curbe_offset = gbe::kernelGetCurbeOffset;
+      gbe_kernel_get_curbe_size = gbe::kernelGetCurbeSize;
+      gbe_kernel_get_stack_size = gbe::kernelGetStackSize;
+      gbe_kernel_get_scratch_size = gbe::kernelGetScratchSize;
+      gbe_kernel_get_required_work_group_size = gbe::kernelGetRequiredWorkGroupSize;
+      gbe_kernel_use_slm = gbe::kernelUseSLM;
+      gbe_kernel_get_slm_size = gbe::kernelGetSLMSize;
+      gbe_kernel_get_sampler_size = gbe::kernelGetSamplerSize;
+      gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
+      gbe_kernel_get_compile_wg_size = gbe::kernelGetCompileWorkGroupSize;
+      gbe_kernel_get_image_size = gbe::kernelGetImageSize;
+      gbe_kernel_get_image_data = gbe::kernelGetImageData;
+      gbe_get_printf_num = gbe::kernelGetPrintfNum;
+      gbe_get_printf_buf_bti = gbe::kernelGetPrintfBufBTI;
+      gbe_get_printf_indexbuf_bti = gbe::kernelGetPrintfIndexBufBTI;
+      gbe_dup_printfset = gbe::kernelDupPrintfSet;
+      gbe_get_printf_sizeof_size = gbe::kernelGetPrintfSizeOfSize;
+      gbe_release_printf_info = gbe::kernelReleasePrintfSet;
+      gbe_output_printf = gbe::kernelOutputPrintf;
+      genSetupCallBacks();
+    }
+
+    ~CallBackInitializer() {
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR > 3)
+      llvm::llvm_shutdown();
+#endif
+    }
+  };
+
+  static CallBackInitializer cbInitializer;
+} /* namespace gbe */
+#endif
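All gbe_* entry points above are plain function pointers that CallBackInitializer fills in before main() runs, so a runtime linked against (or dlopen'ing) the compiler library can call them directly. A hedged sketch of the expected call sequence on the runtime side; deviceID, clSource and the build option string are placeholders, not values taken from this patch:

    #include <stdio.h>
    #include "backend/program.h"
    /* Hypothetical runtime-side usage; deviceID and clSource come from the caller. */
    void buildAndListKernels(uint32_t deviceID, const char *clSource) {
      char errBuf[4096];
      size_t errSize = 0;
      gbe_program prog = gbe_program_new_from_source(deviceID, clSource, sizeof(errBuf),
                                                     "-cl-fast-relaxed-math", errBuf, &errSize);
      if (prog == NULL) {
        fprintf(stderr, "%.*s\n", (int) errSize, errBuf); /* build log */
        return;
      }
      const uint32_t n = gbe_program_get_kernel_num(prog);
      for (uint32_t i = 0; i < n; ++i) {
        gbe_kernel k = gbe_program_get_kernel(prog, i);
        printf("%s: SIMD%u, %u args\n", gbe_kernel_get_name(k),
               gbe_kernel_get_simd_width(k), gbe_kernel_get_arg_num(k));
      }
      gbe_program_delete(prog);
    }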
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
new file mode 100644
index 0000000..1421993
--- /dev/null
+++ b/backend/src/backend/program.h
@@ -0,0 +1,358 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file program.h
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ * C interface for the Gen kernels and programs (either real Gen ISA or Gen
+ * simulator). This is the only thing the run-time can see from the compiler
+ */
+
+#ifndef __GBE_PROGRAM_H__
+#define __GBE_PROGRAM_H__
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+/*! Opaque structure that interfaces a GBE program */
+typedef struct _gbe_program *gbe_program;
+
+/*! Opaque structure that interfaces a GBE kernel (i.e. one OCL function) */
+typedef struct _gbe_kernel *gbe_kernel;
+
+/*! Argument type for each function call */
+enum gbe_arg_type {
+  GBE_ARG_VALUE = 0,            // int, float and so on
+  GBE_ARG_GLOBAL_PTR = 1,       // __global
+  GBE_ARG_CONSTANT_PTR = 2,     // __constant
+  GBE_ARG_LOCAL_PTR = 3,        // __local
+  GBE_ARG_IMAGE = 4,            // image2d_t, image3d_t
+  GBE_ARG_SAMPLER = 5,          // sampler_t
+  GBE_ARG_INVALID = 0xffffffff
+};
+
+/*! Get argument info values */
+enum gbe_get_arg_info_value {
+  GBE_GET_ARG_INFO_ADDRSPACE = 0,
+  GBE_GET_ARG_INFO_ACCESS = 1,
+  GBE_GET_ARG_INFO_TYPE = 2,
+  GBE_GET_ARG_INFO_TYPEQUAL = 3,
+  GBE_GET_ARG_INFO_NAME = 4,
+  GBE_GET_ARG_INFO_INVALID = 0xffffffff
+};
+
+// BTI magic number
+#define BTI_CONSTANT 0
+#define BTI_PRIVATE 1
+#define BTI_RESERVED_NUM 2
+#define BTI_MAX_IMAGE_NUM 128
+#define BTI_MAX_ID (BTI_MAX_IMAGE_NUM + BTI_RESERVED_NUM - 1)
+
+/*! Constant buffer values (i.e. values to set up in the constant buffer) */
+enum gbe_curbe_type {
+  GBE_CURBE_LOCAL_ID_X = 0,
+  GBE_CURBE_LOCAL_ID_Y,
+  GBE_CURBE_LOCAL_ID_Z,
+  GBE_CURBE_LOCAL_SIZE_X,
+  GBE_CURBE_LOCAL_SIZE_Y,
+  GBE_CURBE_LOCAL_SIZE_Z,
+  GBE_CURBE_GLOBAL_SIZE_X,
+  GBE_CURBE_GLOBAL_SIZE_Y,
+  GBE_CURBE_GLOBAL_SIZE_Z,
+  GBE_CURBE_GLOBAL_OFFSET_X,
+  GBE_CURBE_GLOBAL_OFFSET_Y,
+  GBE_CURBE_GLOBAL_OFFSET_Z,
+  GBE_CURBE_GROUP_NUM_X,
+  GBE_CURBE_GROUP_NUM_Y,
+  GBE_CURBE_GROUP_NUM_Z,
+  GBE_CURBE_WORK_DIM,
+  GBE_CURBE_IMAGE_INFO,
+  GBE_CURBE_STACK_POINTER,
+  GBE_CURBE_PRINTF_BUF_POINTER,
+  GBE_CURBE_PRINTF_INDEX_POINTER,
+  GBE_CURBE_KERNEL_ARGUMENT,
+  GBE_CURBE_EXTRA_ARGUMENT,
+  GBE_CURBE_BLOCK_IP,
+  GBE_CURBE_THREAD_NUM,
+  GBE_CURBE_ZERO,
+  GBE_CURBE_ONE,
+  GBE_CURBE_SLM_OFFSET,
+};
+
+/*! Extra arguments use the negative range of sub-values */
+enum gbe_extra_argument {
+  GBE_STACK_BUFFER = 0,   /* Give stack location in curbe */
+  GBE_CONSTANT_BUFFER = 1 /* constant buffer argument location in curbe */
+};
+
+typedef struct ImageInfo {
+    int32_t arg_idx;
+    int32_t idx;
+    int32_t wSlot;
+    int32_t hSlot;
+    int32_t depthSlot;
+    int32_t dataTypeSlot;
+    int32_t channelOrderSlot;
+    int32_t dimOrderSlot;
+} ImageInfo;
+
+typedef void (gbe_set_image_base_index_cb)(uint32_t base_idx);
+extern gbe_set_image_base_index_cb *gbe_set_image_base_index;
+
+typedef uint32_t (gbe_get_image_base_index_cb)();
+extern gbe_get_image_base_index_cb *gbe_get_image_base_index;
+
+/*! Get the size of defined images */
+typedef size_t (gbe_kernel_get_image_size_cb)(gbe_kernel gbeKernel);
+extern gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size;
+
+/*! Get the content of defined images */
+typedef void (gbe_kernel_get_image_data_cb)(gbe_kernel gbeKernel, ImageInfo *images);
+extern gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data;
+
+/*! Get the number of printf statements */
+typedef uint32_t (gbe_get_printf_num_cb)(void* printf_info);
+extern gbe_get_printf_num_cb *gbe_get_printf_num;
+
+/*! Get the printf buffer bti */
+typedef uint8_t (gbe_get_printf_buf_bti_cb)(void* printf_info);
+extern gbe_get_printf_buf_bti_cb *gbe_get_printf_buf_bti;
+
+typedef uint8_t (gbe_get_printf_indexbuf_bti_cb)(void* printf_info);
+extern gbe_get_printf_indexbuf_bti_cb *gbe_get_printf_indexbuf_bti;
+
+/*! Release the printfset */
+typedef void (gbe_release_printf_info_cb)(void* printf_info);
+extern gbe_release_printf_info_cb *gbe_release_printf_info;
+
+/*! Dup the printf set */
+typedef void* (gbe_dup_printfset_cb)(gbe_kernel gbeKernel);
+extern gbe_dup_printfset_cb *gbe_dup_printfset;
+
+/*! Get the printf buffer const offset */
+typedef uint32_t (gbe_get_printf_sizeof_size_cb)(void* printf_info);
+extern gbe_get_printf_sizeof_size_cb *gbe_get_printf_sizeof_size;
+
+typedef void (gbe_output_printf_cb) (void* printf_info, void* index_addr, void* buf_addr,
+                         size_t global_wk_sz0, size_t global_wk_sz1, size_t global_wk_sz2);
+extern gbe_output_printf_cb* gbe_output_printf;
+
+/*! Create a new program from the given source code (zero terminated string) */
+typedef gbe_program (gbe_program_new_from_source_cb)(uint32_t deviceID,
+                                                     const char *source,
+                                                     size_t stringSize,
+                                                     const char *options,
+                                                     char *err,
+                                                     size_t *err_size);
+extern gbe_program_new_from_source_cb *gbe_program_new_from_source;
+/*! Create a new program from the given source code and compile it (zero terminated string) */
+typedef gbe_program (gbe_program_compile_from_source_cb)(uint32_t deviceID,
+                                                         const char *source,
+                                                         const char *temp_header_path,
+                                                         size_t stringSize,
+                                                         const char *options,
+                                                         char *err,
+                                                         size_t *err_size);
+extern gbe_program_compile_from_source_cb *gbe_program_compile_from_source;
+/*! link the programs. */
+typedef void (gbe_program_link_program_cb)(gbe_program           dst_program,
+                                           gbe_program           src_program,
+                                           size_t                stringSize,
+                                           char *                err,
+                                           size_t *              errSize);
+extern gbe_program_link_program_cb *gbe_program_link_program;
+
+/*! Create a new gen program for linking. */
+typedef gbe_program (gbe_program_new_gen_program_cb)(uint32_t deviceID,
+                                                     const void *module,
+                                                     const void *act);
+extern gbe_program_new_gen_program_cb *gbe_program_new_gen_program;
+
+/*! Create a new program from the given blob */
+typedef gbe_program (gbe_program_new_from_binary_cb)(uint32_t deviceID, const char *binary, size_t size);
+extern gbe_program_new_from_binary_cb *gbe_program_new_from_binary;
+
+/*! Create a new program from LLVM bitcode */
+typedef gbe_program (gbe_program_new_from_llvm_binary_cb)(uint32_t deviceID, const char *binary, size_t size);
+extern gbe_program_new_from_llvm_binary_cb *gbe_program_new_from_llvm_binary;
+
+/*! Serialize a program to a binary; binary_type 0 means executable, 1 means LLVM bitcode */
+typedef size_t (gbe_program_serialize_to_binary_cb)(gbe_program program, char **binary, int binary_type);
+extern gbe_program_serialize_to_binary_cb *gbe_program_serialize_to_binary;
+
+/*! Create a new program from the given LLVM file */
+typedef gbe_program (gbe_program_new_from_llvm_cb)(uint32_t deviceID,
+                                                   const char *fileName,
+                                                   const void *module,
+                                                   const void *llvm_ctx,
+                                                   size_t string_size,
+                                                   char *err,
+                                                   size_t *err_size,
+                                                   int optLevel);
+extern gbe_program_new_from_llvm_cb *gbe_program_new_from_llvm;
+
+/*! Create a new gen program for linking. */
+typedef gbe_program (gbe_program_new_gen_program_cb)(uint32_t deviceID,
+                                                   const void *module,
+                                                   const void *act);
+extern gbe_program_new_gen_program_cb *gbe_program_new_gen_program;
+
+/*! Link the programs at the LLVM level. */
+typedef void (gbe_program_link_from_llvm_cb)(gbe_program dst_program,
+                                             gbe_program src_program,
+                                             size_t      stringSize,
+                                             char *      err,
+                                             size_t *    errSize);
+extern gbe_program_link_from_llvm_cb *gbe_program_link_from_llvm;
+/*! Build the program into a Gen binary */
+typedef void gbe_program_build_from_llvm_cb(gbe_program program,
+                                      size_t stringSize,
+                                      char *err,
+                                      size_t *errSize,
+                                      const char *          options);
+extern gbe_program_build_from_llvm_cb *gbe_program_build_from_llvm;
+
+/*! Get the size of global constants */
+typedef size_t (gbe_program_get_global_constant_size_cb)(gbe_program gbeProgram);
+extern gbe_program_get_global_constant_size_cb *gbe_program_get_global_constant_size;
+
+/*! Get the content of global constants */
+typedef void (gbe_program_get_global_constant_data_cb)(gbe_program gbeProgram, char *mem);
+extern gbe_program_get_global_constant_data_cb *gbe_program_get_global_constant_data;
+
+/*! Get the size of defined samplers */
+typedef size_t (gbe_kernel_get_sampler_size_cb)(gbe_kernel gbeKernel);
+extern gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size;
+
+/*! Get the content of defined samplers */
+typedef void (gbe_kernel_get_sampler_data_cb)(gbe_kernel gbeKernel, uint32_t *samplers);
+extern gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data;
+
+/*! Get the content of defined samplers */
+typedef void (gbe_kernel_get_compile_wg_size_cb)(gbe_kernel gbeKernel, size_t wg_sz[3]);
+extern gbe_kernel_get_compile_wg_size_cb *gbe_kernel_get_compile_wg_size;
+
+/*! Clean LLVM resource of the given program */
+typedef void (gbe_program_clean_llvm_resource_cb)(gbe_program);
+extern gbe_program_clean_llvm_resource_cb *gbe_program_clean_llvm_resource;
+
+/*! Destroy and deallocate the given program */
+typedef void (gbe_program_delete_cb)(gbe_program);
+extern gbe_program_delete_cb *gbe_program_delete;
+
+/*! Get the number of functions in the program */
+typedef uint32_t (gbe_program_get_kernel_num_cb)(gbe_program);
+extern gbe_program_get_kernel_num_cb *gbe_program_get_kernel_num;
+
+/*! Get the kernel from its name */
+typedef gbe_kernel (gbe_program_get_kernel_by_name_cb)(gbe_program, const char *name);
+extern gbe_program_get_kernel_by_name_cb *gbe_program_get_kernel_by_name;
+
+/*! Get the kernel from its ID */
+typedef gbe_kernel (gbe_program_get_kernel_cb)(gbe_program, uint32_t ID);
+extern gbe_program_get_kernel_cb *gbe_program_get_kernel;
+
+/*! Get the kernel name */
+typedef const char *(gbe_kernel_get_name_cb)(gbe_kernel);
+extern gbe_kernel_get_name_cb *gbe_kernel_get_name;
+
+/*! Get the kernel attributes*/
+typedef const char *(gbe_kernel_get_attributes_cb)(gbe_kernel);
+extern gbe_kernel_get_attributes_cb *gbe_kernel_get_attributes;
+
+/*! Get the kernel source code */
+typedef const char *(gbe_kernel_get_code_cb)(gbe_kernel);
+extern gbe_kernel_get_code_cb *gbe_kernel_get_code;
+
+/*! Get the size of the source code */
+typedef size_t (gbe_kernel_get_code_size_cb)(gbe_kernel);
+extern gbe_kernel_get_code_size_cb *gbe_kernel_get_code_size;
+
+/*! Get the total number of arguments */
+typedef uint32_t (gbe_kernel_get_arg_num_cb)(gbe_kernel);
+extern gbe_kernel_get_arg_num_cb *gbe_kernel_get_arg_num;
+
+/*! Get the argument info */
+typedef void* (gbe_kernel_get_arg_info_cb)(gbe_kernel, uint32_t argID, uint32_t value);
+extern gbe_kernel_get_arg_info_cb *gbe_kernel_get_arg_info;
+
+/*! Get the size of the given argument */
+typedef uint32_t (gbe_kernel_get_arg_size_cb)(gbe_kernel, uint32_t argID);
+extern gbe_kernel_get_arg_size_cb *gbe_kernel_get_arg_size;
+
+/*! Get the bti of a __global buffer */
+typedef uint8_t (gbe_kernel_get_arg_bti_cb)(gbe_kernel, uint32_t argID);
+extern gbe_kernel_get_arg_bti_cb *gbe_kernel_get_arg_bti;
+
+/*! Get the type of the given argument */
+typedef enum gbe_arg_type (gbe_kernel_get_arg_type_cb)(gbe_kernel, uint32_t argID);
+extern gbe_kernel_get_arg_type_cb *gbe_kernel_get_arg_type;
+
+/*! Get the align of the given argument */
+typedef uint32_t (gbe_kernel_get_arg_align_cb)(gbe_kernel, uint32_t argID);
+extern gbe_kernel_get_arg_align_cb *gbe_kernel_get_arg_align;
+
+/*! Get the simd width for the kernel */
+typedef uint32_t (gbe_kernel_get_simd_width_cb)(gbe_kernel);
+extern gbe_kernel_get_simd_width_cb *gbe_kernel_get_simd_width;
+
+/*! Get the curbe size required by the kernel */
+typedef int32_t (gbe_kernel_get_curbe_size_cb)(gbe_kernel);
+extern gbe_kernel_get_curbe_size_cb *gbe_kernel_get_curbe_size;
+
+/*! Get the stack size (zero if no stack is required) */
+typedef int32_t (gbe_kernel_get_stack_size_cb)(gbe_kernel);
+extern gbe_kernel_get_stack_size_cb *gbe_kernel_get_stack_size;
+
+/*! Get the scratch size (zero if no scratch is required) */
+typedef int32_t (gbe_kernel_get_scratch_size_cb)(gbe_kernel);
+extern gbe_kernel_get_scratch_size_cb *gbe_kernel_get_scratch_size;
+
+/*! Get the curbe offset where to put the data. Returns -1 if not required */
+typedef int32_t (gbe_kernel_get_curbe_offset_cb)(gbe_kernel, enum gbe_curbe_type type, uint32_t sub_type);
+extern gbe_kernel_get_curbe_offset_cb *gbe_kernel_get_curbe_offset;
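gbe_kernel_get_curbe_offset tells the runtime where, inside the curbe buffer it allocates (gbe_kernel_get_curbe_size bytes), each value has to be written before a launch. A sketch of patching the local sizes; the helper name and the assumption that the values are 32-bit are illustrative, not taken from this header:

    #include <string.h>
    #include "backend/program.h"
    /* Hypothetical runtime-side helper: "curbe" points to a buffer of
       gbe_kernel_get_curbe_size(kernel) bytes. */
    static void patch_local_size(gbe_kernel kernel, char *curbe, const uint32_t local_size[3]) {
      static const enum gbe_curbe_type types[3] =
        { GBE_CURBE_LOCAL_SIZE_X, GBE_CURBE_LOCAL_SIZE_Y, GBE_CURBE_LOCAL_SIZE_Z };
      for (int i = 0; i < 3; ++i) {
        int32_t off = gbe_kernel_get_curbe_offset(kernel, types[i], 0);
        if (off >= 0) /* a negative offset means the kernel does not need this value */
          memcpy(curbe + off, &local_size[i], sizeof(uint32_t));
      }
    }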
+
+/*! Indicates if a work group size is required. Return the required width or 0
+ *  if none
+ */
+typedef uint32_t (gbe_kernel_get_required_work_group_size_cb)(gbe_kernel, uint32_t dim);
+extern gbe_kernel_get_required_work_group_size_cb *gbe_kernel_get_required_work_group_size;
+
+/*! Says if SLM is used. Required to reconfigure the L3 complex */
+typedef int32_t (gbe_kernel_use_slm_cb)(gbe_kernel);
+extern gbe_kernel_use_slm_cb *gbe_kernel_use_slm;
+/*! Get slm size needed for kernel local variables */
+typedef int32_t (gbe_kernel_get_slm_size_cb)(gbe_kernel);
+extern gbe_kernel_get_slm_size_cb *gbe_kernel_get_slm_size;
+
+/*! Lock/unlock helpers guarding global LLVM context access. */
+extern void acquireLLVMContextLock();
+extern void releaseLLVMContextLock();
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* __GBE_PROGRAM_H__ */
+
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
new file mode 100644
index 0000000..56f60af
--- /dev/null
+++ b/backend/src/backend/program.hpp
@@ -0,0 +1,320 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file program.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_PROGRAM_HPP__
+#define __GBE_PROGRAM_HPP__
+
+#include "backend/program.h"
+#include "backend/context.hpp"
+#include "ir/constant.hpp"
+#include "ir/unit.hpp"
+#include "ir/function.hpp"
+#include "ir/printf.hpp"
+#include "ir/sampler.hpp"
+#include "sys/hash_map.hpp"
+#include "sys/vector.hpp"
+#include <string>
+
+namespace gbe {
+namespace ir {
+  class Unit; // Compilation unit. Contains the program to compile
+} /* namespace ir */
+} /* namespace gbe */
+
+namespace gbe {
+
+  /*! Info for the kernel argument */
+  struct KernelArgument {
+    gbe_arg_type type; //!< Pointer, structure, image, regular value?
+    uint32_t size;     //!< Size of the argument
+    uint32_t align;    //!< addr alignment of the argument
+    uint8_t bti;      //!< binding table index for __global buffer
+    ir::FunctionArgument::InfoFromLLVM info;
+  };
+
+  /*! Stores what to patch and the offset where to patch it */
+  struct PatchInfo {
+    INLINE PatchInfo(gbe_curbe_type type, uint32_t subType = 0u, uint32_t offset = 0u) :
+      type(uint32_t(type)), subType(subType), offset(offset) {}
+    INLINE PatchInfo(void) {}
+    uint64_t type : 16;    //!< Type of the patch (see program.h for the list)
+    uint64_t subType : 32; //!< Optional sub-type of the patch (see program.h)
+    uint64_t offset : 16; //!< Optional offset to encode
+  };
+
+  /*! PatchInfo entries are sorted so that lookups can use binary search */
+  INLINE bool operator< (PatchInfo i0, PatchInfo i1) {
+    if (i0.type != i1.type) return i0.type < i1.type;
+    return i0.subType < i1.subType;
+  }
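Since the patches vector is kept sorted with the comparison above, a curbe-offset lookup reduces to a binary search on (type, subType). The function below is only a sketch of such a lookup, not necessarily the body of Kernel::getCurbeOffset; it assumes <algorithm> is available for std::lower_bound:

    // Sketch: binary search over a sorted vector<PatchInfo>.
    inline int32_t lookupCurbeOffset(const vector<PatchInfo> &patches,
                                     gbe_curbe_type type, uint32_t subType) {
      const PatchInfo key(type, subType);
      auto it = std::lower_bound(patches.begin(), patches.end(), key);
      if (it == patches.end() || it->type != key.type || it->subType != key.subType)
        return -1; // the kernel does not need this value
      return int32_t(it->offset);
    }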
+
+  /*! Describe a compiled kernel */
+  class Kernel : public NonCopyable, public Serializable
+  {
+  public:
+    /*! Create an empty kernel with the given name */
+    Kernel(const std::string &name);
+    /*! Destroy it */
+    virtual ~Kernel(void);
+    /*! Return the instruction stream (to be implemented) */
+    virtual const char *getCode(void) const = 0;
+    /*! Set the instruction stream.*/
+    virtual const void setCode(const char *, size_t size) = 0;
+    /*! Return the instruction stream size (to be implemented) */
+    virtual size_t getCodeSize(void) const = 0;
+    /*! Get the kernel name */
+    INLINE const char *getName(void) const { return name.c_str(); }
+    /*! Return the number of arguments for the kernel call */
+    INLINE uint32_t getArgNum(void) const { return argNum; }
+    /*! Return the size of the given argument */
+    INLINE uint32_t getArgSize(uint32_t argID) const {
+      return argID >= argNum ? 0u : args[argID].size;
+    }
+    /*! Return the bti for __global buffer */
+    INLINE uint8_t getArgBTI(uint32_t argID) const {
+      return argID >= argNum ? 0u : args[argID].bti;
+    }
+    /*! Return the alignment of buffer argument */
+    INLINE uint32_t getArgAlign(uint32_t argID) const {
+      return argID >= argNum ? 0u : args[argID].align;
+    }
+    /*! Return the type of the given argument */
+    INLINE gbe_arg_type getArgType(uint32_t argID) const {
+      return argID >= argNum ? GBE_ARG_INVALID : args[argID].type;
+    }
+    /*! Get the offset where to patch. Returns -1 if no patch needed */
+    int32_t getCurbeOffset(gbe_curbe_type type, uint32_t subType) const;
+    /*! Get the curbe size required by the kernel */
+    INLINE uint32_t getCurbeSize(void) const { return this->curbeSize; }
+    /*! Return the size of the stack (zero if none) */
+    INLINE uint32_t getStackSize(void) const { return this->stackSize; }
+    /*! Return the size of the scratch memory needed (zero if none) */
+    INLINE uint32_t getScratchSize(void) const { return this->scratchSize; }
+    /*! Get the SIMD width for the kernel */
+    INLINE uint32_t getSIMDWidth(void) const { return this->simdWidth; }
+    /*! Says if SLM is needed for it */
+    INLINE bool getUseSLM(void) const { return this->useSLM; }
+    /*! get slm size for kernel local variable */
+    INLINE uint32_t getSLMSize(void) const { return this->slmSize; }
+    /*! Set sampler set. */
+    void setSamplerSet(ir::SamplerSet *from) {
+      samplerSet = from;
+    }
+    /*! Get defined sampler size */
+    size_t getSamplerSize(void) const { return (samplerSet == NULL ? 0 : samplerSet->getDataSize()); }
+    /*! Get defined sampler value array */
+    void getSamplerData(uint32_t *samplers) const { samplerSet->getData(samplers); }
+    /*! Set image set. */
+    void setImageSet(ir::ImageSet * from) {
+      imageSet = from;
+    }
+    /*! Set printf set. */
+    void setPrintfSet(ir::PrintfSet * from) {
+      printfSet = from;
+    }
+    /* ! Return the offset in the sizeof(xxx). */
+    uint32_t getPrintfSizeOfSize(void) const {
+      return printfSet ? printfSet->getPrintfSizeOfSize() : 0;
+    }
+    uint32_t getPrintfNum() const {
+      return printfSet ? printfSet->getPrintfNum() : 0;
+    }
+
+    void * dupPrintfSet() const {
+      void* ptr = printfSet ? (void *)(new ir::PrintfSet(*printfSet)) : NULL;
+      return ptr;
+    }
+    uint8_t getPrintfBufBTI() const {
+      GBE_ASSERT(printfSet);
+      return printfSet->getBufBTI();
+    }
+
+    uint8_t getPrintfIndexBufBTI() const {
+      GBE_ASSERT(printfSet);
+      return printfSet->getIndexBufBTI();
+    }
+
+    void outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0,
+                      size_t global_wk_sz1, size_t global_wk_sz2) {
+      if(printfSet)
+        printfSet->outputPrintf(index_addr, buf_addr, global_wk_sz0,
+                                global_wk_sz1, global_wk_sz2);
+    }
+
+    ir::FunctionArgument::InfoFromLLVM* getArgInfo(uint32_t id) const { return &args[id].info; }
+
+    /*! Set compile work group size */
+    void setCompileWorkGroupSize(const size_t wg_sz[3]) {
+       compileWgSize[0] = wg_sz[0];
+       compileWgSize[1] = wg_sz[1];
+       compileWgSize[2] = wg_sz[2];
+    }
+    /*! Get compile work group size */
+    void getCompileWorkGroupSize (size_t wg_sz[3]) const {
+       wg_sz[0] = compileWgSize[0];
+       wg_sz[1] = compileWgSize[1];
+       wg_sz[2] = compileWgSize[2];
+    }
+    /*! Set function attributes string. */
+    void setFunctionAttributes(const std::string& functionAttributes) {  this->functionAttributes= functionAttributes; }
+    /*! Get function attributes string. */
+    const char* getFunctionAttributes(void) const {return this->functionAttributes.c_str();}
+
+    /*! Get defined image size */
+    size_t getImageSize(void) const { return (imageSet == NULL ? 0 : imageSet->getDataSize()); }
+    /*! Get defined image value array */
+    void getImageData(ImageInfo *images) const { imageSet->getData(images); }
+
+    static const uint32_t magic_begin = TO_MAGIC('K', 'E', 'R', 'N');
+    static const uint32_t magic_end = TO_MAGIC('N', 'R', 'E', 'K');
+
+    /* format:
+       magic_begin       |
+       name_size         |
+       name              |
+       arg_num           |
+       args              |
+       PatchInfo_num     |
+       PatchInfo         |
+       curbeSize         |
+       simdWidth         |
+       stackSize         |
+       scratchSize       |
+       useSLM            |
+       slmSize           |
+       samplers          |
+       images            |
+       code_size         |
+       code              |
+       magic_end
+    */
+
+    /*! Implements the serialization. */
+    virtual size_t serializeToBin(std::ostream& outs);
+    virtual size_t deserializeFromBin(std::istream& ins);
+    virtual void printStatus(int indent, std::ostream& outs);
+
+  protected:
+    friend class Context;      //!< Owns the kernels
+    friend class GenContext;
+    std::string name;    //!< Kernel name
+    KernelArgument *args;      //!< Each argument
+    vector<PatchInfo> patches; //!< Indicates how to build the curbe
+    uint32_t argNum;           //!< Number of function arguments
+    uint32_t curbeSize;        //!< Size of the data to push
+    uint32_t simdWidth;        //!< SIMD size for the kernel (lane number)
+    uint32_t stackSize;        //!< Stack size (may be 0 if unused)
+    uint32_t scratchSize;      //!< Scratch memory size (may be 0 if unused)
+    bool useSLM;               //!< SLM requires a special HW config
+    uint32_t slmSize;          //!< slm size for kernel variable
+    Context *ctx;              //!< Context saved after compilation, used to allocate the constant buffer curbe
+    ir::SamplerSet *samplerSet;//!< Copy from the corresponding function.
+    ir::ImageSet *imageSet;    //!< Copy from the corresponding function.
+    ir::PrintfSet *printfSet;  //!< Copy from the corresponding function.
+    size_t compileWgSize[3];   //!< required work group size by kernel attribute.
+    std::string functionAttributes; //!< function attribute qualifiers combined.
+    GBE_CLASS(Kernel);         //!< Use custom allocators
+  };
+
+  /*! Describe a compiled program */
+  class Program : public NonCopyable, public Serializable
+  {
+  public:
+    /*! Create an empty program */
+    Program(void);
+    /*! Destroy the program */
+    virtual ~Program(void);
+    /*! Clean LLVM resource of the program */
+    virtual void CleanLlvmResource() = 0;
+    /*! Get the number of kernels in the program */
+    uint32_t getKernelNum(void) const { return kernels.size(); }
+    /*! Get the kernel from its name */
+    Kernel *getKernel(const std::string &name) const {
+      auto it = kernels.find(name);
+      if (it == kernels.end())
+        return NULL;
+      else
+        return it->second;
+    }
+    /*! Get the kernel from its ID */
+    Kernel *getKernel(uint32_t ID) const {
+      uint32_t currID = 0;
+      Kernel *kernel = NULL;
+      for (const auto &pair : kernels) {
+        if (currID == ID) {
+          kernel = pair.second;
+          break;
+        }
+        currID++;
+      }
+      return kernel;
+    }
+    /*! Build a program from an ir::Unit */
+    bool buildFromUnit(const ir::Unit &unit, std::string &error);
+    /*! Build a program from an LLVM file or module */
+    bool buildFromLLVMFile(const char *fileName, const void* module, std::string &error, int optLevel);
+    /*! Build a program from an OCL source string */
+    bool buildFromSource(const char *source, std::string &error);
+    /*! Get size of the global constant arrays */
+    size_t getGlobalConstantSize(void) const { return constantSet->getDataSize(); }
+    /*! Get the content of global constant arrays */
+    void getGlobalConstantData(char *mem) const { constantSet->getData(mem); }
+
+    static const uint32_t magic_begin = TO_MAGIC('P', 'R', 'O', 'G');
+    static const uint32_t magic_end = TO_MAGIC('G', 'O', 'R', 'P');
+
+    /* format:
+       magic_begin       |
+       constantSet_flag  |
+       constSet_data     |
+       kernel_num        |
+       kernel_1          |
+       ........          |
+       kernel_n          |
+       magic_end         |
+       total_size
+    */
+
+    /*! Implements the serialization. */
+    virtual size_t serializeToBin(std::ostream& outs);
+    virtual size_t deserializeFromBin(std::istream& ins);
+    virtual void printStatus(int indent, std::ostream& outs);
+
+  protected:
+    /*! Compile a kernel */
+    virtual Kernel *compileKernel(const ir::Unit &unit, const std::string &name, bool relaxMath) = 0;
+    /*! Allocate an empty kernel. */
+    virtual Kernel *allocateKernel(const std::string &name) = 0;
+    /*! Kernels sorted by their name */
+    hash_map<std::string, Kernel*> kernels;
+    /*! Global (constants) outside any kernel */
+    ir::ConstantSet *constantSet;
+    /*! Use custom allocators */
+    GBE_CLASS(Program);
+  };
+
+} /* namespace gbe */
+
+#endif /* __GBE_PROGRAM_HPP__ */
+
diff --git a/backend/src/builtin_vector_proto.def b/backend/src/builtin_vector_proto.def
new file mode 100644
index 0000000..18d23ca
--- /dev/null
+++ b/backend/src/builtin_vector_proto.def
@@ -0,0 +1,295 @@
+##math
+gentype acos (gentype)
+gentype acosh (gentype)
+gentype acospi (gentype x)
+gentype asin (gentype)
+gentype asinh (gentype)
+gentype asinpi (gentype x)
+gentype atan (gentype y_over_x)
+gentype atan2 (gentype y, gentype x)
+gentype atanh (gentype)
+gentype atanpi (gentype x)
+gentype atan2pi (gentype y, gentype x)
+gentype cbrt (gentype)
+gentype ceil (gentype)
+gentype copysign (gentype x, gentype y)
+gentype cos (gentype)
+gentype cosh (gentype)
+gentype cospi (gentype x)
+gentype erfc (gentype)
+gentype erf (gentype)
+gentype exp (gentype x)
+gentype exp2 (gentype)
+gentype exp10 (gentype)
+gentype expm1 (gentype x)
+gentype fabs (gentype)
+gentype fdim (gentype x, gentype y)
+gentype floor (gentype)
+# XXX we use madd for fma
+#gentype fma (gentype a, gentype b, gentype c)
+gentype fmax (gentype x, gentype y)
+gentypef fmax (gentypef x, float y)
+gentyped fmax (gentyped x, double y)
+gentype fmin (gentype x, gentype y)
+gentypef fmin (gentypef x, float y)
+gentyped fmin (gentyped x, double y)
+gentype fmod (gentype x, gentype y)
+gentype fract (gentype x, __global gentype *iptr)
+gentype fract (gentype x, __local gentype *iptr)
+gentype fract (gentype x, __private gentype *iptr)
+floatn frexp (floatn x, __global intn *exp)
+floatn frexp (floatn x, __local intn *exp)
+floatn frexp (floatn x, __private intn *exp)
+float frexp (float x, __global int *exp)
+float frexp (float x, __local int *exp)
+float frexp (float x, __private int *exp)
+doublen frexp (doublen x, __global intn *exp)
+doublen frexp (doublen x, __local intn *exp)
+doublen frexp (doublen x, __private intn *exp)
+double frexp (double x, __global int *exp)
+double frexp (double x, __local int *exp)
+double frexp (double x, __private int *exp)
+gentype hypot (gentype x, gentype y)
+intn ilogb (floatn x)
+int ilogb (float x)
+intn ilogb (doublen x)
+int ilogb (double x)
+floatn ldexp (floatn x, intn k)
+floatn ldexp (floatn x, int k)
+float ldexp (float x, int k)
+doublen ldexp (doublen x, intn k)
+doublen ldexp (doublen x, int k)
+double ldexp (double x, int k)
+gentype lgamma (gentype x)
+floatn lgamma_r (floatn x, __global intn *signp)
+floatn lgamma_r (floatn x, __local intn *signp)
+floatn lgamma_r (floatn x, __private intn *signp)
+float lgamma_r (float x, __global int *signp)
+float lgamma_r (float x, __local int *signp)
+float lgamma_r (float x,   __private int *signp)
+#doublen lgamma_r (doublen x, __global intn *signp)
+#doublen lgamma_r (doublen x, __local intn *signp)
+#doublen lgamma_r (doublen x, __private intn *signp)
+#double lgamma_r (double x, __global int *signp)
+#double lgamma_r (double x, __local int *signp)
+#double lgamma_r (double x, __private int *signp)
+gentype log (gentype)
+gentype log2 (gentype)
+gentype log10 (gentype)
+gentype log1p (gentype x)
+gentype logb (gentype x)
+gentype mad (gentype a, gentype b, gentype c)
+gentype maxmag (gentype x, gentype y)
+gentype minmag (gentype x, gentype y)
+gentype modf (gentype x, __global gentype *iptr)
+gentype modf (gentype x, __local gentype *iptr)
+gentype modf (gentype x, __private gentype *iptr)
+floatn nan (uintn nancode)
+float nan (uint nancode)
+doublen nan (ulongn nancode)
+double nan (ulong nancode)
+gentype nextafter (gentype x, gentype y)
+gentype pow (gentype x, gentype y)
+floatn pown (floatn x, intn y)
+float pown (float x, int y)
+doublen pown (doublen x, intn y)
+double pown (double x, int y)
+#XXX we define powr as pow
+#gentype powr (gentype x, gentype y)
+gentype remainder (gentype x, gentype y)
+floatn remquo (floatn x, floatn y, __global intn *quo)
+floatn remquo (floatn x, floatn y, __local intn *quo)
+floatn remquo (floatn x, floatn y, __private intn *quo)
+float remquo (float x, float y, __global int *quo)
+float remquo (float x, float y, __local int *quo)
+float remquo (float x, float y, __private int *quo)
+doublen remquo (doublen x, doublen y, __global intn *quo)
+doublen remquo (doublen x, doublen y, __local intn *quo)
+doublen remquo (doublen x, doublen y, __private intn *quo)
+double remquo (double x, double y, __global int *quo)
+double remquo (double x, double y, __local int *quo)
+double remquo (double x, double y, __private int *quo)
+gentype rint (gentype)
+floatn rootn (floatn x, intn y)
+
+doublen rootn (doublen x, intn y)
+double rootn (double x, int y)
+gentype round (gentype x)
+gentype rsqrt (gentype)
+gentype sin (gentype)
+gentype sincos (gentype x, __global gentype *cosval)
+gentype sincos (gentype x, __local gentype *cosval)
+gentype sincos (gentype x, __private gentype *cosval)
+gentype sinh (gentype)
+gentype sinpi (gentype x)
+gentype sqrt (gentype)
+gentype tan (gentype)
+gentype tanh (gentype)
+gentype tanpi (gentype x)
+gentype tgamma (gentype)
+gentype trunc (gentype)
+
+##math function fast path
+gentype __gen_ocl_internal_fastpath_acosh (gentype x)
+gentype __gen_ocl_internal_fastpath_asinh (gentype x)
+gentype __gen_ocl_internal_fastpath_atanh (gentype x)
+gentype __gen_ocl_internal_fastpath_cbrt (gentype x)
+gentype __gen_ocl_internal_fastpath_cos (gentype x)
+gentype __gen_ocl_internal_fastpath_cosh (gentype x)
+gentype __gen_ocl_internal_fastpath_cospi (gentype x)
+gentype __gen_ocl_internal_fastpath_exp (gentype x)
+gentype __gen_ocl_internal_fastpath_exp10 (gentype x)
+gentype __gen_ocl_internal_fastpath_expm1 (gentype x)
+gentype __gen_ocl_internal_fastpath_fmod (gentype x, gentype y)
+gentype __gen_ocl_internal_fastpath_hypot (gentype x, gentype y)
+intn __gen_ocl_internal_fastpath_ilogb (floatn x)
+int __gen_ocl_internal_fastpath_ilogb (float x)
+intn __gen_ocl_internal_fastpath_ilogb (doublen x)
+int __gen_ocl_internal_fastpath_ilogb (double x)
+floatn __gen_ocl_internal_fastpath_ldexp (floatn x, intn k)
+floatn __gen_ocl_internal_fastpath_ldexp (floatn x, int k)
+float __gen_ocl_internal_fastpath_ldexp (float x, int k)
+doublen __gen_ocl_internal_fastpath_ldexp (doublen x, intn k)
+doublen __gen_ocl_internal_fastpath_ldexp (doublen x, int k)
+double __gen_ocl_internal_fastpath_ldexp (double x, int k)
+gentype __gen_ocl_internal_fastpath_log (gentype x)
+gentype __gen_ocl_internal_fastpath_log2 (gentype x)
+gentype __gen_ocl_internal_fastpath_log10 (gentype x)
+gentype __gen_ocl_internal_fastpath_log1p (gentype x)
+gentype __gen_ocl_internal_fastpath_logb (gentype x)
+gentype __gen_ocl_internal_fastpath_remainder (gentype x, gentype y)
+floatn __gen_ocl_internal_fastpath_rootn (floatn x, intn k)
+gentype __gen_ocl_internal_fastpath_sin (gentype x)
+gentype __gen_ocl_internal_fastpath_sincos (gentype x, __global gentype *cosval)
+gentype __gen_ocl_internal_fastpath_sincos (gentype x, __local gentype *cosval)
+gentype __gen_ocl_internal_fastpath_sincos (gentype x, __private gentype *cosval)
+gentype __gen_ocl_internal_fastpath_sinh (gentype x)
+gentype __gen_ocl_internal_fastpath_sinpi (gentype x)
+gentype __gen_ocl_internal_fastpath_tan (gentype x)
+gentype __gen_ocl_internal_fastpath_tanh (gentype x)
+
+##half_native_math
+#gentype half_cos (gentype x)
+#gentype half_divide (gentype x, gentype y)
+#gentype half_exp (gentype x)
+#gentype half_exp2 (gentype x)
+#gentype half_exp10 (gentype x)
+#gentype half_log (gentype x)
+#gentype half_log2 (gentype x)
+#gentype half_log10 (gentype x)
+#gentype half_powr (gentype x, gentype y)
+#gentype half_recip (gentype x)
+#gentype half_rsqrt (gentype x)
+#gentype half_sin (gentype x)
+#gentype half_sqrt (gentype x)
+#gentype half_tan (gentype x)
+
+# XXX native and non-native functions are already defined
+# to map to the same implementation.
+gentype native_cos (gentype x)
+gentype native_divide (gentype x, gentype y)
+gentype native_exp (gentype x)
+#gentype native_exp2 (gentype x)
+gentype native_exp10 (gentype x)
+gentype native_log (gentype x)
+gentype native_log2 (gentype x)
+gentype native_log10 (gentype x)
+gentype native_powr (gentype x, gentype y)
+gentype native_recip (gentype x)
+gentype native_rsqrt (gentype x)
+gentype native_sin (gentype x)
+#gentype native_sqrt (gentype x)
+gentype native_tan (gentype x)
+
+##integer
+ugentype abs (gentype x)
+ugentype abs_diff (gentype x, gentype y)
+gentype add_sat (gentype x,  gentype y)
+gentype hadd (gentype x,  gentype y)
+gentype rhadd (gentype x, gentype y)
+gentype clamp (gentype x, gentype minval, gentype maxval)
+gentype clamp (gentype x, sgentype minval, sgentype maxval)
+gentype clz (gentype x)
+gentype mad_hi (gentype a, gentype b, gentype c)
+gentype mad_sat (gentype a, gentype b, gentype c)
+gentype max (gentype x,  gentype y)
+gentype max (gentype x,  sgentype y)
+gentype min (gentype x,  gentype y)
+gentype min (gentype x,  sgentype y)
+gentype mul_hi (gentype x,  gentype y)
+gentype rotate (gentype v,  gentype i)
+gentype sub_sat (gentype x,  gentype y)
+shortn upsample (charn hi, ucharn lo)
+ushortn upsample (ucharn hi, ucharn lo)
+intn upsample (shortn hi, ushortn lo)
+uintn upsample (ushortn hi, ushortn lo)
+longn upsample (intn hi, uintn lo)
+ulongn upsample (uintn hi, uintn lo)
+# XXX not implemented
+#gentype popcount (gentype x)
+
+##fast_integer
+gentype mad24 (gentype x, gentype y, gentype z)
+gentype mul24 (gentype x, gentype y)
+
+##common
+gentype clamp (gentype x, gentype minval, gentype maxval)
+gentypef clamp (gentypef x, float minval, float maxval)
+gentyped clamp (gentyped x, double minval, double maxval)
+gentype degrees (gentype radians)
+gentype max (gentype x,  gentype y)
+gentypef max (gentypef x, float y)
+gentyped max (gentyped x, double y)
+gentype min (gentype x,  gentype y)
+gentypef min (gentypef x,  float y)
+gentyped min (gentyped x,  double y)
+gentype mix (gentype x, gentype y, gentype a)
+gentypef mix (gentypef x, gentypef y, float a)
+gentyped mix (gentyped x, gentyped y, double a)
+gentype radians (gentype degrees)
+gentype step (gentype edge, gentype x)
+gentypef step (float edge, gentypef x)
+gentyped step (double edge, gentyped x)
+gentype smoothstep (gentype edge0, gentype edge1, gentype x)
+gentypef smoothstep (float edge0, float edge1, gentypef x)
+gentyped smoothstep (double edge0, double edge1, gentyped x)
+gentype sign (gentype x)
+
+##relational
+intn isequal (floatn x, floatn y)
+longn isequal (doublen x, doublen y)
+intn isnotequal (floatn x, floatn y)
+longn isnotequal (doublen x, doublen y)
+intn isgreater (floatn x, floatn y)
+longn isgreater (doublen x, doublen y)
+intn isgreaterequal (floatn x, floatn y)
+longn isgreaterequal (doublen x, doublen y)
+intn isless (floatn x, floatn y)
+longn isless (doublen x, doublen y)
+intn islessequal (floatn x, floatn y)
+longn islessequal (doublen x, doublen y)
+intn islessgreater (floatn x, floatn y)
+longn islessgreater (doublen x, doublen y)
+intn isfinite (floatn)
+longn isfinite (doublen)
+intn isinf (floatn)
+longn isinf (doublen)
+intn isnan (floatn)
+longn isnan (doublen)
+intn isnormal (floatn)
+longn isnormal (doublen)
+intn isordered (floatn x, floatn y)
+longn isordered (doublen x, doublen y)
+intn isunordered (floatn x, floatn y)
+longn isunordered (doublen x, doublen y)
+intn signbit (floatn)
+longn signbit (doublen)
+int any (igentype x)
+int all (igentype x)
+gentype bitselect (gentype a, gentype b, gentype c)
+gentype select (gentype a, gentype b, igentype c)
+gentype select (gentype a, gentype b, ugentype c)
+
+##misc
+#gentypen shuffle (gentypem x, ugentypen mask)
+#gentypen shuffle2 (gentypem x, gentypem y, ugentypen mask)
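Each non-comment line in this .def file is a generic prototype: gentype stands for the scalar type and its 2/3/4/8/16-element vector forms, gentypef/gentyped restrict that to the float/double families, and lines starting with # are disabled. As an illustration (not part of the file), a single entry such as `gentype mad (gentype a, gentype b, gentype c)` stands for overloads along these lines:

    float    mad (float a,    float b,    float c);
    float2   mad (float2 a,   float2 b,   float2 c);
    float3   mad (float3 a,   float3 b,   float3 c);
    float4   mad (float4 a,   float4 b,   float4 c);
    float8   mad (float8 a,   float8 b,   float8 c);
    float16  mad (float16 a,  float16 b,  float16 c);
    /* ...plus the corresponding double/doublen forms where doubles are enabled. */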
diff --git a/backend/src/gbe_bin_generater.cpp b/backend/src/gbe_bin_generater.cpp
new file mode 100644
index 0000000..79e3935
--- /dev/null
+++ b/backend/src/gbe_bin_generater.cpp
@@ -0,0 +1,437 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/*******************************************************************************
+   This file is used to generate the gbe kernel binaries. These binaries may be
+   used by the CL API, e.g. for built-in kernels such as memory enqueue
+   operations. We generate the binaries at build time to improve performance.
+ *******************************************************************************/
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <fstream>
+#include <deque>
+#include <vector>
+#include <algorithm>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "backend/program.h"
+#include "backend/program.hpp"
+#include "backend/src/sys/platform.hpp"
+#include "src/cl_device_data.h"
+
+using namespace std;
+
+#define FILE_NOT_FIND_ERR 1
+#define FILE_MAP_ERR 2
+#define FILE_BUILD_FAILED 3
+#define FILE_SERIALIZATION_FAILED 4
+
+static uint32_t gen_pci_id = 0;
+
+class program_build_instance {
+
+protected:
+    string prog_path;
+    string build_opt;
+    static string bin_path;
+    static bool str_fmt_out;
+    int fd;
+    int file_len;
+    const char* code;
+    gbe::Program* gbe_prog;
+
+public:
+    program_build_instance (void) : fd(-1), file_len(0), code(NULL), gbe_prog(NULL) { }
+    explicit program_build_instance (const char* file_path, const char* option = NULL)
+        : prog_path(file_path), build_opt(option), fd(-1), file_len(0),
+          code(NULL), gbe_prog(NULL) { }
+
+    ~program_build_instance () {
+        if (code) {
+            munmap((void *)(code), file_len);
+            code = NULL;
+        }
+
+        if (fd >= 0)
+            close(fd);
+
+        if (gbe_prog)
+            gbe_program_delete(reinterpret_cast<gbe_program>(gbe_prog));
+    }
+
+    program_build_instance(program_build_instance&& other) = default;
+#if 0
+    {
+#define SWAP(ELT) \
+	do { \
+	    auto elt = this->ELT; \
+	    this->ELT = other.ELT; \
+	    other.ELT = elt; \
+	} while(0)
+
+        SWAP(fd);
+        SWAP(code);
+        SWAP(file_len);
+        SWAP(prog_path);
+        SWAP(build_opt);
+#undef SWAP
+    }
+#endif
+
+    explicit program_build_instance(const program_build_instance& other) = delete;
+    program_build_instance& operator= (const program_build_instance& other) {
+        /* We do not want to be lvalue copied, but the operator is needed to
+           instantiate the vector<program_build_instance> template. */
+        assert(1);
+        return *this;
+    }
+
+    const char* file_map_open (void) throw (int);
+
+    const char* get_code (void) {
+        return code;
+    }
+
+    const string& get_program_path (void) {
+        return prog_path;
+    }
+
+    int get_size (void) {
+        return file_len;
+    }
+
+    void print_file (void) {
+        cout << code << endl;
+    }
+
+    void dump (void) {
+        cout << "program path: " << prog_path << endl;
+        cout << "Build option: " << build_opt << endl;
+        print_file();
+    }
+
+    static void set_str_fmt_out (bool flag) {
+        str_fmt_out = flag;
+    }
+
+    static int set_bin_path (const char* path) {
+        if (bin_path.size())
+            return 0;
+
+        bin_path = path;
+        return 1;
+    }
+
+    void build_program(void) throw(int);
+    void serialize_program(void) throw(int);
+};
+
+string program_build_instance::bin_path;
+bool program_build_instance::str_fmt_out = false;
+#define OUTS_UPDATE_SZ(elt) SERIALIZE_OUT(elt, oss, header_sz)
+#define OUTF_UPDATE_SZ(elt) SERIALIZE_OUT(elt, ofs, header_sz)
+
+void program_build_instance::serialize_program(void) throw(int)
+{
+    ofstream ofs;
+    ostringstream oss;
+    size_t sz = 0, header_sz = 0;
+    ofs.open(bin_path, ofstream::out | ofstream::trunc | ofstream::binary);
+
+    char src_hw_info[4]="";
+    if(IS_IVYBRIDGE(gen_pci_id)){
+      src_hw_info[0]='I';
+      src_hw_info[1]='V';
+      src_hw_info[2]='B';
+      if(IS_BAYTRAIL_T(gen_pci_id)){
+        src_hw_info[0]='B';
+        src_hw_info[1]='Y';
+        src_hw_info[2]='T';
+      }
+    }else if(IS_HASWELL(gen_pci_id)){
+        src_hw_info[0]='H';
+        src_hw_info[1]='S';
+        src_hw_info[2]='W';
+    }
+
+    if (str_fmt_out) {
+
+      if(gen_pci_id){
+        // Add a header to differentiate it from an LLVM bitcode binary.
+        // (5 bytes: 1 byte for the binary type, 4 bytes for the bc code; 'GENC' marks a gen binary.)
+        char gen_header[6] = "\0GENC";
+        OUTS_UPDATE_SZ(gen_header[0]);
+        OUTS_UPDATE_SZ(gen_header[1]);
+        OUTS_UPDATE_SZ(gen_header[2]);
+        OUTS_UPDATE_SZ(gen_header[3]);
+        OUTS_UPDATE_SZ(gen_header[4]);
+        OUTS_UPDATE_SZ(src_hw_info[0]);
+        OUTS_UPDATE_SZ(src_hw_info[1]);
+        OUTS_UPDATE_SZ(src_hw_info[2]);
+      }
+
+      string array_name = "Unknown_name_array";
+      unsigned long last_slash = bin_path.rfind("/");
+      unsigned long last_dot = bin_path.rfind(".");
+
+      if (last_slash != string::npos &&  last_dot != string::npos)
+        array_name = bin_path.substr(last_slash + 1, last_dot - 1 - last_slash);
+
+      ofs << "#include <stddef.h>" << "\n";
+      ofs << "char " << array_name << "[] = {" << "\n";
+
+      if(gen_pci_id){
+        sz = gbe_prog->serializeToBin(oss);
+        sz += header_sz;
+      }else{
+        char *llvm_binary;
+        size_t bin_length = gbe_program_serialize_to_binary((gbe_program)gbe_prog, &llvm_binary, 1);
+        oss.write(llvm_binary, bin_length);
+        sz += bin_length;
+      }
+
+      for (size_t i = 0; i < sz; i++) {
+        unsigned char c = oss.str().c_str()[i];
+        char asic_str[9];
+        sprintf(asic_str, "%2.2x", c);
+        ofs << "0x";
+        ofs << asic_str << ((i == sz - 1) ? "" : ", ");
+      }
+      ofs << "};\n";
+
+      string array_size = array_name + "_size";
+      ofs << "size_t " << array_size << " = " << sz << ";" << "\n";
+    } else {
+      if(gen_pci_id){
+        // Add a header to differentiate it from an LLVM bitcode binary.
+        // (5 bytes: 1 byte for the binary type, 4 bytes for the bc code; 'GENC' marks a gen binary.)
+        char gen_header[6] = "\0GENC";
+        OUTF_UPDATE_SZ(gen_header[0]);
+        OUTF_UPDATE_SZ(gen_header[1]);
+        OUTF_UPDATE_SZ(gen_header[2]);
+        OUTF_UPDATE_SZ(gen_header[3]);
+        OUTF_UPDATE_SZ(gen_header[4]);
+        OUTF_UPDATE_SZ(src_hw_info[0]);
+        OUTF_UPDATE_SZ(src_hw_info[1]);
+        OUTF_UPDATE_SZ(src_hw_info[2]);
+        sz = gbe_prog->serializeToBin(ofs);
+      }else{
+        char *llvm_binary;
+        size_t bin_length = gbe_program_serialize_to_binary((gbe_program)gbe_prog, &llvm_binary, 1);
+        ofs.write(llvm_binary, bin_length);
+        sz+=bin_length;
+      }
+    }
+
+    ofs.close();
+
+    if (!sz) {
+        throw FILE_SERIALIZATION_FAILED;
+    }
+}
+
+
+void program_build_instance::build_program(void) throw(int)
+{
+    gbe_program  opaque = NULL;
+    if(gen_pci_id){
+      opaque = gbe_program_new_from_source(gen_pci_id, code, 0, build_opt.c_str(), NULL, NULL);
+    }else{
+      opaque = gbe_program_compile_from_source(0, code, NULL, 0, build_opt.c_str(), NULL, NULL);
+    }
+    if (!opaque)
+        throw FILE_BUILD_FAILED;
+
+    gbe_prog = reinterpret_cast<gbe::Program*>(opaque);
+
+    if(gen_pci_id){
+      assert(gbe_program_get_kernel_num(opaque));
+    }
+}
+
+const char* program_build_instance::file_map_open(void) throw(int)
+{
+    void * address;
+
+    /* Open the file */
+    fd = ::open(prog_path.c_str(), O_RDONLY);
+    if (fd < 0) {
+        throw FILE_NOT_FIND_ERR;
+    }
+
+    /* Map it */
+    file_len = lseek(fd, 0, SEEK_END);
+    lseek(fd, 0, SEEK_SET);
+    address = mmap(0, file_len, PROT_READ, MAP_SHARED, fd, 0);
+    if (address == MAP_FAILED) {
+        throw FILE_MAP_ERR;
+    }
+
+    code = reinterpret_cast<const char*>(address);
+    return code;
+}
+
+typedef vector<program_build_instance> prog_vector;
+
+int main (int argc, const char **argv)
+{
+    prog_vector prog_insts;
+    vector<string> argv_saved;
+    const char* build_opt;
+    const char* file_path;
+    int i;
+    int oc;
+    deque<int> used_index;
+
+    if (argc < 2) {
+        cout << "Usage: kernel_path [-pbuild_parameter]\n[-obin_path]" << endl;
+        return 0;
+    }
+
+    used_index.assign(argc, 0);
+
+    /* getopt may reorder argv, so we save a copy of it here. */
+    for (i=0; i< argc; i++) {
+        argv_saved.push_back(string(argv[i]));
+    }
+
+    while ( (oc = getopt(argc, (char * const *)argv, "t:o:p:s")) != -1 ) {
+        switch (oc) {
+        case 'p':
+        {
+            int opt_index;
+
+            if (argv[optind-1][0] == '-') { // "-pXXX" form
+                opt_index = optind - 1;
+            } else { // must be the "-p XXX" form
+                opt_index = optind - 2;
+                used_index[opt_index + 1] = 1;
+            }
+
+            /* The option must follow the file name. */
+            if ((opt_index < 2) || argv[opt_index-1][0] == '-') {
+                cout << "Usage note: the build option must follow the file name" << endl;
+                return 1;
+            }
+
+            file_path = argv[opt_index - 1];
+            build_opt = optarg;
+
+            prog_insts.push_back(program_build_instance(file_path, build_opt));
+            break;
+        }
+
+        case 'o':
+            if (!program_build_instance::set_bin_path(optarg)) {
+                cout << "Can not specify the bin path more than once." << endl;
+                return 1;
+            }
+            used_index[optind-1] = 1;
+            break;
+
+        case 't':
+        {
+            char *s = optarg;
+            if (optarg[0] == '0' && (optarg[1] == 'x' || optarg[1] == 'X'))
+            s += 2;
+
+            if (s[0] < '0' || s[0] > '9') {
+                cout << "Invalid target option argument" << endl;
+                return 1;
+            }
+
+            std::stringstream str(s);
+            str >> std::hex >> gen_pci_id;
+
+            used_index[optind-1] = 1;
+            break;
+        }
+
+        case 's':
+            program_build_instance::set_str_fmt_out(true);
+            used_index[optind-1] = 1;
+            break;
+
+        case ':':
+            cout << "Miss the file option argument" << endl;
+            return 1;
+
+        default:
+            cout << "Unknown opt" << endl;
+        }
+    }
+
+    for (i=1; i < argc; i++) {
+        //cout << argv_saved[i] << endl;
+        if (argv_saved[i].size() && argv_saved[i][0] != '-') {
+            if (used_index[i])
+                continue;
+
+            string file_name = argv_saved[i];
+            prog_vector::iterator result = find_if(prog_insts.begin(), prog_insts.end(),
+            [&](program_build_instance & prog_inst)-> bool {
+                bool result = false;
+                if (prog_inst.get_program_path() == file_name)
+                    result = true;
+
+                return result;
+            });
+
+            if (result == prog_insts.end()) {
+                prog_insts.push_back(program_build_instance(file_name.c_str(), ""));
+            }
+        }
+    }
+
+    for (auto& inst : prog_insts) {
+        try {
+            inst.file_map_open();
+            inst.build_program();
+            inst.serialize_program();
+        }
+        catch (int & err_no) {
+            if (err_no == FILE_NOT_FIND_ERR) {
+                cout << "can not open the file " <<
+                     inst.get_program_path() << endl;
+            } else if (err_no == FILE_MAP_ERR) {
+                cout << "map the file " <<
+                     inst.get_program_path() << " failed" << endl;
+            } else if (err_no == FILE_BUILD_FAILED) {
+                cout << "build the file " <<
+                     inst.get_program_path() << " failed" << endl;
+            } else if (err_no == FILE_SERIALIZATION_FAILED) {
+                cout << "Serialize the file " <<
+                     inst.get_program_path() << " failed" << endl;
+            }
+            return -1;
+        }
+    }
+
+    //for (auto& inst : prog_insts) {
+    //    inst.dump();
+    //}
+
+    return 0;
+}
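
A note on the on-disk layout produced above: when a gen PCI id is supplied, the output starts with a one-byte binary-type marker followed by the characters GENC and a three-character hardware tag (IVB, BYT or HSW), and only then the gen binary; without a PCI id the raw LLVM output is written instead. The following self-contained C++ sketch shows how such a header could be inspected. The helper name is hypothetical, and the check for the LLVM bitcode magic (BC 0xC0 0xDE) is an assumption about LLVM's format, not something defined by the tool above.

#include <cstring>
#include <fstream>
#include <iostream>
#include <string>

// Hypothetical helper: peek at the first bytes of a serialized kernel file and
// report whether it looks like a gen binary ("\0GENC" + 3-char hw tag) or
// plain LLVM output. Layout assumed from the generator above.
static void inspect_header(const std::string &path) {
  std::ifstream ifs(path, std::ios::binary);
  char buf[8] = {0};
  ifs.read(buf, sizeof(buf));
  if (buf[0] == '\0' && std::memcmp(buf + 1, "GENC", 4) == 0) {
    std::cout << path << ": gen binary, hw tag '"
              << std::string(buf + 5, 3) << "'\n";
  } else if (std::memcmp(buf, "BC\xc0\xde", 4) == 0) {
    std::cout << path << ": LLVM bitcode\n";
  } else {
    std::cout << path << ": unknown format\n";
  }
}

int main(int argc, char **argv) {
  for (int i = 1; i < argc; ++i)
    inspect_header(argv[i]);
  return 0;
}
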
diff --git a/backend/src/gbe_bin_interpreter.cpp b/backend/src/gbe_bin_interpreter.cpp
new file mode 100644
index 0000000..1c67a4b
--- /dev/null
+++ b/backend/src/gbe_bin_interpreter.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "sys/alloc.cpp"
+#include "sys/cvar.cpp"
+#include "sys/assert.cpp"
+#include "sys/platform.cpp"
+#include "ir/constant.cpp"
+#include "ir/printf.cpp"
+
+#pragma GCC diagnostic ignored "-Wunused-function"
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#undef GBE_COMPILER_AVAILABLE
+#include "backend/program.cpp"
+#include "backend/gen_program.cpp"
+#include "ir/sampler.cpp"
+#include "ir/image.cpp"
+
+struct BinInterpCallBackInitializer
+{
+  BinInterpCallBackInitializer() {
+    gbe_program_new_from_binary = gbe::genProgramNewFromBinary;
+    gbe_program_get_kernel_num = gbe::programGetKernelNum;
+    gbe_program_get_kernel_by_name = gbe::programGetKernelByName;
+    gbe_program_get_kernel = gbe::programGetKernel;
+    gbe_kernel_get_code_size = gbe::kernelGetCodeSize;
+    gbe_kernel_get_code = gbe::kernelGetCode;
+    gbe_kernel_get_arg_num = gbe::kernelGetArgNum;
+    gbe_kernel_get_curbe_size = gbe::kernelGetCurbeSize;
+    gbe_kernel_get_sampler_size = gbe::kernelGetSamplerSize;
+    gbe_kernel_get_compile_wg_size = gbe::kernelGetCompileWorkGroupSize;
+    gbe_kernel_get_stack_size = gbe::kernelGetStackSize;
+    gbe_kernel_get_image_size = gbe::kernelGetImageSize;
+    gbe_kernel_get_name = gbe::kernelGetName;
+    gbe_kernel_get_attributes = gbe::kernelGetAttributes;
+    gbe_kernel_get_arg_type = gbe::kernelGetArgType;
+    gbe_kernel_get_arg_size = gbe::kernelGetArgSize;
+    gbe_kernel_get_arg_bti = gbe::kernelGetArgBTI;
+    gbe_kernel_get_simd_width = gbe::kernelGetSIMDWidth;
+    gbe_kernel_get_scratch_size = gbe::kernelGetScratchSize;
+    gbe_kernel_use_slm = gbe::kernelUseSLM;
+    gbe_kernel_get_required_work_group_size = gbe::kernelGetRequiredWorkGroupSize;
+    gbe_kernel_get_curbe_offset = gbe::kernelGetCurbeOffset;
+    gbe_kernel_get_slm_size = gbe::kernelGetSLMSize;
+    gbe_kernel_get_arg_align = gbe::kernelGetArgAlign;
+    gbe_program_get_global_constant_size = gbe::programGetGlobalConstantSize;
+    gbe_program_delete = gbe::programDelete;
+    gbe_program_get_global_constant_data = gbe::programGetGlobalConstantData;
+    gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
+    gbe_kernel_get_image_data = gbe::kernelGetImageData;
+    gbe_kernel_get_arg_info = gbe::kernelGetArgInfo;
+    gbe_get_printf_num = gbe::kernelGetPrintfNum;
+    gbe_get_printf_buf_bti = gbe::kernelGetPrintfBufBTI;
+    gbe_get_printf_indexbuf_bti = gbe::kernelGetPrintfIndexBufBTI;
+    gbe_dup_printfset = gbe::kernelDupPrintfSet;
+    gbe_get_printf_sizeof_size = gbe::kernelGetPrintfSizeOfSize;
+    gbe_release_printf_info = gbe::kernelReleasePrintfSet;
+    gbe_output_printf = gbe::kernelOutputPrintf;
+  }
+
+  ~BinInterpCallBackInitializer() {
+  }
+};
+
+static struct BinInterpCallBackInitializer binInterpCB;
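
The file above relies on a common C++ self-registration idiom: a file-scope object whose constructor runs during static initialization and fills the gbe_* function-pointer table before main() is entered, so consumers of the interpreter-only build never call an explicit init function. A minimal sketch of the same idiom, with hypothetical names standing in for the real callback slots:

#include <iostream>

// Hypothetical callback slot, standing in for one of the gbe_* function
// pointers the real initializer above fills in.
typedef int (*get_kernel_num_cb)(void);
static get_kernel_num_cb gbe_get_kernel_num_example = nullptr;

static int exampleGetKernelNum(void) { return 42; }

// Same idiom as BinInterpCallBackInitializer: a file-scope object whose
// constructor wires the callback table during static initialization.
struct ExampleCallBackInitializer {
  ExampleCallBackInitializer() {
    gbe_get_kernel_num_example = exampleGetKernelNum;
  }
};
static ExampleCallBackInitializer exampleCB;

int main() {
  // By the time main() runs, the static constructor has installed the callback.
  std::cout << gbe_get_kernel_num_example() << "\n";
  return 0;
}

The usual trade-off applies: static initialization order across translation units is unspecified, so such callbacks should not be used from other static constructors.
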
diff --git a/backend/src/gen_as.sh b/backend/src/gen_as.sh
new file mode 100755
index 0000000..7dea15d
--- /dev/null
+++ b/backend/src/gen_as.sh
@@ -0,0 +1,101 @@
+#! /bin/sh -e
+
+. ./genconfig.sh
+
+# Generate list of union sizes
+for type in $TYPES; do
+        size=`IFS=:; set -- dummy $type; echo $3`
+        for vector_length in $VECTOR_LENGTHS; do
+                if test $vector_length -eq 3; then
+                      continue;
+                fi
+                union_sizes="$union_sizes `expr $vector_length \* $size`"
+        done
+done
+union_sizes="`echo $union_sizes | tr ' ' '\n' | sort -n | uniq`"
+
+# For each union size
+for union_size in $union_sizes; do
+
+        # Define a union that contains all vector types whose size equals the union size
+        unionname="union _type_cast_${union_size}_b"
+        echo "$unionname {"
+        for type in $TYPES; do
+                basetype=`IFS=:; set -- dummy $type; echo $2`
+                basesize=`IFS=:; set -- dummy $type; echo $3`
+                for vector_length in $VECTOR_LENGTHS; do
+                        if test $vector_length -eq 3; then
+                                vector_size_length="4"
+                        else
+                                vector_size_length=$vector_length;
+                        fi
+                        vector_size_in_union="`expr $vector_size_length \* $basesize`"
+                        if test $union_size -ne $vector_size_in_union; then
+                                continue
+                        fi
+                        if test $vector_length -eq 1; then
+                                vectortype=$basetype
+                        else
+                                vectortype=$basetype$vector_length
+                        fi
+                        echo "  $vectortype _$vectortype;"
+                done
+
+        done
+        echo "};"
+        echo
+
+        # For each pair of vector types whose size matches the current union size,
+        # define an as_* function that reinterprets one type as the other without changing the binary representation.
+        for ftype in $TYPES; do
+                fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+                fbasesize=`IFS=:; set -- dummy $ftype; echo $3`
+                for fvector_length in $VECTOR_LENGTHS; do
+                        if test $fvector_length -eq 3; then
+                                fvector_size_length="4"
+                        else
+                                fvector_size_length=$fvector_length;
+                        fi
+                        fvector_size_in_union="`expr $fvector_size_length \* $fbasesize`"
+                        if test $union_size -ne $fvector_size_in_union; then
+                                continue
+                        fi
+                        if test $fvector_length -eq 1; then
+                                fvectortype=$fbasetype
+                        else
+                                fvectortype=$fbasetype$fvector_length
+                        fi
+                        for ttype in $TYPES; do
+                                tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+                                tbasesize=`IFS=:; set -- dummy $ttype; echo $3`
+                                if test $fbasetype = $tbasetype; then
+                                        continue
+                                fi
+                                for tvector_length in $VECTOR_LENGTHS; do
+                                        if test $tvector_length -eq 3; then
+                                               tvector_size_length="4"
+                                        else
+                                               tvector_size_length=$tvector_length;
+                                        fi
+                                        tvector_size_in_union="`expr $tvector_size_length \* $tbasesize`"
+                                        if test $union_size -ne $tvector_size_in_union; then
+                                                continue
+                                        fi
+                                        if test $tvector_length -eq 1; then
+                                                tvectortype=$tbasetype
+                                        else
+                                                tvectortype=$tbasetype$tvector_length
+                                        fi
+                                        echo "INLINE OVERLOADABLE $tvectortype as_$tvectortype($fvectortype v) {"
+                                        echo "  $unionname u;"
+                                        echo "  u._$fvectortype = v;"
+                                        echo "  return u._$tvectortype;"
+                                        echo "}"
+                                        echo
+                                done
+                        done
+                done
+
+        done
+
+done
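
The script above emits OpenCL C as_* helpers that reinterpret a value of one type as another type of the same size through a union, i.e. a pure bit copy with no numeric conversion. A rough C++ analog of what one generated helper does (the function name is hypothetical; standard C++ prefers memcpy over union punning for this):

#include <cstdint>
#include <cstring>
#include <iostream>

// C++ analog of a generated as_* helper: reinterpret the bits of a value
// as another same-size type, without any numeric conversion.
static uint32_t as_uint_example(float v) {
  static_assert(sizeof(uint32_t) == sizeof(float), "size mismatch");
  uint32_t u;
  std::memcpy(&u, &v, sizeof(u));
  return u;
}

int main() {
  // 1.0f has the IEEE-754 bit pattern 0x3f800000.
  std::cout << std::hex << as_uint_example(1.0f) << "\n";
  return 0;
}
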
diff --git a/backend/src/gen_builtin_vector.py b/backend/src/gen_builtin_vector.py
new file mode 100755
index 0000000..2d602c8
--- /dev/null
+++ b/backend/src/gen_builtin_vector.py
@@ -0,0 +1,384 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2012 Intel Corporation
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library. If not, see <http://www.gnu.org/licenses/>.
+#
+# Author: Zhigang Gong <zhigang.gong at linux.intel.com>
+#/
+
+# This file generates inline code that lowers the builtin
+# vector functions to their scalar counterparts.
+import re
+import sys
+import os
+
+if len(sys.argv) != 3:
+    print "Invalid argument {0}".format(sys.argv)
+    print "use {0} spec_file_name output_file_name".format(sys.argv[0])
+    raise
+
+all_vector = 1,2,3,4,8,16
+
+# generate generic type sets
+def gen_vector_type(type_set, vector_set = all_vector):
+    ret = []
+    for t in type_set:
+        for i in vector_set:
+            ret.append((t, i))
+    return ret
+
+def set_vector_memspace(vector_type_set, memspace):
+    ret = []
+    if memspace == '':
+        return vector_type_set
+    for t in vector_type_set:
+        ret.append((t[0], t[1], memspace))
+    return ret
+
+# If the type tuple has 3 elements, it describes a pointer whose memory
+# space is given by the third element.
+def isPointer(t):
+    return len(t) == 3
+
+all_itype = "char","short","int","long"
+all_utype = "uchar","ushort","uint","ulong"
+all_int_type = all_itype + all_utype
+
+all_float_type = "float","double"
+all_type = all_int_type + all_float_type
+
+# all vector/scalar types
+for t in all_type:
+    exec "{0}n = [\"{0}n\", gen_vector_type([\"{0}\"])]".format(t)
+    exec "s{0} = [\"{0}\", gen_vector_type([\"{0}\"], [1])]".format(t)
+
+# Predefined type sets according to the OpenCL spec.
+math_gentype = ["math_gentype", gen_vector_type(all_float_type)]
+math_gentypef = ["math_gentypef", gen_vector_type(["float"])]
+math_gentyped = ["math_gentyped", gen_vector_type(["double"])]
+
+half_native_math_gentype = ["half_native_math_gentype", gen_vector_type(["float"])]
+
+integer_gentype = ["integer_gentype", gen_vector_type(all_int_type)]
+integer_ugentype = ["integer_ugentype", gen_vector_type(all_utype)]
+integer_sgentype = ["integer_sgentype", gen_vector_type(all_int_type, [1])]
+
+fast_integer_gentype = ["fast_integer_gentype", gen_vector_type(["uint", "int"])]
+
+common_gentype = ["common_gentype", gen_vector_type(all_float_type)]
+common_gentypef = ["common_gentypef", gen_vector_type(["float"])]
+common_gentyped = ["common_gentyped", gen_vector_type(["double"])]
+
+relational_gentype = ["relational_gentype", gen_vector_type(all_type)]
+relational_igentype = ["relational_igentype", gen_vector_type(all_itype)]
+relational_ugentype = ["relational_ugentype", gen_vector_type(all_utype)]
+
+misc_gentypem = ["misc_gentypem", gen_vector_type(all_type, [2, 4, 8, 16])]
+misc_gentypen = ["misc_gentypen", gen_vector_type(all_type, [2, 4, 8, 16])]
+misc_ugentypem = ["misc_ugentypem", gen_vector_type(all_utype, [2, 4, 8, 16])]
+misc_ugentypen = ["misc_ugentypen", gen_vector_type(all_utype, [2, 4, 8, 16])]
+
+all_predefined_type = math_gentype, math_gentypef, math_gentyped,                \
+                      half_native_math_gentype, integer_gentype,integer_sgentype,\
+                      integer_ugentype, charn, ucharn, shortn, ushortn, intn,    \
+                      uintn, longn, ulongn, floatn, doublen,                     \
+                      fast_integer_gentype, common_gentype, common_gentypef,     \
+                      common_gentyped, relational_gentype, relational_igentype,  \
+                      relational_ugentype, schar, suchar, sshort, sint, suint,   \
+                      slong, sulong, sfloat, sdouble, misc_gentypem,              \
+                      misc_ugentypem, misc_gentypen, misc_ugentypen
+
+# type dictionary contains all the predefined type sets.
+type_dict = {}
+
+for t in all_predefined_type:
+    type_dict.update({t[0]:t[1]})
+
+def _prefix(prefix, dtype):
+    if dtype.count("gentype") != 0:
+        return prefix + '_' + dtype
+    return dtype
+
+memspaces = ["__local ", "__private ", "__global "]
+
+def stripMemSpace(t):
+    if t[0:2] == '__':
+        for memspace in memspaces :
+            if t[0:len(memspace)] == memspace:
+                return memspace, t[len(memspace):]
+    return '', t
+
+def check_type(types):
+    for t in types:
+        memspace, t = stripMemSpace(t)
+        if not t in type_dict:
+            print t
+            raise Exception("found invalid type.")
+
+def match_unsigned(dtype):
+    if dtype[0] == 'float':
+        return ["uint", dtype[1]]
+    if dtype[0] == 'double':
+        return ["ulong", dtype[1]]
+    if dtype[0][0] == 'u':
+        return dtype
+    return ['u' + dtype[0], dtype[1]]
+
+def match_signed(dtype):
+    if dtype[0] == 'float':
+        return ["int", dtype[1]]
+    if dtype[0] == 'double':
+        return ["long", dtype[1]]
+    if dtype[0][0] != 'u':
+        return dtype
+    return [dtype[0][1:], dtype[1]]
+
+def match_scalar(dtype):
+    return [dtype[0], 1]
+
+# dstType is the expected type and srcType is the
+# reference type. Sometimes the two differ; in that
+# case we reconcile them and return the correct
+# dst type.
+def fixup_type(dstType, srcType, n):
+    if dstType == srcType:
+       return dstType[n]
+
+    if dstType != srcType:
+        # scalar dst type
+        if len(dstType) == 1:
+            return dstType[0]
+        # dst is not scalar but src is scalar
+        if len(srcType) == 1:
+            return dstType[n]
+        if dstType == integer_sgentype[1] and srcType == integer_gentype[1]:
+            return match_scalar(srcType[n])
+
+        if dstType == integer_gentype[1] and  \
+           (srcType == integer_sgentype[1] or \
+            srcType == integer_ugentype[1]):
+            return dstType[n]
+
+        if dstType == integer_ugentype[1] and srcType == integer_gentype[1]:
+            return match_unsigned(srcType[n])
+
+        if dstType == relational_igentype[1] and srcType == relational_gentype[1]:
+            return match_signed(srcType[n])
+        if dstType == relational_ugentype[1] and srcType == relational_gentype[1]:
+            return match_unsigned(srcType[n])
+
+        if dstType == relational_gentype[1] and    \
+           (srcType == relational_igentype[1] or   \
+            srcType == relational_ugentype[1]):
+            return dstType[n]
+
+        if (len(dstType) == len(srcType)):
+            return dstType[n]
+
+    print dstType, srcType
+    raise Exception("type mismatch")
+
+class builtinProto():
+    valueTypeStr = ""
+    functionName = ""
+    paramTypeStrs = []
+    paramCount = 0
+    outputStr = []
+    prefix = ""
+
+    def init(self, sectionHeader, sectionPrefix):
+        self.valueTypeStr = ""
+        self.functionName = ""
+        self.paramTypeStrs = []
+        self.paramCount = 0
+        if sectionHeader != "":
+            self.outputStr = [sectionHeader]
+        else:
+            self.outputStr = []
+        if sectionPrefix != "":
+            self.prefix = sectionPrefix
+        self.indent = 0
+
+    def append(self, line, nextInit = ""):
+        self.outputStr.append(line);
+        return nextInit;
+
+    def indentSpace(self):
+        ret = ""
+        for i in range(self.indent):
+            ret += ' '
+
+        return ret
+
+    def init_from_line(self, t):
+        self.append('//{0}'.format(t))
+        line = filter(None, re.split(',| |\(', t.rstrip(')\n')))
+        self.paramCount = 0
+        stripped = 0
+        memSpace = ''
+        for i, text in enumerate(line):
+            idx = i - stripped
+            if idx == 0:
+                self.valueTypeStr = _prefix(self.prefix, line[i])
+                continue
+
+            if idx == 1:
+                self.functionName = line[i];
+                continue
+
+            if idx % 2 == 0:
+                if line[i][0] == '(':
+                    tmpType = line[i][1:]
+                else:
+                    tmpType = line[i]
+                if tmpType == '__local' or   \
+                   tmpType == '__private' or \
+                   tmpType == '__global':
+                   memSpace = tmpType + ' '
+                   stripped += 1
+                   continue
+                self.paramTypeStrs.append(memSpace + _prefix(self.prefix, tmpType))
+                memSpace = ''
+                self.paramCount += 1
+
+    def gen_proto_str_1(self, vtypeSeq, ptypeSeqs, i):
+        for n in range(0, self.paramCount):
+            ptype = fixup_type(ptypeSeqs[n], vtypeSeq, i);
+            vtype = fixup_type(vtypeSeq, ptypeSeqs[n], i);
+            # XXX FIXME: skip all double vectors for now, as we have not
+            # defined the scalar versions' prototypes.
+            if ptype[0].find('double') != -1 or \
+               vtype[0].find('double') != -1:
+                return
+
+            if (n == 0):
+                formatStr = 'INLINE_OVERLOADABLE {0}{1} {2} ('.format(vtype[0], vtype[1], self.functionName)
+            else:
+                formatStr += ', '
+
+            if vtype[1] == 1:
+                return
+
+            if isPointer(ptype):
+                formatStr += ptype[2]
+                pointerStr = '*'
+            else:
+                pointerStr = ''
+
+            if ptype[1] != 1:
+                formatStr += '{0}{1} {2}param{3}'.format(ptype[0], ptype[1], pointerStr, n)
+            else:
+                formatStr += '{0} {1}param{2}'.format(ptype[0], pointerStr, n)
+
+        formatStr += ')'
+        formatStr = self.append(formatStr, '{{return ({0}{1})('.format(vtype[0], vtype[1]))
+        self.indent = len(formatStr)
+        for j in range(0, vtype[1]):
+            if (j != 0):
+                formatStr += ','
+                if (j + 1) % 2 == 0:
+                    formatStr += ' '
+                if j % 2 == 0:
+                    formatStr = self.append(formatStr, self.indentSpace())
+
+            if self.prefix == 'relational' and self.functionName != 'bitselect' and self.functionName != 'select':
+                formatStr += '-'
+            formatStr += '{0}('.format(self.functionName)
+            for n in range(0, self.paramCount):
+                if n != 0:
+                    formatStr += ', '
+
+                ptype = fixup_type(ptypeSeqs[n], vtypeSeq, i)
+                vtype = fixup_type(vtypeSeq, ptypeSeqs[n], i)
+                if vtype[1] != ptype[1]:
+                    if ptype[1] != 1:
+                        raise "parameter is not a scalar but has different width with result value."
+                    if isPointer(ptype):
+                        formatStr += '&'
+                    formatStr += 'param{0}'.format(n)
+                    continue
+
+                if (isPointer(ptype)):
+                    formatStr += '({0} {1} *)param{2} + {3:2d}'.format(ptype[2], ptype[0], n, j)
+                else:
+                    if (self.functionName == 'select' and n == 2):
+                        formatStr += '({0})(param{1}.s{2:X} & (({0})1 << (sizeof({0})*8 - 1)))'.format(ptype[0], n, j)
+                    else:
+                        formatStr += 'param{0}.s{1:X}'.format(n, j)
+
+            formatStr += ')'
+
+        formatStr += '); }\n'
+        self.append(formatStr)
+
+        return formatStr
+
+    def output(self, outFile = None):
+        # Print to stdout when no output file is given.
+        for line in self.outputStr:
+            if outFile is None:
+                print line
+            else:
+                outFile.write('{0}\n'.format(line))
+
+    def gen_proto_str(self):
+        check_type([self.valueTypeStr] + self.paramTypeStrs)
+        vtypeSeq = type_dict[self.valueTypeStr]
+        ptypeSeqs = []
+        count = len(vtypeSeq);
+        for t in self.paramTypeStrs:
+            memspace,t = stripMemSpace(t)
+            ptypeSeqs.append(set_vector_memspace(type_dict[t], memspace))
+            count = max(count, len(type_dict[t]))
+
+        for i in range(count):
+            formatStr = self.gen_proto_str_1(vtypeSeq, ptypeSeqs, i)
+
+        self.append("")
+
+def safeUnlink(filename):
+    try:
+        os.remove(filename)
+    except OSError:
+        pass
+
+# save the prototypes into ocl_vector.h
+specFile = open(sys.argv[1], 'r')
+headerFileName = sys.argv[2]
+tempHeaderFileName = sys.argv[2] + '.tmp'
+safeUnlink(headerFileName)
+tempHeader = open(tempHeaderFileName, 'w')
+
+tempHeader.write("//This file is autogenerated by {0}.\n".format(sys.argv[0]))
+tempHeader.write("//Don't modify it manually.\n")
+
+functionProto = builtinProto()
+for line in specFile:
+    if line.isspace():
+        continue
+    if line[0] == '#':
+        if line[1] == '#':
+            sectionHeader = "//{0} builtin functions".format(line[2:].rstrip())
+            sectionPrefix=(line[2:].split())[0]
+        continue
+    functionProto.init(sectionHeader, sectionPrefix)
+    sectionHeader = ""
+    sectionPrefix = ""
+    functionProto.init_from_line(line)
+    functionProto.gen_proto_str()
+    functionProto.output(tempHeader)
+
+tempHeader.close()
+os.rename(tempHeaderFileName, headerFileName)
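
Conceptually, the generator reads prototypes such as "intn isequal (floatn x, floatn y)" from the spec file and emits one wrapper per vector width that calls the scalar builtin once per component; for the relational section it also negates each component, since the vector relational functions return -1 for true while the scalar forms return 1. A hand-written C++ sketch of what one such expansion amounts to, using plain structs in place of the OpenCL vector types (all names here are illustrative):

#include <iostream>

// Stand-ins for the OpenCL vector types the generated code operates on.
struct float4 { float s0, s1, s2, s3; };
struct int4   { int   s0, s1, s2, s3; };

// Scalar builtin (here just the plain comparison).
static int isequal_scalar(float x, float y) { return x == y; }

// Hand-written version of the expansion the generator emits for
// "intn isequal (floatn x, floatn y)": one scalar call per component,
// negated so that true becomes -1, as the vector relational
// functions require.
static int4 isequal4(float4 x, float4 y) {
  return int4{ -isequal_scalar(x.s0, y.s0), -isequal_scalar(x.s1, y.s1),
               -isequal_scalar(x.s2, y.s2), -isequal_scalar(x.s3, y.s3) };
}

int main() {
  float4 a = {1.f, 2.f, 3.f, 4.f};
  float4 b = {1.f, 0.f, 3.f, 0.f};
  int4 r = isequal4(a, b);
  std::cout << r.s0 << " " << r.s1 << " " << r.s2 << " " << r.s3 << "\n";
  return 0;
}
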
diff --git a/backend/src/gen_convert.sh b/backend/src/gen_convert.sh
new file mode 100755
index 0000000..b940222
--- /dev/null
+++ b/backend/src/gen_convert.sh
@@ -0,0 +1,553 @@
+#! /bin/sh -e
+
+. ./genconfig.sh
+
+# For all vector lengths and types, generate conversion functions
+for vector_length in $VECTOR_LENGTHS; do
+        if test $vector_length -eq 1; then
+          for ftype in $TYPES; do
+            fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+            for ttype in $TYPES; do
+              tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+              echo "INLINE OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v) {"
+              echo "  return ($tbasetype)v;"
+              echo "}"
+              echo
+            done
+          done
+        else
+          for ftype in $TYPES; do
+                fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+                for ttype in $TYPES; do
+                        tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+                        if test $fbasetype = $tbasetype; then
+                          if test $vector_length -gt 1; then
+                            fvectortype=$fbasetype$vector_length
+                            tvectortype=$tbasetype$vector_length
+                            echo "INLINE OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v) { return v; }"
+                          else
+                            echo "INLINE OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v) { return v; }"
+                          fi
+                          continue
+                        fi
+                        fvectortype=$fbasetype$vector_length
+                        tvectortype=$tbasetype$vector_length
+                        construct="($tbasetype)(v.s0)"
+                        if test $vector_length -gt 1; then
+                                construct="$construct, ($tbasetype)(v.s1)"
+                        fi
+                        if test $vector_length -gt 2; then
+                                construct="$construct, ($tbasetype)(v.s2)"
+                        fi
+                        if test $vector_length -gt 3; then
+                                construct="$construct, ($tbasetype)(v.s3)"
+                        fi
+                        if test $vector_length -gt 4; then
+                                construct="$construct, ($tbasetype)(v.s4)"
+                                construct="$construct, ($tbasetype)(v.s5)"
+                                construct="$construct, ($tbasetype)(v.s6)"
+                                construct="$construct, ($tbasetype)(v.s7)"
+                        fi
+                        if test $vector_length -gt 8; then
+                                construct="$construct, ($tbasetype)(v.s8)"
+                                construct="$construct, ($tbasetype)(v.s9)"
+                                construct="$construct, ($tbasetype)(v.sA)"
+                                construct="$construct, ($tbasetype)(v.sB)"
+                                construct="$construct, ($tbasetype)(v.sC)"
+                                construct="$construct, ($tbasetype)(v.sD)"
+                                construct="$construct, ($tbasetype)(v.sE)"
+                                construct="$construct, ($tbasetype)(v.sF)"
+                        fi
+
+                        echo "INLINE OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v) {"
+                        echo "  return ($tvectortype)($construct);"
+                        echo "}"
+                        echo
+                done
+          done
+        fi
+done
+
+echo '
+#define DEF(DSTTYPE, SRCTYPE) \
+  OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);
+DEF(char, uchar);
+DEF(char, short);
+DEF(char, ushort);
+DEF(char, int);
+DEF(char, uint);
+DEF(char, float);
+DEF(uchar, char);
+DEF(uchar, short);
+DEF(uchar, ushort);
+DEF(uchar, int);
+DEF(uchar, uint);
+DEF(uchar, float);
+DEF(short, ushort);
+DEF(short, int);
+DEF(short, uint);
+DEF(short, float);
+DEF(ushort, short);
+DEF(ushort, int);
+DEF(ushort, uint);
+DEF(ushort, float);
+DEF(int, uint);
+DEF(int, float);
+DEF(uint, int);
+DEF(uint, float);
+#undef DEF
+
+#define DEF(DSTTYPE, SRCTYPE, MIN, MAX) \
+  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+    return x >= MAX ? (DSTTYPE)MAX : x <= MIN ? (DSTTYPE)MIN : x; \
+  }
+DEF(char, long, -128, 127);
+DEF(uchar, long, 0, 255);
+DEF(short, long, -32768, 32767);
+DEF(ushort, long, 0, 65535);
+DEF(int, long, -0x7fffffff-1, 0x7fffffff);
+DEF(uint, long, 0, 0xffffffffu);
+DEF(long, float, -9.223372036854776e+18f, 9.223372036854776e+18f);
+DEF(ulong, float, 0, 1.8446744073709552e+19f);
+#undef DEF
+
+#define DEF(DSTTYPE, SRCTYPE, MAX) \
+  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+    return x >= MAX ? (DSTTYPE)MAX : x; \
+  }
+DEF(char, ulong, 127);
+DEF(uchar, ulong, 255);
+DEF(short, ulong, 32767);
+DEF(ushort, ulong, 65535);
+DEF(int, ulong, 0x7fffffff);
+DEF(uint, ulong, 0xffffffffu);
+#undef DEF
+
+INLINE_OVERLOADABLE long convert_long_sat(ulong x) {
+  ulong MAX = 0x7ffffffffffffffful;
+  return x >= MAX ? MAX : x;
+}
+
+#define DEF(DSTTYPE, SRCTYPE) \
+  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+    return x <= 0 ? 0 : x; \
+  }
+DEF(ushort, char);
+DEF(uint, char);
+DEF(uint, short);
+DEF(ulong, char);
+DEF(ulong, short);
+DEF(ulong, int);
+DEF(ulong, long);
+#undef DEF
+
+#define DEF(DSTTYPE, SRCTYPE) \
+  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+    return x; \
+  }
+DEF(char, char);
+DEF(uchar, uchar);
+DEF(short, char);
+DEF(short, uchar);
+DEF(short, short);
+DEF(ushort, uchar);
+DEF(ushort, ushort);
+DEF(int, char);
+DEF(int, uchar);
+DEF(int, short);
+DEF(int, ushort);
+DEF(int, int);
+DEF(uint, uchar);
+DEF(uint, ushort);
+DEF(uint, uint);
+DEF(long, char);
+DEF(long, uchar);
+DEF(long, short);
+DEF(long, ushort);
+DEF(long, int);
+DEF(long, uint);
+DEF(long, long);
+DEF(ulong, uchar);
+DEF(ulong, ushort);
+DEF(ulong, uint);
+DEF(ulong, ulong);
+#undef DEF
+'
+
+# vector convert_DSTTYPE_sat function
+for vector_length in $VECTOR_LENGTHS; do
+  if test $vector_length -eq 1; then continue; fi
+
+  for ftype in $TYPES; do
+    fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+    if test $fbasetype = "double"; then continue; fi
+
+    for ttype in $TYPES; do
+      tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+      if test $tbasetype = "double" -o $tbasetype = "float"; then continue; fi
+
+      fvectortype=$fbasetype$vector_length
+      tvectortype=$tbasetype$vector_length
+      conv="convert_${tbasetype}_sat"
+
+      construct="$conv(v.s0)"
+      if test $vector_length -gt 1; then
+        construct="$construct, $conv(v.s1)"
+      fi
+      if test $vector_length -gt 2; then
+        construct="$construct, $conv(v.s2)"
+      fi
+      if test $vector_length -gt 3; then
+        construct="$construct, $conv(v.s3)"
+      fi
+      if test $vector_length -gt 4; then
+        construct="$construct, $conv(v.s4)"
+        construct="$construct, $conv(v.s5)"
+        construct="$construct, $conv(v.s6)"
+        construct="$construct, $conv(v.s7)"
+      fi
+      if test $vector_length -gt 8; then
+        construct="$construct, $conv(v.s8)"
+        construct="$construct, $conv(v.s9)"
+        construct="$construct, $conv(v.sA)"
+        construct="$construct, $conv(v.sB)"
+        construct="$construct, $conv(v.sC)"
+        construct="$construct, $conv(v.sD)"
+        construct="$construct, $conv(v.sE)"
+        construct="$construct, $conv(v.sF)"
+      fi
+
+      echo "INLINE OVERLOADABLE $tvectortype convert_${tvectortype}_sat($fvectortype v) {"
+      echo "  return ($tvectortype)($construct);"
+      echo "}"
+      echo
+    done
+  done
+done
+
+echo '
+float __gen_ocl_rndz(float x);
+float __gen_ocl_rnde(float x);
+float __gen_ocl_rndu(float x);
+float __gen_ocl_rndd(float x);
+INLINE_OVERLOADABLE float __convert_float_rtz(long x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long l = u.f;
+  if((l > x && x > 0) || x >= 0x7fffffc000000000 ||
+     (l < x && x < 0)) {
+      u.u -= 1;
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(long x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long l = u.f;  // cannot compare u.f < x directly: x would be rounded to float
+  if(l < x && x < 0x7fffffc000000000) {
+    if(x > 0)
+      u.u = u.u + 1;
+    else
+      u.u = u.u - 1;
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(long x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long l = u.f;  //avoid overflow
+  if(l > x || x >= 0x7fffffc000000000) {
+    if(x > 0)
+      u.u = u.u - 1;
+    else
+      u.u = u.u + 1;
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtz(ulong x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  ulong l = u.f;
+  if(l > x  || x >= 0xffffff8000000000)
+      u.u -= 1;
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(ulong x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  ulong l = u.f;  // cannot compare u.f < x directly: x would be rounded to float
+  if(l < x && x < 0xffffff8000000000)
+    u.u = u.u + 1;
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(ulong x)
+{
+  return __convert_float_rtz(x);
+}
+INLINE_OVERLOADABLE float __convert_float_rtz(int x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long i = u.f;
+  if((i > x && x > 0) ||
+     (i < x && x < 0)) {
+      u.u -= 1;
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(int x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  int i = u.f;
+  if(i < x) {
+    if(x > 0)
+      u.u += 1;
+    else
+      u.u -= 1;
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(int x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long i = u.f;  //avoid overflow
+  if(i > x) {
+    if(x > 0)
+      u.u = u.u - 1;
+    else
+      u.u = u.u + 1;
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtz(uint x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  ulong i = u.f;
+  if(i > x)
+    u.u -= 1;
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(uint x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  uint i = u.f;
+  if(i < x)
+    u.u += 1;
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(uint x)
+{
+  return __convert_float_rtz(x);
+}
+'
+
+# convert_DSTTYPE_ROUNDING function
+for vector_length in $VECTOR_LENGTHS; do
+  for ftype in $TYPES; do
+    fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+    if test $fbasetype = "double"; then continue; fi
+
+    for ttype in $TYPES; do
+      tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+      if test $tbasetype = "double"; then continue; fi
+
+      if test $vector_length -eq 1; then
+        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_rte($fbasetype x)"
+        if test $fbasetype = "float" -a $tbasetype != "float"; then
+          echo "{ return __gen_ocl_rnde(x); }"
+        else
+          echo "{ return x; }"
+        fi
+
+        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_rtz($fbasetype x)"
+        if test $fbasetype = "float" -a $tbasetype != "float"; then
+          echo "{ return __gen_ocl_rndz(x); }"
+        elif [ "$fbasetype" = "int" -o "$fbasetype" = "uint" -o "$fbasetype" = "long" -o "$fbasetype" = "ulong" ] && [ "$tbasetype" = "float" ]; then
+          echo "{ return __convert_${tbasetype}_rtz(x); }"
+        else
+          echo "{ return x; }"
+        fi
+
+        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_rtp($fbasetype x)"
+        if test $fbasetype = "float" -a $tbasetype != "float"; then
+          echo "{ return __gen_ocl_rndu(x); }"
+        elif [ "$fbasetype" = "int" -o "$fbasetype" = "uint" -o "$fbasetype" = "long" -o "$fbasetype" = "ulong" ] && [ "$tbasetype" = "float" ]; then
+          echo "{ return __convert_${tbasetype}_rtp(x); }"
+        else
+          echo "{ return x; }"
+        fi
+
+        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_rtn($fbasetype x)"
+        if test $fbasetype = "float" -a $tbasetype != "float"; then
+          echo "{ return __gen_ocl_rndd(x); }"
+        elif [ "$fbasetype" = "int" -o "$fbasetype" = "uint" -o "$fbasetype" = "long" -o "$fbasetype" = "ulong" ] && [ "$tbasetype" = "float" ]; then
+          echo "{ return __convert_${tbasetype}_rtn(x); }"
+        else
+          echo "{ return x; }"
+        fi
+
+        continue
+      fi
+
+      for rounding in $ROUNDING_MODES; do
+        fvectortype=$fbasetype$vector_length
+        tvectortype=$tbasetype$vector_length
+        conv="convert_${tbasetype}_${rounding}"
+
+        construct="$conv(v.s0)"
+        if test $vector_length -gt 1; then
+          construct="$construct, $conv(v.s1)"
+        fi
+        if test $vector_length -gt 2; then
+          construct="$construct, $conv(v.s2)"
+        fi
+        if test $vector_length -gt 3; then
+          construct="$construct, $conv(v.s3)"
+        fi
+        if test $vector_length -gt 4; then
+          construct="$construct, $conv(v.s4)"
+          construct="$construct, $conv(v.s5)"
+          construct="$construct, $conv(v.s6)"
+          construct="$construct, $conv(v.s7)"
+        fi
+        if test $vector_length -gt 8; then
+          construct="$construct, $conv(v.s8)"
+          construct="$construct, $conv(v.s9)"
+          construct="$construct, $conv(v.sA)"
+          construct="$construct, $conv(v.sB)"
+          construct="$construct, $conv(v.sC)"
+          construct="$construct, $conv(v.sD)"
+          construct="$construct, $conv(v.sE)"
+          construct="$construct, $conv(v.sF)"
+        fi
+
+        echo "INLINE OVERLOADABLE $tvectortype convert_${tvectortype}_${rounding}($fvectortype v) {"
+        echo "  return ($tvectortype)($construct);"
+        echo "}"
+        echo
+      done
+    done
+  done
+done
+
+# convert_DSTTYPE_sat_ROUNDING function
+for vector_length in $VECTOR_LENGTHS; do
+  for ftype in $TYPES; do
+    fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+    if test $fbasetype = "double"; then continue; fi
+
+    for ttype in $TYPES; do
+      tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+      if test $tbasetype = "double" -o $tbasetype = "float"; then continue; fi
+
+      if test $vector_length -eq 1; then
+        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rte($fbasetype x)"
+        if test $fbasetype = "float"; then
+          echo "{ return convert_${tbasetype}_sat(__gen_ocl_rnde(x)); }"
+        else
+          echo "{ return convert_${tbasetype}_sat(x); }"
+        fi
+
+        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtz($fbasetype x)"
+        if test $fbasetype = "float"; then
+          echo "{ return convert_${tbasetype}_sat(__gen_ocl_rndz(x)); }"
+        else
+          echo "{ return convert_${tbasetype}_sat(x); }"
+        fi
+
+        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtp($fbasetype x)"
+        if test $fbasetype = "float"; then
+          echo "{ return convert_${tbasetype}_sat(__gen_ocl_rndu(x)); }"
+        else
+          echo "{ return convert_${tbasetype}_sat(x); }"
+        fi
+
+        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtn($fbasetype x)"
+        if test $fbasetype = "float"; then
+          echo "{ return convert_${tbasetype}_sat(__gen_ocl_rndd(x)); }"
+        else
+          echo "{ return convert_${tbasetype}_sat(x); }"
+        fi
+
+        continue
+      fi
+
+      for rounding in $ROUNDING_MODES; do
+        fvectortype=$fbasetype$vector_length
+        tvectortype=$tbasetype$vector_length
+        conv="convert_${tbasetype}_sat_${rounding}"
+
+        construct="$conv(v.s0)"
+        if test $vector_length -gt 1; then
+          construct="$construct, $conv(v.s1)"
+        fi
+        if test $vector_length -gt 2; then
+          construct="$construct, $conv(v.s2)"
+        fi
+        if test $vector_length -gt 3; then
+          construct="$construct, $conv(v.s3)"
+        fi
+        if test $vector_length -gt 4; then
+          construct="$construct, $conv(v.s4)"
+          construct="$construct, $conv(v.s5)"
+          construct="$construct, $conv(v.s6)"
+          construct="$construct, $conv(v.s7)"
+        fi
+        if test $vector_length -gt 8; then
+          construct="$construct, $conv(v.s8)"
+          construct="$construct, $conv(v.s9)"
+          construct="$construct, $conv(v.sA)"
+          construct="$construct, $conv(v.sB)"
+          construct="$construct, $conv(v.sC)"
+          construct="$construct, $conv(v.sD)"
+          construct="$construct, $conv(v.sE)"
+          construct="$construct, $conv(v.sF)"
+        fi
+
+        echo "INLINE OVERLOADABLE $tvectortype convert_${tvectortype}_sat_${rounding}($fvectortype v) {"
+        echo "  return ($tvectortype)($construct);"
+        echo "}"
+        echo
+      done
+    done
+  done
+done
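
The saturated conversions above all follow the same pattern: clamp the source value to the destination type's range, then narrow. A minimal self-contained C++ version of one of them, mirroring the x >= MAX ? MAX : x <= MIN ? MIN : x shape used in the generated code (function name hypothetical):

#include <iostream>

// C++ analog of a generated convert_<type>_sat helper: clamp the source
// value into the destination range, then narrow. Shown for int -> char.
static signed char convert_char_sat_example(int x) {
  const int MIN = -128, MAX = 127;
  return static_cast<signed char>(x >= MAX ? MAX : (x <= MIN ? MIN : x));
}

int main() {
  std::cout << (int)convert_char_sat_example(300)  << "\n";  // 127
  std::cout << (int)convert_char_sat_example(-300) << "\n";  // -128
  std::cout << (int)convert_char_sat_example(42)   << "\n";  // 42
  return 0;
}
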
diff --git a/backend/src/genconfig.sh b/backend/src/genconfig.sh
new file mode 100644
index 0000000..689499e
--- /dev/null
+++ b/backend/src/genconfig.sh
@@ -0,0 +1,11 @@
+#! /bin/false
+# This is to be sourced by the generation scripts
+
+# Supported base types and their lengths
+TYPES="long:8 ulong:8 int:4 uint:4 short:2 ushort:2 char:1 uchar:1 double:8 float:4"
+
+# Supported vector lengths
+VECTOR_LENGTHS="1 2 3 4 8 16"
+
+ROUNDING_MODES="rte rtz rtp rtn"
+## No user serviceable parts below here
diff --git a/backend/src/ir/constant.cpp b/backend/src/ir/constant.cpp
new file mode 100644
index 0000000..a38d392
--- /dev/null
+++ b/backend/src/ir/constant.cpp
@@ -0,0 +1,141 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file constant.cpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "constant.hpp"
+
+namespace gbe {
+namespace ir {
+
+  void ConstantSet::append(const char *data,
+                           const std::string &name,
+                           uint32_t size,
+                           uint32_t alignment)
+  {
+    const uint32_t offset = ALIGN(this->data.size(), alignment);
+    const uint32_t padding = offset - this->data.size();
+    const Constant constant(name, size, alignment, offset);
+    constants.push_back(constant);
+    for (uint32_t i = 0; i < padding; ++i) this->data.push_back(0);
+    for (uint32_t i = 0; i < size; ++i) this->data.push_back(data[i]);
+  }
+
+#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
+#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
+
+  size_t ConstantSet::serializeToBin(std::ostream& outs) {
+    size_t ret_size = 0;
+
+    OUT_UPDATE_SZ(magic_begin);
+
+    /* output the const data. */
+    OUT_UPDATE_SZ((data.size()*sizeof(char)));
+    if(data.size() > 0) {
+      outs.write(data.data(), data.size()*sizeof(char));
+      ret_size += data.size()*sizeof(char);
+    }
+
+    OUT_UPDATE_SZ(constants.size());
+    for (auto const &cnst : constants) {
+      size_t bytes = sizeof(cnst.getName().size())        //name length self
+                     + cnst.getName().size()*sizeof(char) //name
+                     + sizeof(cnst.getSize())             //size
+                     + sizeof(cnst.getAlignment())        //alignment
+                     + sizeof(cnst.getOffset());	        //offset
+      OUT_UPDATE_SZ(bytes);
+
+      OUT_UPDATE_SZ(cnst.getName().size());
+      outs.write(cnst.getName().c_str(), cnst.getName().size());
+      ret_size += sizeof(char)*cnst.getName().size();
+      OUT_UPDATE_SZ(cnst.getSize());
+      OUT_UPDATE_SZ(cnst.getAlignment());
+      OUT_UPDATE_SZ(cnst.getOffset());
+    }
+
+    OUT_UPDATE_SZ(magic_end);
+    OUT_UPDATE_SZ(ret_size);
+
+    return ret_size;
+  }
+
+  size_t ConstantSet::deserializeFromBin(std::istream& ins) {
+    size_t total_size = 0;
+    size_t global_data_sz = 0;
+    size_t const_num;
+    uint32_t magic;
+
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_begin)
+      return 0;
+
+    IN_UPDATE_SZ(global_data_sz);
+    for (size_t i = 0; i < global_data_sz; i++) {
+      char elt;
+      IN_UPDATE_SZ(elt);
+      data.push_back(elt);
+    }
+
+    IN_UPDATE_SZ(const_num);
+    for (size_t i = 0; i < const_num; i++) {
+      size_t bytes;
+      IN_UPDATE_SZ(bytes);
+
+      size_t name_len;
+      IN_UPDATE_SZ(name_len);
+
+      char* c_name = new char[name_len+1];
+      ins.read(c_name, name_len);
+      total_size += sizeof(char)*name_len;
+      c_name[name_len] = 0;
+
+      uint32_t size, align, offset;
+      IN_UPDATE_SZ(size);
+      IN_UPDATE_SZ(align);
+      IN_UPDATE_SZ(offset);
+
+      ir::Constant constant(c_name, size, align, offset);
+      constants.push_back(constant);
+
+      delete[] c_name;
+
+      /* Sanity check */
+      if (bytes != sizeof(name_len) + sizeof(char)*name_len + sizeof(size)
+              + sizeof(align) + sizeof(offset))
+        return 0;
+    }
+
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_end)
+      return 0;
+
+    size_t total_bytes;
+    IN_UPDATE_SZ(total_bytes);
+    if (total_bytes + sizeof(total_size) != total_size)
+      return 0;
+
+    return total_size;
+  }
+
+} /* namespace ir */
+} /* namespace gbe */
+
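
The serializer above writes magic_begin, the size of the constant data followed by the data itself, the number of constants, one length-prefixed record per constant (record byte count, name length, name, size, alignment, offset), and finally magic_end plus the total byte count; the deserializer checks each of these markers on the way back in and returns 0 on any mismatch. A much-simplified C++ sketch of that magic-delimited, length-prefixed record style (the field layout here is illustrative only, not beignet's actual binary format):

#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>

// Simplified sketch in the spirit of ConstantSet::serializeToBin /
// deserializeFromBin: a record bracketed by magic numbers, with a
// length-prefixed name in the middle.
static const uint32_t MAGIC_BEGIN = 0x54534E43; // bytes 'C','N','S','T' (little-endian)
static const uint32_t MAGIC_END   = 0x434E5354; // bytes 'T','S','N','C' (little-endian)

static void write_record(std::ostream &os, const std::string &name, uint32_t size) {
  uint32_t name_len = name.size();
  os.write(reinterpret_cast<const char*>(&MAGIC_BEGIN), sizeof(MAGIC_BEGIN));
  os.write(reinterpret_cast<const char*>(&name_len), sizeof(name_len));
  os.write(name.data(), name_len);
  os.write(reinterpret_cast<const char*>(&size), sizeof(size));
  os.write(reinterpret_cast<const char*>(&MAGIC_END), sizeof(MAGIC_END));
}

static bool read_record(std::istream &is, std::string &name, uint32_t &size) {
  uint32_t magic = 0, name_len = 0;
  is.read(reinterpret_cast<char*>(&magic), sizeof(magic));
  if (magic != MAGIC_BEGIN) return false;     // corrupt or wrong format
  is.read(reinterpret_cast<char*>(&name_len), sizeof(name_len));
  name.resize(name_len);
  is.read(&name[0], name_len);
  is.read(reinterpret_cast<char*>(&size), sizeof(size));
  is.read(reinterpret_cast<char*>(&magic), sizeof(magic));
  return magic == MAGIC_END;                  // trailing magic must match
}

int main() {
  std::stringstream ss;
  write_record(ss, "my_constant", 16);
  std::string name; uint32_t size = 0;
  if (read_record(ss, name, size))
    std::cout << name << " size=" << size << "\n";
  return 0;
}
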
diff --git a/backend/src/ir/constant.hpp b/backend/src/ir/constant.hpp
new file mode 100644
index 0000000..70d09aa
--- /dev/null
+++ b/backend/src/ir/constant.hpp
@@ -0,0 +1,134 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file constant.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_CONSTANT_HPP__
+#define __GBE_IR_CONSTANT_HPP__
+
+#include "sys/vector.hpp"
+
+namespace gbe {
+namespace ir {
+
+  /*! Describe one constant (may be a scalar or an array) */
+  class Constant
+  {
+  public:
+    /*! Build a constant description */
+    INLINE Constant(const std::string &name, uint32_t size, uint32_t alignment, uint32_t offset) :
+      name(name), size(size), alignment(alignment), offset(offset) {}
+    /*! Copy constructor */
+    INLINE Constant(const Constant &other) :
+      name(other.name), size(other.size), alignment(other.alignment), offset(other.offset) {}
+    /*! Copy operator */
+    INLINE Constant& operator= (const Constant &other) {
+      this->name = other.name;
+      this->size = other.size;
+      this->alignment = other.alignment;
+      this->offset = other.offset;
+      return *this;
+    }
+    /*! Nothing happens here */
+    INLINE ~Constant(void) {}
+    const std::string& getName(void) const { return name; }
+    uint32_t getSize (void) const { return size; }
+    uint32_t getAlignment (void) const { return alignment; }
+    uint32_t getOffset(void) const { return offset; }
+  private:
+    std::string name; //!< Optional name of the constant
+    uint32_t size;      //!< Size of the constant
+    uint32_t alignment; //!< Alignment required for each constant
+    uint32_t offset;    //!< Offset of the constant in the data segment
+    GBE_CLASS(Constant);
+  };
+
+  /*! A constant set is a set of immutable data associated to a compilation
+   *  unit
+   */
+  class ConstantSet : public Serializable
+  {
+  public:
+    /*! Append a new constant in the constant set */
+    void append(const char*, const std::string&, uint32_t size, uint32_t alignment);
+    /*! Number of constants */
+    size_t getConstantNum(void) const { return constants.size(); }
+    /*! Get a special constant */
+    Constant& getConstant(size_t i) { return constants[i]; }
+    /*! Get a special constant */
+    Constant& getConstant(const std::string & name) {
+      for (auto & c : constants) {
+        if (c.getName() == name)
+          return c;
+      }
+      GBE_ASSERT(false);
+      return *(Constant *)nullptr;
+    }
+    /*! Number of bytes of serialized constant data */
+    size_t getDataSize(void) const { return data.size(); }
+    /*! Store serialized constant data into an array */
+    void getData(char *mem) const {
+      for (size_t i = 0; i < data.size(); i ++)
+        mem[i] = data[i];
+    }
+    ConstantSet() {}
+    ConstantSet(const ConstantSet& other) : Serializable(other),
+                data(other.data), constants(other.constants) {}
+    ConstantSet & operator = (const ConstantSet& other) {
+      if (&other != this) {
+        data = other.data;
+        constants = other.constants;
+      }
+      return *this;
+    }
+
+    static const uint32_t magic_begin = TO_MAGIC('C', 'N', 'S', 'T');
+    static const uint32_t magic_end = TO_MAGIC('T', 'S', 'N', 'C');
+
+    /* format:
+       magic_begin     |
+       const_data_size |
+       const_data      |
+       constant_1_size |
+       constant_1      |
+       ........        |
+       constant_n_size |
+       constant_n      |
+       magic_end       |
+       total_size
+    */
+
+    /*! Implements the serialization. */
+    virtual size_t serializeToBin(std::ostream& outs);
+    virtual size_t deserializeFromBin(std::istream& ins);
+
+  private:
+    vector<char> data;         //!< The constant data serialized in one array
+    vector<Constant> constants;//!< Each constant description
+    GBE_CLASS(ConstantSet);
+  };
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_CONSTANT_HPP__ */
+
diff --git a/backend/src/ir/context.cpp b/backend/src/ir/context.cpp
new file mode 100644
index 0000000..1528a8d
--- /dev/null
+++ b/backend/src/ir/context.cpp
@@ -0,0 +1,182 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file context.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "ir/context.hpp"
+#include "ir/unit.hpp"
+#include "ir/lowering.hpp"
+
+namespace gbe {
+namespace ir {
+
+  Context::Context(Unit &unit) :
+    unit(unit), fn(NULL), bb(NULL), usedLabels(NULL) {}
+
+  Context::~Context(void) {
+    for (const auto &elem : fnStack) GBE_SAFE_DELETE(elem.usedLabels);
+    GBE_SAFE_DELETE(usedLabels);
+  }
+
+  Function &Context::getFunction(void) {
+    GBE_ASSERTM(fn != NULL, "No function currently defined");
+    return *fn;
+  }
+
+  void Context::appendPushedConstant(Register reg, const PushLocation &pushed)
+  {
+    GBE_ASSERTM(fn != NULL, "No function currently defined");
+    GBE_ASSERTM(fn->pushMap.contains(reg) == false, "Register already pushed");
+    fn->pushMap.insert(std::make_pair(reg, pushed));
+    fn->locationMap.insert(std::make_pair(pushed, reg));
+  }
+
+  void Context::startFunction(const std::string &name) {
+    fnStack.push_back(StackElem(fn,bb,usedLabels));
+    fn = unit.newFunction(name);
+    usedLabels = GBE_NEW_NO_ARG(vector<uint8_t>);
+    bb = NULL;
+  }
+
+  void Context::endFunction(void) {
+    GBE_ASSERTM(fn != NULL, "No function to end");
+    GBE_ASSERT(fnStack.size() != 0);
+    GBE_ASSERT(usedLabels != NULL);
+
+    // Empty function -> append a return
+    if (fn->blockNum() == 0) this->RET();
+
+    // Check first that all branch instructions point to valid labels
+    GBE_ASSERT(usedLabels);
+#if GBE_DEBUG
+    for (auto usage : *usedLabels)
+      GBE_ASSERTM(usage != LABEL_IS_POINTED, "A label is used and not defined");
+#endif /* GBE_DEBUG */
+    GBE_DELETE(usedLabels);
+
+    // Remove all returns and insert one unique return block at the end of the
+    // function
+    lowerReturn(unit, fn->getName());
+    // check if there is empty labels at first
+    fn->checkEmptyLabels();
+    // Properly order labels and compute the CFG; this is needed by lowerFunctionArguments
+    fn->sortLabels();
+    fn->computeCFG();
+
+    // Spill function argument to the stack if required and identify which
+    // function arguments can use constant push
+    lowerFunctionArguments(unit, fn->getName());
+
+    const StackElem elem = fnStack.back();
+    fnStack.pop_back();
+    fn = elem.fn;
+    bb = elem.bb;
+    usedLabels = elem.usedLabels;
+  }
+
+  Register Context::reg(RegisterFamily family, bool uniform) {
+    GBE_ASSERTM(fn != NULL, "No function currently defined");
+    return fn->newRegister(family, uniform);
+  }
+
+  LabelIndex Context::label(void) {
+    GBE_ASSERTM(fn != NULL, "No function currently defined");
+    const LabelIndex index = fn->newLabel();
+    if (index >= usedLabels->size()) {
+      usedLabels->resize(index + 1);
+      (*usedLabels)[index] = 0;
+    }
+    return index;
+  }
+
+  void Context::input(const std::string &name, FunctionArgument::Type type, Register reg,
+                      FunctionArgument::InfoFromLLVM& info, uint32_t elementSize, uint32_t align, unsigned char bti) {
+    GBE_ASSERTM(fn != NULL, "No function currently defined");
+    GBE_ASSERTM(reg < fn->file.regNum(), "Out-of-bound register");
+    FunctionArgument *arg = GBE_NEW(FunctionArgument, type, reg, elementSize, name, align, info, bti);
+    fn->args.push_back(arg);
+  }
+
+  void Context::output(Register reg) {
+    GBE_ASSERTM(fn != NULL, "No function currently defined");
+    GBE_ASSERTM(reg < fn->file.regNum(), "Out-of-bound register");
+    fn->outputs.push_back(reg);
+  }
+
+  void Context::startBlock(void) {
+    GBE_ASSERTM(fn != NULL, "No function currently defined");
+    this->bb = GBE_NEW(BasicBlock, *fn);
+    fn->blocks.push_back(bb);
+  }
+
+  void Context::endBlock(void) {
+    this->bb = NULL;
+  }
+
+  void Context::append(const Instruction &insn) {
+    GBE_ASSERTM(fn != NULL, "No function currently defined");
+
+    // Start a new block if this is a label
+    if (insn.isMemberOf<LabelInstruction>() == true) {
+      this->endBlock();
+      this->startBlock();
+      const LabelIndex index = cast<LabelInstruction>(insn).getLabelIndex();
+      GBE_ASSERTM(index < fn->labelNum(), "Out-of-bound label");
+      GBE_ASSERTM(fn->labels[index] == NULL, "Label used in a previous block");
+      fn->labels[index] = bb;
+
+      // Now the label index is properly defined
+      GBE_ASSERT(index < usedLabels->size());
+      (*usedLabels)[index] |= LABEL_IS_DEFINED;
+    }
+    // We create a new label for a new block if the user did not do it
+    else if (bb == NULL) {
+      // this->startBlock();
+      const LabelIndex index = this->label();
+      const Instruction insn = ir::LABEL(index);
+      this->append(insn);
+    }
+
+    // Append the instruction in the stream
+    Instruction *insnPtr = fn->newInstruction(insn);
+    bb->append(*insnPtr);
+#if GBE_DEBUG
+    std::string whyNot;
+    if(getUnit().getValid())
+      GBE_ASSERTM(insnPtr->wellFormed(whyNot), whyNot.c_str());
+#endif /* GBE_DEBUG */
+
+    // Close the current block if this is a branch
+    if (insn.isMemberOf<BranchInstruction>() == true) {
+      // We must record the fact that the label is used
+      if (insn.getOpcode() == OP_BRA) {
+        const BranchInstruction &branch = cast<BranchInstruction>(insn);
+        const LabelIndex index = branch.getLabelIndex();
+        GBE_ASSERT(index < usedLabels->size());
+        (*usedLabels)[index] |= LABEL_IS_POINTED;
+      }
+      this->endBlock();
+    }
+  }
+
+} /* namespace ir */
+} /* namespace gbe */
+
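+/* A minimal construction sketch (illustrative only; the helper and the kernel
+ * name "sketch_mad" are not part of the upstream sources, and a properly
+ * constructed Unit is assumed). The usual flow is startFunction(), instruction
+ * emission, then endFunction(), which appends the unique return block, sorts
+ * labels and computes the CFG. The first appended instruction implicitly opens
+ * a labeled basic block, as append() above shows. */
+static void buildMadFunction(gbe::ir::Unit &unit)
+{
+  using namespace gbe::ir;
+  Context ctx(unit);
+  ctx.startFunction("sketch_mad");
+  const Register dst  = ctx.reg(FAMILY_DWORD);   // destination register
+  const Register src0 = ctx.immReg(int32_t(2));  // LOADI 2 into a fresh register
+  const Register src1 = ctx.immReg(int32_t(3));  // LOADI 3
+  const Register src2 = ctx.immReg(int32_t(4));  // LOADI 4
+  ctx.MAD(TYPE_S32, dst, src0, src1, src2);      // three-source multiply-add
+  ctx.RET();
+  ctx.endFunction();
+}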
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
new file mode 100644
index 0000000..cd09413
--- /dev/null
+++ b/backend/src/ir/context.hpp
@@ -0,0 +1,252 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file context.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_CONTEXT_HPP__
+#define __GBE_IR_CONTEXT_HPP__
+
+#include "ir/instruction.hpp"
+#include "ir/function.hpp"
+#include "ir/register.hpp"
+#include "ir/immediate.hpp"
+#include "ir/unit.hpp"
+#include "sys/vector.hpp"
+#include <tuple>
+
+namespace gbe {
+namespace ir {
+
+  /*! A context allows easy creation of functions (an instruction stream plus
+   *  the set of immediates and registers needed for it) and constant arrays
+   */
+  class Context
+  {
+  public:
+    /*! Create a new context for this unit */
+    Context(Unit &unit);
+    /*! Free resources needed by context */
+    virtual ~Context(void);
+    /*! Create a new function "name" */
+    void startFunction(const std::string &name);
+    /*! Close the function */
+    void endFunction(void);
+    /*! Get the current processed unit */
+    INLINE Unit &getUnit(void) { return unit; }
+    /*! Get the current processed function */
+    Function &getFunction(void);
+    /*! Get the current processed block */
+    BasicBlock *getBlock(void) { return bb; }
+    /*! Set the SIMD width of the function */
+    void setSimdWidth(uint32_t width) const {
+      GBE_ASSERT(width == 8 || width == 16);
+      fn->simdWidth = width;
+    }
+    /*! Append a new pushed constant */
+    void appendPushedConstant(Register reg, const PushLocation &pushed);
+    /*! Create a new register with the given family for the current function */
+    Register reg(RegisterFamily family, bool uniform = false);
+    /*! Create a new immediate value */
+    template <typename T> INLINE ImmediateIndex newImmediate(T value) {
+      const Immediate imm(value);
+      return fn->newImmediate(imm);
+    }
+    template <typename T> INLINE ImmediateIndex newImmediate(T value, uint32_t num) {
+      const Immediate imm(value, num);
+      return fn->newImmediate(imm);
+    }
+    /*! Create a new immediate value */
+    INLINE ImmediateIndex newImmediate(vector<ImmediateIndex>indexVector) {
+      vector<const Immediate*> immVector;
+      for( uint32_t i = 0; i < indexVector.size(); i++)
+        immVector.push_back(&fn->getImmediate(indexVector[i]));
+      const Immediate imm(immVector);
+      return fn->newImmediate(imm);
+    }
+    /*! Create an integer immediate value */
+    INLINE ImmediateIndex newIntegerImmediate(int64_t x, Type type) {
+      switch (type) {
+        case TYPE_S8: return this->newImmediate(int8_t(x));
+        case TYPE_U8: return this->newImmediate(uint8_t(x));
+        case TYPE_S16: return this->newImmediate(int16_t(x));
+        case TYPE_U16: return this->newImmediate(uint16_t(x));
+        case TYPE_S32: return this->newImmediate(int32_t(x));
+        case TYPE_U32: return this->newImmediate(uint32_t(x));
+        case TYPE_S64: return this->newImmediate(int64_t(x));
+        case TYPE_U64: return this->newImmediate(uint64_t(x));
+        default: NOT_SUPPORTED; return ImmediateIndex(0);
+      }
+      return ImmediateIndex(0);
+    }
+    INLINE ImmediateIndex newFloatImmediate(float x) {
+      return this->newImmediate(x);
+    }
+    INLINE ImmediateIndex newDoubleImmediate(double x) {
+      return this->newImmediate(x);
+    }
+
+    INLINE ImmediateIndex processImm(ImmOpCode op, ImmediateIndex src, Type type) {
+      const Immediate &imm = fn->getImmediate(src);
+      const Immediate &dstImm = Immediate(op, imm, type);
+      return fn->newImmediate(dstImm);
+    }
+
+    INLINE ImmediateIndex processImm(ImmOpCode op, ImmediateIndex src0,
+                                     ImmediateIndex src1, Type type) {
+      const Immediate &imm0 = fn->getImmediate(src0);
+      const Immediate &imm1 = fn->getImmediate(src1);
+      const Immediate &dstImm = Immediate(op, imm0, imm1, type);
+      return fn->newImmediate(dstImm);
+    }
+
+    /*! Set an immediate value */
+    template <typename T> INLINE void setImmediate(ImmediateIndex index, T value) {
+      const Immediate imm(value);
+      fn->immediates[index] = imm;
+    }
+    /*! Create a new register holding the given value. A LOADI is pushed */
+    template <typename T> INLINE Register immReg(T value) {
+      GBE_ASSERTM(fn != NULL, "No function currently defined");
+      const Immediate imm(value);
+      const ImmediateIndex index = fn->newImmediate(imm);
+      const RegisterFamily family = getFamily(imm.getType());
+      const Register reg = this->reg(family);
+      this->LOADI(imm.getType(), reg, index);
+      return reg;
+    }
+    /*! Create a new label for the current function */
+    LabelIndex label(void);
+    /*! Append a new input register for the function */
+    void input(const std::string &name, FunctionArgument::Type type, Register reg,
+               FunctionArgument::InfoFromLLVM& info, uint32_t elemSz = 0u, uint32_t align = 0, uint8_t bti = 0);
+    /*! Append a new output register for the function */
+    void output(Register reg);
+    /*! Get the immediate value */
+    INLINE Immediate getImmediate(ImmediateIndex index) const {
+      return fn->getImmediate(index);
+    }
+    /*! Append a new tuple */
+    template <typename... Args> INLINE Tuple tuple(Args...args) {
+      GBE_ASSERTM(fn != NULL, "No function currently defined");
+      return fn->file.appendTuple(args...);
+    }
+    /*! Make a tuple from an array of register */
+    INLINE Tuple arrayTuple(const Register *reg, uint32_t regNum) {
+      GBE_ASSERTM(fn != NULL, "No function currently defined");
+      return fn->file.appendArrayTuple(reg, regNum);
+    }
+    /*! We just use variadic templates to forward instruction functions */
+#define DECL_INSN(NAME, FAMILY) \
+    template <typename... Args> INLINE void NAME(Args...args);
+#include "ir/instruction.hxx"
+#undef DECL_INSN
+    /*! Return the pointer size handled by the unit */
+    INLINE PointerSize getPointerSize(void) const {
+      return unit.getPointerSize();
+    }
+    /*! Return the family of registers that contain pointer */
+    INLINE RegisterFamily getPointerFamily(void) const {
+      return unit.getPointerFamily();
+    }
+#define DECL_THREE_SRC_INSN(NAME) \
+    INLINE void NAME(Type type, \
+                     Register dst, \
+                     Register src0, \
+                     Register src1, \
+                     Register src2) \
+    { \
+      const Tuple index = this->tuple(src0, src1, src2); \
+      this->NAME(type, dst, index); \
+    }
+    DECL_THREE_SRC_INSN(SEL);
+    DECL_THREE_SRC_INSN(I64MADSAT);
+    DECL_THREE_SRC_INSN(MAD);
+#undef DECL_THREE_SRC_INSN
+
+    /*! For all unary functions */
+    void ALU1(Opcode opcode, Type type, Register dst, Register src) {
+      const Instruction insn = gbe::ir::ALU1(opcode, type, dst, src);
+      this->append(insn);
+    }
+
+    /*! LOAD with the destinations directly specified */
+    template <typename... Args>
+    void LOAD(Type type, Register offset, AddressSpace space, bool dwAligned, BTI bti, Args...values)
+    {
+      const Tuple index = this->tuple(values...);
+      const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
+      GBE_ASSERT(valueNum > 0);
+      this->LOAD(type, index, offset, space, valueNum, dwAligned, bti);
+    }
+
+    /*! STORE with the sources directly specified */
+    template <typename... Args>
+    void STORE(Type type, Register offset, AddressSpace space, bool dwAligned, BTI bti, Args...values)
+    {
+      const Tuple index = this->tuple(values...);
+      const uint16_t valueNum = std::tuple_size<std::tuple<Args...>>::value;
+      GBE_ASSERT(valueNum > 0);
+      this->STORE(type, index, offset, space, valueNum, dwAligned, bti);
+    }
+    void appendSurface(uint8_t bti, Register reg) { fn->appendSurface(bti, reg); }
+
+  protected:
+    /*! A block must be started with a label */
+    void startBlock(void);
+    /*! A block must be ended with a branch */
+    void endBlock(void);
+    /*! Append the instruction in the current basic block */
+    void append(const Instruction &insn);
+    Unit &unit;                 //!< A unit is associated to a context
+    Function *fn;               //!< Current function we are processing
+    BasicBlock *bb;             //!< Current basic block we are filling
+    static const uint8_t LABEL_IS_POINTED = 1 << 0; //!< A branch points to it
+    static const uint8_t LABEL_IS_DEFINED = 1 << 1; //!< A label instruction defines it
+    vector<uint8_t> *usedLabels;
+    /*! Functions can be defined recursively */
+    struct StackElem {
+      INLINE StackElem(Function *fn, BasicBlock *bb, vector<uint8_t> *usedLabels)
+        : fn(fn), bb(bb), usedLabels(usedLabels)
+      {}
+      Function *fn;                //!< Function to process
+      BasicBlock *bb;              //!< Basic block currently processed
+      vector<uint8_t> *usedLabels; //!< Store all labels that are defined
+    };
+    vector<StackElem> fnStack;     //!< Stack of functions still to finish
+    GBE_CLASS(Context);
+  };
+
+  // Use argument checker to assert argument value correctness
+#define DECL_INSN(NAME, FAMILY) \
+  template <typename... Args> \
+  INLINE void Context::NAME(Args...args) { \
+    GBE_ASSERTM(fn != NULL, "No function currently defined"); \
+    const Instruction insn = gbe::ir::NAME(args...); \
+    this->append(insn); \
+  }
+#include "ir/instruction.hxx"
+#undef DECL_INSN
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_CONTEXT_HPP__ */
+
diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
new file mode 100644
index 0000000..85e7934
--- /dev/null
+++ b/backend/src/ir/function.cpp
@@ -0,0 +1,359 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file function.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "ir/function.hpp"
+#include "ir/unit.hpp"
+#include "sys/map.hpp"
+
+namespace gbe {
+namespace ir {
+
+  ///////////////////////////////////////////////////////////////////////////
+  // PushLocation
+  ///////////////////////////////////////////////////////////////////////////
+
+  Register PushLocation::getRegister(void) const {
+    const Function::LocationMap &locationMap = fn.getLocationMap();
+    GBE_ASSERT(locationMap.contains(*this) == true);
+    return locationMap.find(*this)->second;
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Function
+  ///////////////////////////////////////////////////////////////////////////
+
+  Function::Function(const std::string &name, const Unit &unit, Profile profile) :
+    name(name), unit(unit), profile(profile), simdWidth(0), useSLM(false), slmSize(0), stackSize(0)
+  {
+    initProfile(*this);
+    samplerSet = GBE_NEW(SamplerSet);
+    imageSet = GBE_NEW(ImageSet);
+    printfSet = GBE_NEW(PrintfSet);
+  }
+
+  Function::~Function(void) {
+    for (auto block : blocks) GBE_DELETE(block);
+    for (auto loop : loops) GBE_DELETE(loop);
+    for (auto arg : args) GBE_DELETE(arg);
+  }
+
+  RegisterFamily Function::getPointerFamily(void) const {
+    return unit.getPointerFamily();
+  }
+
+  void Function::addLoop(const vector<LabelIndex> &bbs, const vector<std::pair<LabelIndex, LabelIndex>> &exits) {
+    loops.push_back(GBE_NEW(Loop, bbs, exits));
+  }
+
+  void Function::checkEmptyLabels(void) {
+    // Empty label map, we map the removed label to the next label.
+    map<LabelIndex, LabelIndex> labelMap;
+    map<LabelIndex, LabelIndex> revLabelMap;
+    foreachBlock([&](BasicBlock &BB) {
+      Instruction * insn = BB.getLastInstruction();
+      if (insn->getOpcode() == OP_LABEL) {
+        GBE_ASSERTM(0, "Found empty block. ");
+      }
+    });
+  }
+
+  void Function::sortLabels(void) {
+    uint32_t last = 0;
+
+    // Compute the new labels and patch the label instruction
+    map<LabelIndex, LabelIndex> labelMap;
+    foreachInstruction([&](Instruction &insn) {
+      if (insn.getOpcode() != OP_LABEL) return;
+
+      // Create the new label
+      const Instruction newLabel = LABEL(LabelIndex(last));
+
+      // Replace the previous label instruction
+      LabelInstruction &label = cast<LabelInstruction>(insn);
+      const LabelIndex index = label.getLabelIndex();
+      labelMap.insert(std::make_pair(index, LabelIndex(last++)));
+      newLabel.replace(&insn);
+    });
+
+    // Patch all branch instructions with the new labels
+    foreachInstruction([&](Instruction &insn) {
+      if (insn.getOpcode() != OP_BRA) return;
+
+      // Get the current branch instruction
+      BranchInstruction &bra = cast<BranchInstruction>(insn);
+      const LabelIndex index = bra.getLabelIndex();
+      const LabelIndex newIndex = labelMap.find(index)->second;
+
+      // Insert the patched branch instruction
+      if (bra.isPredicated() == true) {
+        const Instruction newBra = BRA(newIndex, bra.getPredicateIndex());
+        newBra.replace(&insn);
+      } else {
+        const Instruction newBra = BRA(newIndex);
+        newBra.replace(&insn);
+      }
+    });
+
+    // fix labels for loops
+    for (auto &x : loops) {
+      for (auto &y : x->bbs)
+        y = labelMap[y];
+
+      for (auto &z : x->exits) {
+        z.first = labelMap[z.first];
+        z.second = labelMap[z.second];
+      }
+    }
+
+    // Reset the label to block mapping
+    this->labels.resize(last);
+    foreachBlock([&](BasicBlock &bb) {
+      const Instruction *first = bb.getFirstInstruction();
+      const LabelInstruction *label = cast<LabelInstruction>(first);
+      const LabelIndex index = label->getLabelIndex();
+      this->labels[index] = &bb;
+    });
+  }
+
+  LabelIndex Function::newLabel(void) {
+    GBE_ASSERTM(labels.size() < 0xffff,
+                "Too many labels are defined (only 65536 are supported)");
+    const LabelIndex index(labels.size());
+    labels.push_back(NULL);
+    return index;
+  }
+
+  void Function::outImmediate(std::ostream &out, ImmediateIndex index) const {
+    GBE_ASSERT(index < immediates.size());
+    const Immediate imm = immediates[index];
+    switch (imm.getType()) {
+      case TYPE_BOOL: out << !!imm.getIntegerValue(); break;
+      case TYPE_S8:
+      case TYPE_U8:
+      case TYPE_S16:
+      case TYPE_U16:
+      case TYPE_S32:
+      case TYPE_U32:
+      case TYPE_S64: out << imm.getIntegerValue(); break;
+      case TYPE_U64: out << (uint64_t)imm.getIntegerValue(); break;
+      case TYPE_HALF: out << "half(" << imm.getIntegerValue() << ")"; break;
+      case TYPE_FLOAT: out << imm.getFloatValue(); break;
+      case TYPE_DOUBLE: out << imm.getDoubleValue(); break;
+      default:
+        GBE_ASSERT(0 && "unsupported imm type.\n");
+    }
+  }
+
+  uint32_t Function::getLargestBlockSize(void) const {
+    uint32_t insnNum = 0;
+    foreachBlock([&insnNum](const ir::BasicBlock &bb) {
+      insnNum = std::max(insnNum, uint32_t(bb.size()));
+    });
+    return insnNum;
+  }
+
+  uint32_t Function::getFirstSpecialReg(void) const {
+    return this->profile == PROFILE_OCL ? 0u : ~0u;
+  }
+
+  uint32_t Function::getSpecialRegNum(void) const {
+    return this->profile == PROFILE_OCL ? ocl::regNum : ~0u;
+  }
+
+  bool Function::isEntryBlock(const BasicBlock &bb) const {
+    if (this->blockNum() == 0)
+      return false;
+    else
+      return &bb == this->blocks[0];
+  }
+
+  const BasicBlock &Function::getTopBlock(void) const {
+    GBE_ASSERT(blockNum() > 0 && blocks[0] != NULL);
+    return *blocks[0];
+  }
+
+  const BasicBlock &Function::getBottomBlock(void) const {
+    const uint32_t n = blockNum();
+    GBE_ASSERT(n > 0 && blocks[n-1] != NULL);
+    return *blocks[n-1];
+  }
+
+  BasicBlock &Function::getBottomBlock(void) {
+    const uint32_t n = blockNum();
+    GBE_ASSERT(n > 0 && blocks[n-1] != NULL);
+    return *blocks[n-1];
+  }
+
+  const BasicBlock &Function::getBlock(LabelIndex label) const {
+    GBE_ASSERT(label < labelNum() && labels[label] != NULL);
+    return *labels[label];
+  }
+
+  const LabelInstruction *Function::getLabelInstruction(LabelIndex index) const {
+    const BasicBlock *bb = this->labels[index];
+    const Instruction *first = bb->getFirstInstruction();
+    return cast<LabelInstruction>(first);
+  }
+
+  /*! Indicate if the given register is a special one (like localID in OCL) */
+  bool Function::isSpecialReg(const Register &reg) const {
+    const uint32_t ID = uint32_t(reg);
+    const uint32_t firstID = this->getFirstSpecialReg();
+    const uint32_t specialNum = this->getSpecialRegNum();
+    return ID >= firstID && ID < firstID + specialNum;
+  }
+  Register Function::getSurfaceBaseReg(uint8_t bti) const {
+    map<uint8_t, Register>::const_iterator iter = btiRegMap.find(bti);
+    GBE_ASSERT(iter != btiRegMap.end());
+    return iter->second;
+  }
+
+  void Function::appendSurface(uint8_t bti, Register reg) {
+    btiRegMap.insert(std::make_pair(bti, reg));
+  }
+
+  void Function::computeCFG(void) {
+    // Clear possible previously computed CFG and compute the direct
+    // predecessors and successors
+    BasicBlock *prev = NULL;
+    this->foreachBlock([this, &prev](BasicBlock &bb) {
+      bb.successors.clear();
+      bb.predecessors.clear();
+      if (prev != NULL) {
+        prev->nextBlock = &bb;
+        bb.prevBlock = prev;
+      }
+      prev = &bb;
+    });
+
+    // Update it. Do not forget that a branch can also jump to the next block
+    BasicBlock *jumpToNext = NULL;
+    this->foreachBlock([this, &jumpToNext](BasicBlock &bb) {
+      if (jumpToNext) {
+        jumpToNext->successors.insert(&bb);
+        bb.predecessors.insert(jumpToNext);
+        jumpToNext = NULL;
+      }
+      if (bb.size() == 0) return;
+      Instruction *last = bb.getLastInstruction();
+      if (last->isMemberOf<BranchInstruction>() == false) {
+        jumpToNext = &bb;
+        return;
+      }
+      const BranchInstruction &insn = cast<BranchInstruction>(*last);
+      if (insn.getOpcode() == OP_BRA) {
+        const LabelIndex label = insn.getLabelIndex();
+        BasicBlock *target = this->blocks[label];
+        GBE_ASSERT(target != NULL);
+        target->predecessors.insert(&bb);
+        bb.successors.insert(target);
+        if ( insn.isPredicated() == true) jumpToNext = &bb;
+      }
+    });
+  }
+
+  std::ostream &operator<< (std::ostream &out, const Function &fn)
+  {
+    out << ".decl_function " << fn.getName() << std::endl;
+    out << fn.getRegisterFile();
+    out << "## " << fn.argNum() << " input register"
+        << (fn.argNum() ? "s" : "") << " ##" << std::endl;
+    for (uint32_t i = 0; i < fn.argNum(); ++i) {
+      const FunctionArgument &input = fn.getArg(i);
+      out << "decl_input.";
+      switch (input.type) {
+        case FunctionArgument::GLOBAL_POINTER: out << "global"; break;
+        case FunctionArgument::LOCAL_POINTER: out << "local"; break;
+        case FunctionArgument::CONSTANT_POINTER: out << "constant"; break;
+        case FunctionArgument::VALUE: out << "value"; break;
+        case FunctionArgument::STRUCTURE:
+          out << "structure." << input.size;
+        break;
+        case FunctionArgument::IMAGE: out << "image"; break;
+        default: break;
+      }
+      out << " %" << input.reg << " " << input.name << std::endl;
+    }
+    out << "## " << fn.outputNum() << " output register"
+        << (fn.outputNum() ? "s" : "") << " ##" << std::endl;
+    for (uint32_t i = 0; i < fn.outputNum(); ++i)
+      out << "decl_output %" << fn.getOutput(i) << std::endl;
+    out << "## " << fn.pushedNum() << " pushed register" << std::endl;
+    const Function::PushMap &pushMap = fn.getPushMap();
+    for (const auto &pushed : pushMap) {
+      out << "decl_pushed %" << pushed.first
+           << " @{" << pushed.second.argID << ","
+           << pushed.second.offset << "}" << std::endl;
+    }
+    out << "## " << fn.blockNum() << " block"
+        << (fn.blockNum() ? "s" : "") << " ##" << std::endl;
+    fn.foreachBlock([&](const BasicBlock &bb) {
+      const_cast<BasicBlock&>(bb).foreach([&out] (const Instruction &insn) {
+        out << insn << std::endl;
+      });
+      out << std::endl;
+    });
+    out << ".end_function" << std::endl;
+    return out;
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Basic Block
+  ///////////////////////////////////////////////////////////////////////////
+
+  BasicBlock::BasicBlock(Function &fn) : fn(fn) {
+    this->nextBlock = this->prevBlock = NULL;
+  }
+
+  BasicBlock::~BasicBlock(void) {
+    this->foreach([this] (Instruction &insn) {
+     this->fn.deleteInstruction(&insn);
+    });
+  }
+
+  void BasicBlock::append(Instruction &insn) {
+    insn.setParent(this);
+    this->push_back(&insn);
+  }
+
+  Instruction *BasicBlock::getFirstInstruction(void) const {
+    GBE_ASSERT(this->begin() != this->end());
+    const Instruction &insn = *this->begin();
+    return const_cast<Instruction*>(&insn);
+  }
+
+  Instruction *BasicBlock::getLastInstruction(void) const {
+    GBE_ASSERT(this->begin() != this->end());
+    const Instruction &insn = *(--this->end());
+    return const_cast<Instruction*>(&insn);
+  }
+
+  LabelIndex BasicBlock::getLabelIndex(void) const {
+    const Instruction *first = this->getFirstInstruction();
+    const LabelInstruction *label = cast<LabelInstruction>(first);
+    return label->getLabelIndex();
+  }
+
+} /* namespace ir */
+} /* namespace gbe */
+
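+/* An illustrative, file-local helper (not part of the upstream sources):
+ * foreachInstruction() visits every instruction of every basic block, so
+ * simple whole-function queries such as counting branch instructions need
+ * no explicit iteration over blocks. */
+static uint32_t countBranches(const gbe::ir::Function &fn)
+{
+  uint32_t branchNum = 0;
+  fn.foreachInstruction([&branchNum](const gbe::ir::Instruction &insn) {
+    if (insn.getOpcode() == gbe::ir::OP_BRA) ++branchNum;
+  });
+  return branchNum;
+}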
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
new file mode 100644
index 0000000..9aa1e8d
--- /dev/null
+++ b/backend/src/ir/function.hpp
@@ -0,0 +1,400 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file function.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_FUNCTION_HPP__
+#define __GBE_IR_FUNCTION_HPP__
+
+#include "ir/immediate.hpp"
+#include "ir/register.hpp"
+#include "ir/instruction.hpp"
+#include "ir/profile.hpp"
+#include "ir/sampler.hpp"
+#include "ir/printf.hpp"
+#include "ir/image.hpp"
+#include "sys/vector.hpp"
+#include "sys/set.hpp"
+#include "sys/map.hpp"
+#include "sys/alloc.hpp"
+
+#include <ostream>
+
+namespace gbe {
+namespace ir {
+
+  /*! Commonly used in the CFG */
+  typedef set<BasicBlock*> BlockSet;
+  class Unit; // Function belongs to a unit
+
+  /*! Function basic blocks really belong to a function since:
+   *  1 - registers used in the basic blocks belong to the function register
+   *      file
+   *  2 - branches point to basic blocks of the same function
+   */
+  class BasicBlock : public NonCopyable, public intrusive_list<Instruction>
+  {
+  public:
+    /*! Empty basic block */
+    BasicBlock(Function &fn);
+    /*! Releases all the instructions */
+    ~BasicBlock(void);
+    /*! Append a new instruction at the end of the stream */
+    void append(Instruction &insn);
+    /*! Get the parent function */
+    Function &getParent(void) { return fn; }
+    const Function &getParent(void) const { return fn; }
+    /*! Get the next and previous allocated block */
+    BasicBlock *getNextBlock(void) const { return this->nextBlock; }
+    BasicBlock *getPrevBlock(void) const { return this->prevBlock; }
+    /*! Get / set the first and last instructions */
+    Instruction *getFirstInstruction(void) const;
+    Instruction *getLastInstruction(void) const;
+    /*! Get successors and predecessors */
+    const BlockSet &getSuccessorSet(void) const { return successors; }
+    const BlockSet &getPredecessorSet(void) const { return predecessors; }
+    /*! Get the label index of this block */
+    LabelIndex getLabelIndex(void) const;
+    /*! Apply the given functor on all instructions */
+    template <typename T>
+    INLINE void foreach(const T &functor) {
+      auto it = this->begin();
+      while (it != this->end()) {
+        auto curr = it++;
+        functor(*curr);
+      }
+    }
+    set <Register> undefPhiRegs;
+    set <Register> definedPhiRegs;
+  private:
+    friend class Function; //!< Owns the basic blocks
+    BlockSet predecessors; //!< Incoming blocks
+    BlockSet successors;   //!< Outgoing blocks
+    BasicBlock *nextBlock; //!< Block allocated just after this one
+    BasicBlock *prevBlock; //!< Block allocated just before this one
+    Function &fn;          //!< Function the block belongs to
+    GBE_CLASS(BasicBlock);
+  };
+
+  /*! Function input arguments can ultimately be pushed from the constant
+   *  buffer if they are structures. Other arguments can be images (textures)
+   *  and will also require special treatment.
+   */
+  struct FunctionArgument {
+    enum Type {
+      GLOBAL_POINTER    = 0, // __global
+      CONSTANT_POINTER  = 1, // __constant
+      LOCAL_POINTER     = 2, // __local
+      VALUE             = 3, // int, float
+      STRUCTURE         = 4, // struct foo
+      IMAGE             = 5,  // image*d_t
+      SAMPLER           = 6
+    };
+
+    struct InfoFromLLVM { // All the info passed by llvm when using -cl-kernel-arg-info
+      uint32_t addrSpace;
+      std::string typeName;
+      std::string accessQual;
+      std::string typeQual;
+      std::string argName; // May differ from arg->getName()
+    };
+
+    /*! Create a function input argument */
+    INLINE FunctionArgument(Type type, Register reg, uint32_t size, const std::string &name, uint32_t align, InfoFromLLVM& info, uint8_t bti) :
+      type(type), reg(reg), size(size), align(align), name(name), info(info), bti(bti) { }
+
+    Type type;     //!< Gives the type of argument we have
+    Register reg;  //!< Holds the argument
+    uint32_t size; //!< == sizeof(void*) for ptr, sizeof(elem) for the rest
+    uint32_t align; //!< address alignment for the argument
+    const std::string name; //!< Holds the argument name for IR output
+    InfoFromLLVM info;  //!< Holds the llvm passed info
+    uint8_t bti; //!< binding table index
+    GBE_STRUCT(FunctionArgument); // Use custom allocator
+  };
+
+  /*! Maps the pushed register to the function argument */
+  struct PushLocation {
+    INLINE PushLocation(const Function &fn, uint32_t argID, uint32_t offset) :
+      fn(fn), argID(argID), offset(offset) {}
+    /*! Get the pushed virtual register */
+    Register getRegister(void) const;
+    const Function &fn;       //!< Function it belongs to
+    uint32_t argID;           //!< Function argument
+    uint32_t offset;          //!< Offset in the function argument
+    GBE_STRUCT(PushLocation); // Use custom allocator
+  };
+
+  /*! For maps and sets */
+  INLINE bool operator< (const PushLocation &arg0, const PushLocation &arg1) {
+    if (arg0.argID != arg1.argID) return arg0.argID < arg1.argID;
+    return arg0.offset < arg1.offset;
+  }
+
+  /*! CFG loops */
+  struct Loop : public NonCopyable
+  {
+  public:
+    Loop(const vector<LabelIndex> &in, const vector<std::pair<LabelIndex, LabelIndex>> &exit) :
+    bbs(in), exits(exit) {}
+    vector<LabelIndex> bbs;
+    vector<std::pair<LabelIndex, LabelIndex>> exits;
+    GBE_STRUCT(Loop);
+  };
+
+  /*! A function is :
+   *  - a register file
+   *  - a set of basic blocks laid out into a CFG
+   *  - input arguments
+   */
+  class Function : public NonCopyable
+  {
+  public:
+    /*! Map of all pushed registers */
+    typedef map<Register, PushLocation> PushMap;
+    /*! Map of all pushed location (i.e. part of function argument) */
+    typedef map<PushLocation, Register> LocationMap;
+    /*! Create an empty function */
+    Function(const std::string &name, const Unit &unit, Profile profile = PROFILE_OCL);
+    /*! Release everything *including* the basic block pointers */
+    ~Function(void);
+    /*! Get the function profile */
+    INLINE Profile getProfile(void) const { return profile; }
+    /*! Get a new valid register */
+    INLINE Register newRegister(RegisterFamily family, bool uniform = false) {
+      return this->file.append(family, uniform);
+    }
+    /*! Get the function name */
+    const std::string &getName(void) const { return name; }
+    /*! When set, the back end no longer has any choice for it */
+    INLINE void setSimdWidth(uint32_t width) { simdWidth = width; }
+    /*! Get the SIMD width (0 if not forced) */
+    uint32_t getSimdWidth(void) const { return simdWidth; }
+    /*! Extract the register from the register file */
+    INLINE RegisterData getRegisterData(Register reg) const { return file.get(reg); }
+    /*! set a register to uniform or nonuniform type. */
+    INLINE void setRegisterUniform(Register reg, bool uniform) { file.setUniform(reg, uniform); }
+    /*! Return true if the specified register is of uniform type */
+    INLINE bool isUniformRegister(Register reg) { return file.isUniform(reg); }
+    /*! Get the register family from the register itself */
+    INLINE RegisterFamily getRegisterFamily(Register reg) const {
+      return this->getRegisterData(reg).family;
+    }
+    /*! Get the register from the tuple vector */
+    INLINE Register getRegister(Tuple ID, uint32_t which) const {
+      return file.get(ID, which);
+    }
+    /*! Set the register from the tuple vector */
+    INLINE void setRegister(Tuple ID, uint32_t which, Register reg) {
+      file.set(ID, which, reg);
+    }
+    /*! Get the register file */
+    INLINE const RegisterFile &getRegisterFile(void) const { return file; }
+    /*! Get the given value ie immediate from the function */
+    INLINE const Immediate &getImmediate(ImmediateIndex ID) const {
+      return immediates[ID];
+    }
+    /*! Create a new immediate and returns its index */
+    INLINE ImmediateIndex newImmediate(const Immediate &imm) {
+      const ImmediateIndex index(this->immediateNum());
+      this->immediates.push_back(imm);
+      return index;
+    }
+    /*! Fast allocation / deallocation of instructions */
+    DECL_POOL(Instruction, insnPool);
+    /*! Get input argument */
+    INLINE const FunctionArgument &getArg(uint32_t ID) const {
+      GBE_ASSERT(args[ID] != NULL);
+      return *args[ID];
+    }
+    INLINE FunctionArgument &getArg(uint32_t ID) {
+      GBE_ASSERT(args[ID] != NULL);
+      return *args[ID];
+    }
+
+    /*! Get arg ID. */
+    INLINE int32_t getArgID(FunctionArgument *requestArg) {
+      for (uint32_t ID = 0; ID < args.size(); ID++)
+      {
+        if ( args[ID] == requestArg )
+          return ID;
+      }
+      GBE_ASSERTM(0, "Failed to get a valid argument ID.");
+      return -1;
+    }
+
+    /*! Get the number of pushed registers */
+    INLINE uint32_t pushedNum(void) const { return pushMap.size(); }
+    /*! Get the pushed data location for the given register */
+    INLINE const PushLocation *getPushLocation(Register reg) const {
+      auto it = pushMap.find(reg);
+      if (it == pushMap.end())
+        return NULL;
+      else
+        return &it->second;
+    }
+    /*! Get the map of pushed registers */
+    const PushMap &getPushMap(void) const { return this->pushMap; }
+    /*! Get the map of pushed registers */
+    const LocationMap &getLocationMap(void) const { return this->locationMap; }
+    /*! Get input argument from the register (linear search). Return NULL if
+     *  this is not an input argument
+     */
+    INLINE const FunctionArgument *getArg(const Register &reg) const {
+      for (auto arg : args) if (arg->reg == reg) return arg;
+      return NULL;
+    }
+
+    INLINE FunctionArgument *getArg(const Register &reg) {
+      for (auto arg : args) if (arg->reg == reg) return arg;
+      return NULL;
+    }
+
+    /*! Get output register */
+    INLINE Register getOutput(uint32_t ID) const { return outputs[ID]; }
+    /*! Get the argument location for the pushed register */
+    INLINE const PushLocation &getPushLocation(Register reg) {
+      GBE_ASSERT(pushMap.contains(reg) == true);
+      return pushMap.find(reg)->second;
+    }
+    /*! Says if this is the top basic block (entry point) */
+    bool isEntryBlock(const BasicBlock &bb) const;
+    /*! Get the function entry point block */
+    const BasicBlock &getTopBlock(void) const;
+    /*! Get the last block */
+    const BasicBlock &getBottomBlock(void) const;
+    /*! Get the last block */
+    BasicBlock &getBottomBlock(void);
+    /*! Get block from its label */
+    const BasicBlock &getBlock(LabelIndex label) const;
+    /*! Get the label instruction from its label index */
+    const LabelInstruction *getLabelInstruction(LabelIndex index) const;
+    /*! Return the number of instructions of the largest basic block */
+    uint32_t getLargestBlockSize(void) const;
+    /*! Get the first index of the special registers and number of them */
+    uint32_t getFirstSpecialReg(void) const;
+    uint32_t getSpecialRegNum(void) const;
+    /*! Indicate if the given register is a special one (like localID in OCL) */
+    bool isSpecialReg(const Register &reg) const;
+    /*! Create a new label (still not bound to a basic block) */
+    LabelIndex newLabel(void);
+    /*! Create the control flow graph */
+    void computeCFG(void);
+    /*! Sort labels in increasing order (top block has the smallest label) */
+    void sortLabels(void);
+    /*! Check for empty labels. */
+    void checkEmptyLabels(void);
+    /*! Get the pointer family */
+    RegisterFamily getPointerFamily(void) const;
+    /*! Number of registers in the register file */
+    INLINE uint32_t regNum(void) const { return file.regNum(); }
+    /*! Number of register tuples in the register file */
+    INLINE uint32_t tupleNum(void) const { return file.tupleNum(); }
+    /*! Number of labels in the function */
+    INLINE uint32_t labelNum(void) const { return labels.size(); }
+    /*! Number of immediate values in the function */
+    INLINE uint32_t immediateNum(void) const { return immediates.size(); }
+    /*! Get the number of argument register */
+    INLINE uint32_t argNum(void) const { return args.size(); }
+    /*! Get the number of output register */
+    INLINE uint32_t outputNum(void) const { return outputs.size(); }
+    /*! Number of blocks in the function */
+    INLINE uint32_t blockNum(void) const { return blocks.size(); }
+    /*! Output an immediate value in a stream */
+    void outImmediate(std::ostream &out, ImmediateIndex index) const;
+    /*! Apply the given functor on all basic blocks */
+    template <typename T>
+    INLINE void foreachBlock(const T &functor) const {
+      for (auto block : blocks) functor(*block);
+    }
+    /*! Apply the given functor on all instructions */
+    template <typename T>
+    INLINE void foreachInstruction(const T &functor) const {
+      for (auto block : blocks) block->foreach(functor);
+    }
+    /*! Does it use SLM */
+    INLINE bool getUseSLM(void) const { return this->useSLM; }
+    /*! Change the SLM config for the function */
+    INLINE bool setUseSLM(bool useSLM) { return this->useSLM = useSLM; }
+    /*! get SLM size needed for local variable inside kernel function */
+    INLINE uint32_t getSLMSize(void) const { return this->slmSize; }
+    /*! set slm size needed for local variable inside kernel function */
+    INLINE void setSLMSize(uint32_t size) { this->slmSize = size; }
+    /*! Get sampler set in this function */
+    SamplerSet* getSamplerSet(void) const {return samplerSet; }
+    /*! Get image set in this function */
+    ImageSet* getImageSet(void) const {return imageSet; }
+    /*! Get printf set in this function */
+    PrintfSet* getPrintfSet(void) const {return printfSet; }
+    /*! Set required work group size. */
+    void setCompileWorkGroupSize(size_t x, size_t y, size_t z) { compileWgSize[0] = x; compileWgSize[1] = y; compileWgSize[2] = z; }
+    /*! Get required work group size. */
+    const size_t *getCompileWorkGroupSize(void) const {return compileWgSize;}
+    /*! Set function attributes string. */
+    void setFunctionAttributes(const std::string& functionAttributes) {  this->functionAttributes= functionAttributes; }
+    /*! Get function attributes string. */
+    const std::string& getFunctionAttributes(void) const {return this->functionAttributes;}
+    /*! Get stack size. */
+    INLINE const uint32_t getStackSize(void) const { return this->stackSize; }
+    /*! Push stack size. */
+    INLINE void pushStackSize(uint32_t step) { this->stackSize += step; }
+    /*! add the loop info for later liveness analysis */
+    void addLoop(const vector<LabelIndex> &bbs, const vector<std::pair<LabelIndex, LabelIndex>> &exits);
+    INLINE const vector<Loop * > &getLoops() { return loops; }
+    /*! Get surface starting address register from bti */
+    Register getSurfaceBaseReg(uint8_t bti) const;
+    void appendSurface(uint8_t bti, Register reg);
+  private:
+    friend class Context;           //!< Can freely modify a function
+    std::string name;               //!< Function name
+    const Unit &unit;               //!< Function belongs to this unit
+    vector<FunctionArgument*> args; //!< Input registers of the function
+    vector<Register> outputs;       //!< Output registers of the function
+    vector<BasicBlock*> labels;     //!< Each label points to a basic block
+    vector<Immediate> immediates;   //!< All immediate values in the function
+    vector<BasicBlock*> blocks;     //!< All chained basic blocks
+    vector<Loop *> loops;           //!< Loops info of the function
+    map<uint8_t, Register> btiRegMap;//!< map bti to surface base address
+    RegisterFile file;              //!< RegisterDatas used by the instructions
+    Profile profile;                //!< Current function profile
+    PushMap pushMap;                //!< Pushed function arguments (reg->loc)
+    LocationMap locationMap;        //!< Pushed function arguments (loc->reg)
+    uint32_t simdWidth;             //!< 8 or 16 if forced, 0 otherwise
+    bool useSLM;                    //!< Is SLM required?
+    uint32_t slmSize;               //!< local variable size inside kernel function
+    uint32_t stackSize;             //!< stack size for private memory.
+    SamplerSet *samplerSet;         //!< samplers used in this function.
+    ImageSet* imageSet;             //!< Image set in this function's arguments.
+    PrintfSet *printfSet;           //!< printfSet stores the printf info.
+    size_t compileWgSize[3];        //!< required work group size specified by
+                                    //   __attribute__((reqd_work_group_size(X, Y, Z))).
+    std::string functionAttributes; //!< function attribute qualifiers combined.
+    GBE_CLASS(Function);            //!< Use custom allocator
+  };
+
+  /*! Output the function string in the given stream */
+  std::ostream &operator<< (std::ostream &out, const Function &fn);
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_FUNCTION_HPP__ */
+
diff --git a/backend/src/ir/image.cpp b/backend/src/ir/image.cpp
new file mode 100644
index 0000000..a9b1563
--- /dev/null
+++ b/backend/src/ir/image.cpp
@@ -0,0 +1,278 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file image.cpp
+ *
+ */
+#include "image.hpp"
+#include "context.hpp"
+#include "ocl_common_defines.h"
+#include "backend/program.h"
+
+namespace gbe {
+namespace ir {
+
+  static uint32_t getInfoOffset4Type(struct ImageInfo *imageInfo, int type)
+  {
+    switch (type) {
+      case GetImageInfoInstruction::WIDTH:              return imageInfo->wSlot;
+      case GetImageInfoInstruction::HEIGHT:             return imageInfo->hSlot;
+      case GetImageInfoInstruction::DEPTH:              return imageInfo->depthSlot;
+      case GetImageInfoInstruction::CHANNEL_DATA_TYPE:  return imageInfo->dataTypeSlot;
+      case GetImageInfoInstruction::CHANNEL_ORDER:      return imageInfo->channelOrderSlot;
+      default:
+        NOT_IMPLEMENTED;
+    }
+    return 0;
+  }
+
+  static uint32_t setInfoOffset4Type(struct ImageInfo *imageInfo, int type, uint32_t offset)
+  {
+    switch (type) {
+      case GetImageInfoInstruction::WIDTH:              imageInfo->wSlot = offset; break;
+      case GetImageInfoInstruction::HEIGHT:             imageInfo->hSlot = offset; break;
+      case GetImageInfoInstruction::DEPTH:              imageInfo->depthSlot = offset; break;
+      case GetImageInfoInstruction::CHANNEL_DATA_TYPE:  imageInfo->dataTypeSlot = offset; break;
+      case GetImageInfoInstruction::CHANNEL_ORDER:      imageInfo->channelOrderSlot = offset; break;
+      default:
+        NOT_IMPLEMENTED;
+    }
+    return 0;
+  }
+
+  void ImageSet::appendInfo(ImageInfoKey key, uint32_t offset)
+  {
+    auto it = indexMap.find(key.index);
+    assert(it != indexMap.end());
+    struct ImageInfo *imageInfo = it->second;
+    setInfoOffset4Type(imageInfo, key.type, offset);
+  }
+
+  void ImageSet::clearInfo()
+  {
+    struct ImageInfo *imageInfo;
+    for(auto &it : indexMap) {
+      imageInfo = it.second;
+      imageInfo->wSlot = -1;
+      imageInfo->hSlot = -1;
+      imageInfo->depthSlot = -1;
+      imageInfo->dataTypeSlot = -1;
+      imageInfo->channelOrderSlot = -1;
+    }
+  }
+
+  const int32_t ImageSet::getInfoOffset(ImageInfoKey key) const
+  {
+    auto it = indexMap.find(key.index);
+    if (it == indexMap.end())
+      return -1;
+    struct ImageInfo *imageInfo = it->second;
+    return getInfoOffset4Type(imageInfo, key.type);
+  }
+
+  const uint32_t ImageSet::getIdx(const Register imageReg) const
+  {
+    auto it = regMap.find(imageReg);
+    GBE_ASSERT(it != regMap.end());
+    return it->second->idx;
+  }
+
+  void ImageSet::getData(struct ImageInfo *imageInfos) const {
+      int id = 0;
+      for(auto &it : regMap)
+        imageInfos[id++] = *it.second;
+  }
+
+  ImageSet::~ImageSet() {
+    for(auto &it : regMap)
+      GBE_DELETE(it.second);
+  }
+
+#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
+#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
+
+  /*! Implements the serialization. */
+  size_t ImageSet::serializeToBin(std::ostream& outs) {
+    size_t ret_size = 0;
+
+    OUT_UPDATE_SZ(magic_begin);
+
+    OUT_UPDATE_SZ(regMap.size());
+    for (auto iter : regMap) {
+      OUT_UPDATE_SZ(iter.first);
+      OUT_UPDATE_SZ(iter.second->arg_idx);
+      OUT_UPDATE_SZ(iter.second->idx);
+      OUT_UPDATE_SZ(iter.second->wSlot);
+      OUT_UPDATE_SZ(iter.second->hSlot);
+      OUT_UPDATE_SZ(iter.second->depthSlot);
+      OUT_UPDATE_SZ(iter.second->dataTypeSlot);
+      OUT_UPDATE_SZ(iter.second->channelOrderSlot);
+      OUT_UPDATE_SZ(iter.second->dimOrderSlot);
+    }
+
+    OUT_UPDATE_SZ(indexMap.size());
+    for (auto iter : indexMap) {
+      OUT_UPDATE_SZ(iter.first);
+      OUT_UPDATE_SZ(iter.second->arg_idx);
+      OUT_UPDATE_SZ(iter.second->idx);
+      OUT_UPDATE_SZ(iter.second->wSlot);
+      OUT_UPDATE_SZ(iter.second->hSlot);
+      OUT_UPDATE_SZ(iter.second->depthSlot);
+      OUT_UPDATE_SZ(iter.second->dataTypeSlot);
+      OUT_UPDATE_SZ(iter.second->channelOrderSlot);
+      OUT_UPDATE_SZ(iter.second->dimOrderSlot);
+    }
+
+    OUT_UPDATE_SZ(magic_end);
+    OUT_UPDATE_SZ(ret_size);
+
+    return ret_size;
+  }
+
+  size_t ImageSet::deserializeFromBin(std::istream& ins) {
+    size_t total_size = 0;
+    uint32_t magic;
+    size_t image_map_sz = 0;
+
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_begin)
+      return 0;
+
+    IN_UPDATE_SZ(image_map_sz); //regMap
+    for (size_t i = 0; i < image_map_sz; i++) {
+      ir::Register reg;
+      ImageInfo *img_info = GBE_NEW(struct ImageInfo);
+
+      IN_UPDATE_SZ(reg);
+      IN_UPDATE_SZ(img_info->arg_idx);
+      IN_UPDATE_SZ(img_info->idx);
+      IN_UPDATE_SZ(img_info->wSlot);
+      IN_UPDATE_SZ(img_info->hSlot);
+      IN_UPDATE_SZ(img_info->depthSlot);
+      IN_UPDATE_SZ(img_info->dataTypeSlot);
+      IN_UPDATE_SZ(img_info->channelOrderSlot);
+      IN_UPDATE_SZ(img_info->dimOrderSlot);
+
+      regMap.insert(std::make_pair(reg, img_info));
+    }
+
+    IN_UPDATE_SZ(image_map_sz); //indexMap
+    for (uint32_t i = 0; i < image_map_sz; i++) {
+      uint32_t index;
+      ImageInfo *img_info = GBE_NEW(struct ImageInfo);
+
+      IN_UPDATE_SZ(index);
+      IN_UPDATE_SZ(img_info->arg_idx);
+      IN_UPDATE_SZ(img_info->idx);
+      IN_UPDATE_SZ(img_info->wSlot);
+      IN_UPDATE_SZ(img_info->hSlot);
+      IN_UPDATE_SZ(img_info->depthSlot);
+      IN_UPDATE_SZ(img_info->dataTypeSlot);
+      IN_UPDATE_SZ(img_info->channelOrderSlot);
+      IN_UPDATE_SZ(img_info->dimOrderSlot);
+
+      indexMap.insert(std::make_pair(img_info->idx, img_info));
+    }
+
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_end)
+      return 0;
+
+    size_t total_bytes;
+    IN_UPDATE_SZ(total_bytes);
+    if (total_bytes + sizeof(total_size) != total_size)
+      return 0;
+
+    return total_size;
+  }
+
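+  /*! Dump both maps in a human-readable form for debugging. */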
+  void ImageSet::printStatus(int indent, std::ostream& outs) {
+    using namespace std;
+    string spaces = indent_to_str(indent);
+    string spaces_nl = indent_to_str(indent + 4);
+
+    outs << spaces << "------------ Begin ImageSet ------------" << "\n";
+
+    outs << spaces_nl  << "  ImageSet Map: [reg, arg_idx, idx, wSlot, hSlot, depthSlot, "
+                "dataTypeSlot, channelOrderSlot, dimOrderSlot]\n";
+    outs << spaces_nl << "     regMap size: " << regMap.size() << "\n";
+    for (auto iter : regMap) {
+      outs << spaces_nl << "         [" << iter.first << ", "
+           << iter.second->arg_idx << ", "
+           << iter.second->idx << ", "
+           << iter.second->wSlot << ", "
+           << iter.second->hSlot << ", "
+           << iter.second->depthSlot << ", "
+           << iter.second->dataTypeSlot << ", "
+           << iter.second->channelOrderSlot << ", "
+           << iter.second->dimOrderSlot << "]" << "\n";
+   }
+
+   outs << spaces_nl << "  ImageSet Map: [index, arg_idx, idx, wSlot, hSlot, depthSlot, "
+           "dataTypeSlot, channelOrderSlot, dimOrderSlot]\n";
+   outs << spaces_nl << "     regMap size: " << indexMap.size() << "\n";
+   for (auto iter : indexMap) {
+     outs << spaces_nl << "         [" << iter.first << ", "
+          << iter.second->arg_idx << ", "
+          << iter.second->idx << ", "
+          << iter.second->wSlot << ", "
+          << iter.second->hSlot << ", "
+          << iter.second->depthSlot << ", "
+          << iter.second->dataTypeSlot << ", "
+          << iter.second->channelOrderSlot << ", "
+          << iter.second->dimOrderSlot << "]" << "\n";
+   }
+
+   outs << spaces << "------------- End ImageSet -------------" << "\n";
+  }
+
+#ifdef GBE_COMPILER_AVAILABLE
+  Register ImageSet::appendInfo(ImageInfoKey key, Context *ctx)
+  {
+    auto it = infoRegMap.find(key.data);
+    if (it != infoRegMap.end())
+      return it->second;
+    Register reg = ctx->reg(FAMILY_DWORD);
+    infoRegMap.insert(std::make_pair(key.data, reg));
+    return reg;
+  }
+
+  void ImageSet::append(Register imageReg, Context *ctx, uint8_t bti)
+  {
+    ir::FunctionArgument *arg =  ctx->getFunction().getArg(imageReg);
+    GBE_ASSERTM(arg && arg->type == ir::FunctionArgument::IMAGE, "Append an invalid reg to image set.");
+    GBE_ASSERTM(regMap.find(imageReg) == regMap.end(), "Append the same image reg twice.");
+
+    int32_t id = ctx->getFunction().getArgID(arg);
+    struct ImageInfo *imageInfo = GBE_NEW(struct ImageInfo);
+    imageInfo->arg_idx = id;
+    imageInfo->idx = bti;
+    imageInfo->wSlot = -1;
+    imageInfo->hSlot = -1;
+    imageInfo->depthSlot = -1;
+    imageInfo->dataTypeSlot = -1;
+    imageInfo->channelOrderSlot = -1;
+    imageInfo->dimOrderSlot = -1;
+    regMap.insert(std::make_pair(imageReg, imageInfo));
+    indexMap.insert(std::make_pair(imageInfo->idx, imageInfo));
+  }
+#endif
+
+} /* namespace ir */
+} /* namespace gbe */
diff --git a/backend/src/ir/image.hpp b/backend/src/ir/image.hpp
new file mode 100644
index 0000000..b31c7da
--- /dev/null
+++ b/backend/src/ir/image.hpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file image.hpp
+ *
+ */
+#ifndef __GBE_IR_IMAGE_HPP__
+#define __GBE_IR_IMAGE_HPP__
+
+#include "ir/register.hpp"
+#include "ir/instruction.hpp" // for ImageInfoKey
+#include "sys/map.hpp"
+
+extern "C" {
+  struct ImageInfo;
+}
+
+namespace gbe {
+namespace ir {
+
+  class Context;
+  /*! An image set is the set of images defined as kernel arguments.
+   *  We gather those images here and allocate a unique index for each of
+   *  them; the backend later uses that index to identify the image's
+   *  location.
+   */
+  class ImageSet : public Serializable
+  {
+  public:
+    /*! Append an image argument. */
+    void append(Register imageReg, Context *ctx, uint8_t bti);
+    /*! Append an image info slot. */
+    void appendInfo(ImageInfoKey key, uint32_t offset);
+    /*! Append an image info register. */
+    Register appendInfo(ImageInfoKey, Context *ctx);
+    /*! clear image info. */
+    void clearInfo();
+    /*! Get the image's index (actual location). */
+    const uint32_t getIdx(const Register imageReg) const;
+    size_t getDataSize(void) { return regMap.size(); }
+    size_t getDataSize(void) const { return regMap.size(); }
+
+    const int32_t getInfoOffset(ImageInfoKey key) const;
+    void getData(struct ImageInfo *imageInfos) const;
+    void operator = (const ImageSet& other) {
+      regMap.insert(other.regMap.begin(), other.regMap.end());
+    }
+
+    bool empty() const { return regMap.empty(); }
+
+    ImageSet(const ImageSet& other) : regMap(other.regMap.begin(), other.regMap.end()) { }
+    ImageSet() {}
+    ~ImageSet();
+
+    static const uint32_t magic_begin = TO_MAGIC('I', 'M', 'A', 'G');
+    static const uint32_t magic_end = TO_MAGIC('G', 'A', 'M', 'I');
+
+    /* format:
+       magic_begin     |
+       regMap_size     |
+       element_1       |
+       ........        |
+       element_n       |
+       indexMap_size   |
+       element_1       |
+       ........        |
+       element_n       |
+       magic_end       |
+       total_size
+    */
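+    /* Each element above is the map key followed by the ImageInfo fields in
+       declaration order (arg_idx, idx, wSlot, hSlot, depthSlot, dataTypeSlot,
+       channelOrderSlot, dimOrderSlot), exactly as written by serializeToBin(). */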
+
+    /*! Implements the serialization. */
+    virtual size_t serializeToBin(std::ostream& outs);
+    virtual size_t deserializeFromBin(std::istream& ins);
+    virtual void printStatus(int indent, std::ostream& outs);
+
+  private:
+    map<Register, struct ImageInfo *> regMap;
+    map<uint32_t, struct ImageInfo *> indexMap;
+    map<uint16_t, Register> infoRegMap;
+    GBE_CLASS(ImageSet);
+  };
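+  /* A sketch of the intended lifecycle (inferred from the implementation
+   * above): append() registers every image kernel argument together with its
+   * binding table index, appendInfo() records the offset at which a requested
+   * info field (width, height, ...) is stored, clearInfo() resets all slots
+   * to -1, and getIdx()/getInfoOffset() are queried later to locate the data. */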
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_IMAGE_HPP__ */
diff --git a/backend/src/ir/immediate.cpp b/backend/src/ir/immediate.cpp
new file mode 100644
index 0000000..3a6b9a2
--- /dev/null
+++ b/backend/src/ir/immediate.cpp
@@ -0,0 +1,263 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "immediate.hpp"
+
+using namespace gbe;
+using namespace ir;
+
+#define SCALAR_SAME_TYPE_ASSERT()                           \
+      GBE_ASSERT(this->getType() == right.getType()       && \
+                 this->getElemNum() == right.getElemNum() && \
+                 this->getElemNum() == 1                  && \
+                 this->getType() != TYPE_BOOL);
+
+#define DECLAR_BINARY_ALL_TYPE_OP(OP) \
+    Immediate Immediate::operator OP (const Immediate &right) const { \
+      SCALAR_SAME_TYPE_ASSERT(); \
+      switch (this->getType()) { \
+        default: \
+          GBE_ASSERT(0); \
+        case TYPE_S8:     return Immediate(*this->data.s8 OP *right.data.s8);   \
+        case TYPE_U8:     return Immediate(*this->data.u8 OP *right.data.u8);   \
+        case TYPE_S16:    return Immediate(*this->data.s16 OP *right.data.s16); \
+        case TYPE_U16:    return Immediate(*this->data.u16 OP *right.data.u16); \
+        case TYPE_S32:    return Immediate(*this->data.s32 OP *right.data.s32); \
+        case TYPE_U32:    return Immediate(*this->data.u32 OP *right.data.u32); \
+        case TYPE_S64:    return Immediate(*this->data.s64 OP *right.data.s64); \
+        case TYPE_U64:    return Immediate(*this->data.u64 OP *right.data.u64); \
+        case TYPE_FLOAT:  return Immediate(*this->data.f32 OP *right.data.f32); \
+        case TYPE_DOUBLE: return Immediate(*this->data.f64 OP *right.data.f64); \
+      }\
+      return *this;\
+    }
+
+    DECLAR_BINARY_ALL_TYPE_OP(+)
+    DECLAR_BINARY_ALL_TYPE_OP(-)
+    DECLAR_BINARY_ALL_TYPE_OP(*)
+    DECLAR_BINARY_ALL_TYPE_OP(/)
+
+#undef DECLAR_BINARY_ALL_TYPE_OP
+
+#define DECLAR_BINARY_INT_TYPE_OP(OP) \
+    Immediate Immediate::operator OP (const Immediate &right) const { \
+      SCALAR_SAME_TYPE_ASSERT(); \
+      switch (this->getType()) { \
+        default: \
+          GBE_ASSERT(0); \
+        case TYPE_S8:     return Immediate(*this->data.s8 OP *right.data.s8);   \
+        case TYPE_U8:     return Immediate(*this->data.u8 OP *right.data.u8);   \
+        case TYPE_S16:    return Immediate(*this->data.s16 OP *right.data.s16); \
+        case TYPE_U16:    return Immediate(*this->data.u16 OP *right.data.u16); \
+        case TYPE_S32:    return Immediate(*this->data.s32 OP *right.data.s32); \
+        case TYPE_U32:    return Immediate(*this->data.u32 OP *right.data.u32); \
+        case TYPE_S64:    return Immediate(*this->data.s64 OP *right.data.s64); \
+        case TYPE_U64:    return Immediate(*this->data.u64 OP *right.data.u64); \
+      }\
+      return *this;\
+    }
+    DECLAR_BINARY_INT_TYPE_OP(%)
+    DECLAR_BINARY_INT_TYPE_OP(&)
+    DECLAR_BINARY_INT_TYPE_OP(|)
+    DECLAR_BINARY_INT_TYPE_OP(^)
+#undef DECLAR_BINARY_INT_TYPE_OP
+
+
+#define DECLAR_BINARY_ASHIFT_OP(OP) \
+    Immediate Immediate::operator OP (const Immediate &right) const { \
+      GBE_ASSERT(this->getType() > TYPE_BOOL && this->getType() <= TYPE_U64); \
+      int32_t shift = right.getIntegerValue(); \
+      if (shift == 0) \
+        return *this; \
+      else \
+        switch (this->getType()) { \
+          default: \
+            GBE_ASSERT(0); \
+          case TYPE_S8:  return Immediate((*this->data.s8 OP shift));  \
+          case TYPE_U8:  return Immediate((*this->data.u8 OP shift));  \
+          case TYPE_S16: return Immediate((*this->data.s16 OP shift)); \
+          case TYPE_U16: return Immediate((*this->data.u16 OP shift)); \
+          case TYPE_S32: return Immediate((*this->data.s32 OP shift)); \
+          case TYPE_U32: return Immediate((*this->data.u32 OP shift)); \
+          case TYPE_S64: return Immediate((*this->data.s64 OP shift)); \
+          case TYPE_U64: return Immediate((*this->data.u64 OP shift)); \
+        } \
+    }
+
+    DECLAR_BINARY_ASHIFT_OP(>>)
+    DECLAR_BINARY_ASHIFT_OP(<<)
+
+#undef DECLAR_BINARY_ASHIFT_OP
+    Immediate Immediate::lshr (const Immediate &left, const Immediate &right) {
+      GBE_ASSERT(left.getType() > TYPE_BOOL && left.getType() <= TYPE_U64);
+      int32_t shift = right.getIntegerValue();
+      if (shift == 0)
+        return left;
+      else
+        switch (left.getType()) {
+          default:
+            GBE_ASSERT(0);
+          case TYPE_S8:  
+          case TYPE_U8:  return Immediate((*left.data.u8 >> shift));
+          case TYPE_S16: 
+          case TYPE_U16: return Immediate((*left.data.u16 >> shift));
+          case TYPE_S32: 
+          case TYPE_U32: return Immediate((*left.data.u32 >> shift));
+          case TYPE_S64: 
+          case TYPE_U64: return Immediate((*left.data.u64 >> shift));
+        }
+    }
+
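+    /* Combine two scalar immediates according to op; the folded value becomes *this. */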
+    Immediate::Immediate(ImmOpCode op, const Immediate &left, const Immediate &right, Type dstType) {
+      switch (op) {
+        default:
+          GBE_ASSERT(0 && "unsupported imm op\n");
+        case IMM_ADD: *this = left + right; break;
+        case IMM_SUB: *this = left - right; break;
+        case IMM_MUL: *this = left * right; break;
+        case IMM_DIV: *this = left / right; break;
+        case IMM_AND: *this = left & right; break;
+        case IMM_OR:  *this = left | right; break;
+        case IMM_XOR: *this = left ^ right; break;
+        case IMM_REM:
+        {
+          if (left.getType() > TYPE_BOOL && left.getType() <= TYPE_U64)
+            *this = left % right;
+          else if (left.getType() == TYPE_FLOAT && right.getType() == TYPE_FLOAT) {
+            *this = Immediate(left);
+            *this->data.f32 = fmodf(left.getFloatValue(), right.getFloatValue());
+          }
+          else if (left.getType() == TYPE_DOUBLE && right.getType() == TYPE_DOUBLE) {
+            *this = Immediate(left);
+            *this->data.f64 = fmod(left.getDoubleValue(), right.getDoubleValue());
+          }
+          else
+            GBE_ASSERT(0);
+          break;
+        }
+        case IMM_LSHR:
+        {
+          if (left.getElemNum() == 1)
+            *this = lshr(left, right);
+          else {
+            GBE_ASSERT(right.getIntegerValue() <= (left.getElemNum() * left.getTypeSize() * 8));
+            GBE_ASSERT(right.getIntegerValue() % (left.getTypeSize() * 8) == 0);
+            copy(left, right.getIntegerValue() / (left.getTypeSize() * 8), left.getElemNum());
+          }
+          break;
+        }
+        case IMM_ASHR:
+        {
+          if (left.getElemNum() == 1)
+            *this = left >> right;
+          else {
+            GBE_ASSERT(0 && "Doesn't support ashr on array constant.");
+            copy(left, right.getIntegerValue() / (left.getTypeSize() * 8), left.getElemNum());
+          }
+          break;
+        }
+        case IMM_SHL:
+        {
+          if (left.getElemNum() == 1)
+            *this = left << right;
+          else {
+            GBE_ASSERT(right.getIntegerValue() <= (left.getElemNum() * left.getTypeSize() * 8));
+            GBE_ASSERT(right.getIntegerValue() % (left.getTypeSize() * 8) == 0);
+            copy(left, -right.getIntegerValue() / (left.getTypeSize() * 8), left.getElemNum());
+          }
+          break;
+        }
+      }
+      // If the dst type is large int, we will not change the imm type to large int.
+      GBE_ASSERT(type == (ImmType)dstType || dstType == TYPE_LARGE_INT);
+    }
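+    /* Hypothetical usage sketch (not part of the upstream sources):
+     *   Immediate a(2), b(3);                     // two scalar s32 immediates
+     *   Immediate sum(IMM_ADD, a, b, TYPE_S32);   // folded constant holding 5
+     */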
+
+    Immediate::Immediate(const vector<const Immediate*> immVec) {
+      if (immVec.size() == 1) {
+        *this = *immVec[0];
+      } else if (!(immVec[0]->isCompType()) && immVec[0]->elemNum == 1) {
+        this->type = immVec[0]->type;
+        this->elemNum = immVec.size();
+        if (immVec[0]->getTypeSize() * immVec.size() < 8)
+          this->data.p = &this->defaultData;
+        else
+          this->data.p = malloc(immVec[0]->getTypeSize() * immVec.size());
+        uint8_t *p = (uint8_t*)this->data.p;
+        for(uint32_t i = 0; i < immVec.size(); i++) {
+          GBE_ASSERT(immVec[i]->type == immVec[0]->type && immVec[i]->elemNum == 1);
+          memcpy(p, immVec[i]->data.p, immVec[i]->getTypeSize());
+          p += immVec[i]->getTypeSize();
+        }
+      } else {
+        this->type = IMM_TYPE_COMP;
+        if (immVec.size() * sizeof(Immediate*) < 8)
+          this->data.p = &this->defaultData;
+        else
+          this->data.p = malloc(immVec.size() * sizeof(Immediate*));
+        this->elemNum = immVec.size();
+        for(uint32_t i = 0; i < immVec.size(); i++)
+          this->data.immVec[i] = immVec[i];
+      }
+    }
+
+
+    // operator= and copy() are only called from constructors, where *this
+    // does not yet own any heap storage, so we do not need to check (or free)
+    // data.p before assigning to it.
+    Immediate & Immediate::operator= (const Immediate & other) {
+      if (this != &other) {
+        type = other.type;
+        elemNum = other.elemNum;
+        if (other.data.p != &other.defaultData) {
+          data.p = malloc(other.elemNum * other.getTypeSize());
+          memcpy(data.p, other.data.p, other.elemNum * other.getTypeSize());
+        }
+        else {
+          defaultData = other.defaultData;
+          data.p = &defaultData;
+        }
+      }
+      return *this;
+    }
+
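+    // copy() takes `num` elements of `other` starting at element `offset`; a
+    // negative offset shifts the copied data towards higher element indices in
+    // the new storage (vector SHL above), a positive one skips leading source
+    // elements (vector LSHR above).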
+    void Immediate::copy(const Immediate &other, int32_t offset, uint32_t num) {
+      if (this != &other) {
+        if (other.type == IMM_TYPE_COMP && num == 1) {
+          GBE_ASSERT(offset >= 0 && offset <= (int32_t)other.elemNum);
+          *this = *other.data.immVec[offset];
+          return;
+        }
+        type = other.type;
+        elemNum = num;
+        if (num * other.getTypeSize() < 8)
+          data.p = &defaultData;
+        else
+          data.p = malloc(num * other.getTypeSize());
+        uint8_t* datap = (uint8_t*)data.p;
+        memset(datap, 0, num * other.getTypeSize());
+        if (offset < 0) {
+          datap += (-offset) * other.getTypeSize();
+          num -= num < (uint32_t)(-offset) ? num : (-offset);
+          offset = 0;
+        } else if (offset > 0 && num > 1) {
+          GBE_ASSERT((int32_t)num > offset);
+          num -= offset;
+        }
+        memcpy(datap, (uint8_t*)other.data.p + offset * other.getTypeSize(),
+               num * other.getTypeSize());
+      }
+    }
diff --git a/backend/src/ir/immediate.hpp b/backend/src/ir/immediate.hpp
new file mode 100644
index 0000000..6a5c819
--- /dev/null
+++ b/backend/src/ir/immediate.hpp
@@ -0,0 +1,264 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file immediate.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_IMMEDIATE_HPP__
+#define __GBE_IR_IMMEDIATE_HPP__
+
+#include <string.h>
+#include "ir/type.hpp"
+#include "sys/platform.hpp"
+
+namespace gbe {
+namespace ir {
+
+  typedef enum {
+    IMM_TRUNC = 0,
+    IMM_BITCAST,
+    IMM_ADD,
+    IMM_SUB,
+    IMM_MUL,
+    IMM_DIV,
+    IMM_REM,
+    IMM_SHL,
+    IMM_ASHR,
+    IMM_LSHR,
+    IMM_AND,
+    IMM_OR,
+    IMM_XOR
+  } ImmOpCode;
+
+  typedef enum {
+    IMM_TYPE_BOOL = TYPE_BOOL,
+    IMM_TYPE_S8 = TYPE_S8,
+    IMM_TYPE_U8 = TYPE_U8,
+    IMM_TYPE_S16 = TYPE_S16,
+    IMM_TYPE_U16 = TYPE_U16,
+    IMM_TYPE_S32 = TYPE_S32,
+    IMM_TYPE_U32 = TYPE_U32,
+    IMM_TYPE_S64 = TYPE_S64,
+    IMM_TYPE_U64 = TYPE_U64,
+    IMM_TYPE_FLOAT = TYPE_FLOAT,
+    IMM_TYPE_DOUBLE = TYPE_DOUBLE,
+    IMM_TYPE_COMP             // compound immediate made up of several immediates.
+  } ImmType;
+
+  /*! The value as stored in the instruction */
+  class Immediate
+  {
+  public:
+    INLINE Immediate(void) { }
+
+    INLINE Type getType(void) const {
+      return (Type)type;
+    }
+
+    INLINE bool isCompType(void) const {
+      return type == IMM_TYPE_COMP;
+    }
+
+    INLINE uint32_t getElemNum(void) const {
+      return elemNum;
+    }
+
+    uint32_t getTypeSize(void) const {
+      switch(type) {
+        default:
+          GBE_ASSERT(0 && "Invalid immeidate type.\n");
+        case TYPE_BOOL:
+        case TYPE_S8:
+        case TYPE_U8:   return 1;
+        case TYPE_S16:
+        case TYPE_U16:  return 2;
+        case TYPE_FLOAT:
+        case TYPE_S32:
+        case TYPE_U32:  return 4;
+        case TYPE_DOUBLE:
+        case TYPE_S64:
+        case TYPE_U64:  return 8;
+        case IMM_TYPE_COMP: return sizeof(Immediate*);
+      }
+    }
+
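+    /* Scalar constructors: the value lives in the 8-byte defaultData buffer
+       and data.FIELD simply points at it, so no heap allocation is needed. */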
+#define DECL_CONSTRUCTOR(TYPE, FIELD, IR_TYPE)                  \
+    Immediate(TYPE FIELD) {                                     \
+      this->type = (ImmType)IR_TYPE;                            \
+      this->elemNum = 1;                                        \
+      this->data.p = &defaultData;                              \
+      defaultData = 0ull;                                       \
+      *this->data.FIELD = FIELD;                                \
+    }
+
+    DECL_CONSTRUCTOR(bool, b, TYPE_BOOL)
+    DECL_CONSTRUCTOR(int8_t, s8, TYPE_S8)
+    DECL_CONSTRUCTOR(uint8_t, u8, TYPE_U8)
+    DECL_CONSTRUCTOR(int16_t, s16, TYPE_S16)
+    DECL_CONSTRUCTOR(uint16_t, u16, TYPE_S16)
+    DECL_CONSTRUCTOR(int32_t, s32, TYPE_S32)
+    DECL_CONSTRUCTOR(uint32_t, u32, TYPE_S32)
+    DECL_CONSTRUCTOR(int64_t, s64, TYPE_S64)
+    DECL_CONSTRUCTOR(uint64_t, u64, TYPE_S64)
+    DECL_CONSTRUCTOR(float, f32, TYPE_FLOAT)
+    DECL_CONSTRUCTOR(double, f64, TYPE_DOUBLE)
+#undef DECL_CONSTRUCTOR
+
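+    /* Array constructors: payloads that fit into the 8-byte defaultData buffer
+       are stored inline, larger ones are malloc'ed and released in the
+       destructor. */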
+#define DECL_CONSTRUCTOR(TYPE, FIELD, IR_TYPE, ELEMNUM)         \
+    Immediate(TYPE *FIELD, uint32_t ELEMNUM) {                  \
+      this->type = (ImmType)IR_TYPE;                            \
+      this->elemNum = ELEMNUM;                                  \
+      if (ELEMNUM * getTypeSize() > 8)                          \
+        this->data.p = malloc(ELEMNUM * getTypeSize());         \
+      else                                                      \
+        this->data.p = &defaultData;                            \
+      defaultData = 0ull;                                       \
+      memcpy(this->data.FIELD, FIELD, ELEMNUM * getTypeSize()); \
+    }
+
+    DECL_CONSTRUCTOR(bool, b, TYPE_BOOL, elemNum)
+    DECL_CONSTRUCTOR(int8_t, s8, TYPE_S8, elemNum)
+    DECL_CONSTRUCTOR(uint8_t, u8, TYPE_U8, elemNum)
+    DECL_CONSTRUCTOR(int16_t, s16, TYPE_S16, elemNum)
+    DECL_CONSTRUCTOR(uint16_t, u16, TYPE_S16, elemNum)
+    DECL_CONSTRUCTOR(int32_t, s32, TYPE_S32, elemNum)
+    DECL_CONSTRUCTOR(uint32_t, u32, TYPE_S32, elemNum)
+    DECL_CONSTRUCTOR(int64_t, s64, TYPE_S64, elemNum)
+    DECL_CONSTRUCTOR(uint64_t, u64, TYPE_S64, elemNum)
+    DECL_CONSTRUCTOR(float, f32, TYPE_FLOAT, elemNum)
+    DECL_CONSTRUCTOR(double, f64, TYPE_DOUBLE, elemNum)
+#undef DECL_CONSTRUCTOR
+
+    Immediate(const vector<const Immediate*> immVec);
+
+    INLINE int64_t getIntegerValue(void) const {
+      switch (type) {
+        default:
+          GBE_ASSERT(0 && "Invalid immediate type.\n");
+        case TYPE_BOOL: return *data.b;
+        case TYPE_S8:   return *data.s8;
+        case TYPE_U8:   return *data.u8;
+        case TYPE_S16:  return *data.s16;
+        case TYPE_U16:  return *data.u16;
+        case TYPE_S32:  return *data.s32;
+        case TYPE_U32:  return *data.u32;
+        case TYPE_S64:  return *data.s64;
+        case TYPE_U64:  return *data.u64;
+      }
+    }
+
+    INLINE float getFloatValue(void) const {
+      GBE_ASSERT(type == IMM_TYPE_FLOAT);
+      return *data.f32;
+    }
+
+    INLINE float asFloatValue(void) const {
+      GBE_ASSERT(type == IMM_TYPE_FLOAT || type == IMM_TYPE_U32 || type == IMM_TYPE_S32);
+      return *data.f32;
+    }
+
+    INLINE int64_t asIntegerValue(void) const {
+      GBE_ASSERT(elemNum == 1);
+      return *data.s64;
+    }
+
+    INLINE double getDoubleValue(void) const {
+      GBE_ASSERT(type == IMM_TYPE_DOUBLE);
+      return *data.f64;
+    }
+   
+    INLINE Immediate(const Immediate & other) {
+      *this = other;
+    }
+
+    Immediate(ImmOpCode op, const Immediate &other, Type dstType) {
+      if (op == IMM_TRUNC) {
+        copy(other, 0, 1);
+      } else if (op == IMM_BITCAST) {
+        *this = other;
+        type = (ImmType)dstType;
+      }
+    }
+
+    Immediate(ImmOpCode op, const Immediate &left, const Immediate &right, Type dstType);
+
+    ~Immediate() {
+      if (data.p != &defaultData) {
+        free(data.p);
+        data.p = NULL;
+      }
+    }
+
+  private:
+    union {
+      bool *b;
+      int8_t *s8;
+      uint8_t *u8;
+      int16_t *s16;
+      uint16_t *u16;
+      int32_t *s32;
+      uint32_t *u32;
+      int64_t *s64;
+      uint64_t *u64;
+      float *f32;
+      double *f64;
+      const Immediate *immVec[];
+      void *p;
+    } data;     //!< Value to store
+    ImmType type;  //!< Type of the value
+    uint32_t elemNum; //!< vector imm data type
+    uint64_t defaultData;
+    Immediate & operator= (const Immediate &);
+    Immediate operator+ (const Immediate &) const; 
+    Immediate operator- (const Immediate &) const; 
+    Immediate operator* (const Immediate &) const; 
+    Immediate operator/ (const Immediate &) const; 
+    Immediate operator% (const Immediate &) const; 
+    Immediate operator& (const Immediate &) const; 
+    Immediate operator| (const Immediate &) const; 
+    Immediate operator^ (const Immediate &) const; 
+    Immediate operator<< (const Immediate &) const; 
+    Immediate operator>> (const Immediate &) const; 
+    static Immediate lshr (const Immediate &left, const Immediate &right);
+
+
+    void copy(const Immediate &other, int32_t offset, uint32_t num);
+    GBE_CLASS(Immediate);
+  };
+
+  /*! Compare two immediates */
+  INLINE bool operator< (const Immediate &imm0, const Immediate &imm1) {
+    if (imm0.getType() != imm1.getType())
+      return uint32_t(imm0.getType()) < uint32_t(imm1.getType());
+    else if (imm0.getType() == TYPE_FLOAT || imm0.getType() == TYPE_DOUBLE)
+      return imm0.asIntegerValue() < imm1.asIntegerValue();
+    else
+      return imm0.getIntegerValue() < imm1.getIntegerValue();
+  }
+
+  /*! A value is stored in a per-function vector. This is the index to it */
+  TYPE_SAFE(ImmediateIndex, uint16_t)
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_IMMEDIATE_HPP__ */
+
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
new file mode 100644
index 0000000..5fc1535
--- /dev/null
+++ b/backend/src/ir/instruction.cpp
@@ -0,0 +1,1684 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file instruction.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "ir/instruction.hpp"
+#include "ir/function.hpp"
+
+namespace gbe {
+namespace ir {
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Concrete implementations of the instruction classes. We
+  // cast an instruction to an internal class to run the given member function
+  ///////////////////////////////////////////////////////////////////////////
+  namespace internal
+  {
+#define ALIGNED_INSTRUCTION ALIGNED(ALIGNOF(Instruction))
+
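+    // The policy structs below rely on the CRTP: static_cast<T*>(this) lets
+    // each concrete instruction supply its own src/dst storage while sharing
+    // the accessor implementations defined here.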
+    /*! Policy shared by all the internal instructions */
+    struct BasePolicy {
+      /*! Create an instruction from its internal representation */
+      Instruction convert(void) const {
+        return Instruction(reinterpret_cast<const char *>(&this->opcode));
+      }
+      /*! Output the opcode in the given stream */
+      INLINE void outOpcode(std::ostream &out) const {
+        switch (opcode) {
+#define DECL_INSN(OPCODE, CLASS) case OP_##OPCODE: out << #OPCODE; break;
+#include "instruction.hxx"
+#undef DECL_INSN
+          case OP_INVALID: NOT_SUPPORTED; break;
+        };
+      }
+
+      /*! Instruction opcode */
+      Opcode opcode;
+    };
+
+    /*! For regular n source instructions */
+    template <typename T, uint32_t srcNum>
+    struct NSrcPolicy {
+      INLINE uint32_t getSrcNum(void) const { return srcNum; }
+      INLINE Register getSrc(const Function &fn, uint32_t ID) const {
+        GBE_ASSERTM((int) ID < (int) srcNum, "Out-of-bound source");
+        return static_cast<const T*>(this)->src[ID];
+      }
+      INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
+        GBE_ASSERTM((int) ID < (int) srcNum, "Out-of-bound source");
+        static_cast<T*>(this)->src[ID] = reg;
+      }
+    };
+
+    /*! For regular n destinations instructions */
+    template <typename T, uint32_t dstNum>
+    struct NDstPolicy {
+      INLINE uint32_t getDstNum(void) const { return dstNum; }
+      INLINE Register getDst(const Function &fn, uint32_t ID) const {
+        GBE_ASSERTM((int) ID < (int) dstNum, "Out-of-bound destination");
+        return static_cast<const T*>(this)->dst[ID];
+      }
+      INLINE void setDst(Function &fn, uint32_t ID, Register reg) {
+        GBE_ASSERTM((int) ID < (int) dstNum, "Out-of-bound destination");
+        static_cast<T*>(this)->dst[ID] = reg;
+      }
+    };
+
+    /*! For instructions that use a tuple for source */
+    template <typename T>
+    struct TupleSrcPolicy {
+      INLINE uint32_t getSrcNum(void) const {
+        return static_cast<const T*>(this)->srcNum;
+      }
+      INLINE Register getSrc(const Function &fn, uint32_t ID) const {
+        GBE_ASSERTM(ID < static_cast<const T*>(this)->srcNum, "Out-of-bound source register");
+        return fn.getRegister(static_cast<const T*>(this)->src, ID);
+      }
+      INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
+        GBE_ASSERTM(ID < static_cast<const T*>(this)->srcNum, "Out-of-bound source register");
+        return fn.setRegister(static_cast<T*>(this)->src, ID, reg);
+      }
+    };
+
+    /*! For instructions that use a tuple for destination */
+    template <typename T>
+    struct TupleDstPolicy {
+      INLINE uint32_t getDstNum(void) const {
+        return static_cast<const T*>(this)->dstNum;
+      }
+      INLINE Register getDst(const Function &fn, uint32_t ID) const {
+        GBE_ASSERTM(ID < static_cast<const T*>(this)->dstNum, "Out-of-bound destination register");
+        return fn.getRegister(static_cast<const T*>(this)->dst, ID);
+      }
+      INLINE void setDst(Function &fn, uint32_t ID, Register reg) {
+        GBE_ASSERTM(ID < static_cast<const T*>(this)->dstNum, "Out-of-bound destination register");
+        return fn.setRegister(static_cast<T*>(this)->dst, ID, reg);
+      }
+    };
+
+    /*! All unary and binary arithmetic instructions */
+    template <uint32_t srcNum> // 1 or 2
+    class ALIGNED_INSTRUCTION NaryInstruction :
+      public BasePolicy,
+      public NSrcPolicy<NaryInstruction<srcNum>, srcNum>,
+      public NDstPolicy<NaryInstruction<1>, 1>
+    {
+    public:
+      INLINE Type getType(void) const { return this->type; }
+      INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
+      INLINE void out(std::ostream &out, const Function &fn) const;
+      Type type;            //!< Type of the instruction
+      Register dst[1];      //!< Index of the register in the register file
+      Register src[srcNum]; //!< Indices of the sources
+    };
+
+    /*! All 1-source arithmetic instructions */
+    class ALIGNED_INSTRUCTION UnaryInstruction : public NaryInstruction<1>
+    {
+    public:
+      UnaryInstruction(Opcode opcode, Type type, Register dst, Register src) {
+        this->opcode = opcode;
+        this->type = type;
+        this->dst[0] = dst;
+        this->src[0] = src;
+      }
+    };
+
+    /*! All 2-source arithmetic instructions */
+    class ALIGNED_INSTRUCTION BinaryInstruction : public NaryInstruction<2>
+    {
+    public:
+      BinaryInstruction(Opcode opcode,
+                        Type type,
+                        Register dst,
+                        Register src0,
+                        Register src1) {
+        this->opcode = opcode;
+        this->type = type;
+        this->dst[0] = dst;
+        this->src[0] = src0;
+        this->src[1] = src1;
+      }
+      INLINE bool commutes(void) const {
+        switch (opcode) {
+          case OP_ADD:
+          case OP_ADDSAT:
+          case OP_XOR:
+          case OP_OR:
+          case OP_AND:
+          case OP_MUL:
+            return true;
+          default:
+            return false;
+        }
+      }
+    };
+
+    class ALIGNED_INSTRUCTION TernaryInstruction :
+      public BasePolicy,
+      public NDstPolicy<TernaryInstruction, 1>,
+      public TupleSrcPolicy<TernaryInstruction>
+    {
+     public:
+      TernaryInstruction(Opcode opcode,
+                         Type type,
+                         Register dst,
+                         Tuple src) {
+        this->opcode = opcode;
+        this->type = type;
+        this->dst[0] = dst;
+        this->src = src;
+      }
+      Type getType(void) const { return type; }
+      bool wellFormed(const Function &fn, std::string &whyNot) const;
+      INLINE void out(std::ostream &out, const Function &fn) const;
+      Type type;
+      Register dst[1];
+      Tuple src;
+      static const uint32_t srcNum = 3;
+    };
+
+    /*! Three sources mean we need a tuple to encode it */
+    class ALIGNED_INSTRUCTION SelectInstruction :
+      public BasePolicy,
+      public NDstPolicy<SelectInstruction, 1>,
+      public TupleSrcPolicy<SelectInstruction>
+    {
+    public:
+      SelectInstruction(Type type, Register dst, Tuple src) {
+        this->opcode = OP_SEL;
+        this->type = type;
+        this->dst[0] = dst;
+        this->src = src;
+      }
+      INLINE Type getType(void) const { return this->type; }
+      INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
+      INLINE void out(std::ostream &out, const Function &fn) const;
+      Type type;       //!< Type of the instruction
+      Register dst[1]; //!< Dst is the register index
+      Tuple src;       //!< 3 sources do not fit in 8 bytes -> use a tuple
+      static const uint32_t srcNum = 3;
+    };
+
+    /*! Comparison instructions take two sources of the same type and return a
+     *  boolean value. Since it is pretty similar to binary instruction, we
+     *  steal all the methods from it, except wellFormed (dst register is always
+     *  a boolean value)
+     */
+    class ALIGNED_INSTRUCTION CompareInstruction :
+      public NaryInstruction<2>
+    {
+    public:
+      CompareInstruction(Opcode opcode,
+                         Type type,
+                         Register dst,
+                         Register src0,
+                         Register src1)
+      {
+        this->opcode = opcode;
+        this->type = type;
+        this->dst[0] = dst;
+        this->src[0] = src0;
+        this->src[1] = src1;
+      }
+      INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
+    };
+
+    class ALIGNED_INSTRUCTION BitCastInstruction :
+      public BasePolicy,
+      public TupleSrcPolicy<BitCastInstruction>,
+      public TupleDstPolicy<BitCastInstruction>
+    {
+    public:
+      BitCastInstruction(Type dstType,
+                         Type srcType,
+                         Tuple dst,
+                         Tuple src,
+                         uint8_t dstNum,
+                         uint8_t srcNum)
+      {
+        this->opcode = OP_BITCAST;
+        this->dst = dst;
+        this->src = src;
+        this->dstFamily = getFamily(dstType);
+        this->srcFamily = getFamily(srcType);
+        GBE_ASSERT(srcNum <= 16 && dstNum <= 16);
+        this->dstNum = dstNum;
+        this->srcNum = srcNum;
+      }
+      INLINE Type getSrcType(void) const { return getType((RegisterFamily)srcFamily); }
+      INLINE Type getDstType(void) const { return getType((RegisterFamily)dstFamily); }
+      INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
+      INLINE void out(std::ostream &out, const Function &fn) const;
+      uint8_t dstFamily:4; //!< family to cast to
+      uint8_t srcFamily:4; //!< family to cast from
+      Tuple dst;
+      Tuple src;
+      uint8_t dstNum;     //!<Dst Number
+      uint8_t srcNum;     //!<Src Number
+    };
+
+    class ALIGNED_INSTRUCTION ConvertInstruction :
+      public BasePolicy,
+      public NDstPolicy<ConvertInstruction, 1>,
+      public NSrcPolicy<ConvertInstruction, 1>
+    {
+    public:
+      ConvertInstruction(Opcode opcode,
+                         Type dstType,
+                         Type srcType,
+                         Register dst,
+                         Register src)
+      {
+        this->opcode = opcode;
+        this->dst[0] = dst;
+        this->src[0] = src;
+        this->dstType = dstType;
+        this->srcType = srcType;
+      }
+      INLINE Type getSrcType(void) const { return this->srcType; }
+      INLINE Type getDstType(void) const { return this->dstType; }
+      INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
+      INLINE void out(std::ostream &out, const Function &fn) const;
+      Register dst[1];
+      Register src[1];
+      Type dstType; //!< Type to convert to
+      Type srcType; //!< Type to convert from
+    };
+
+    class ALIGNED_INSTRUCTION AtomicInstruction :
+      public BasePolicy,
+      public TupleSrcPolicy<AtomicInstruction>,
+      public NDstPolicy<AtomicInstruction, 1>
+    {
+    public:
+      AtomicInstruction(AtomicOps atomicOp,
+                         Register dst,
+                         AddressSpace addrSpace,
+                         BTI bti,
+                         Tuple src)
+      {
+        this->opcode = OP_ATOMIC;
+        this->atomicOp = atomicOp;
+        this->dst[0] = dst;
+        this->src = src;
+        this->addrSpace = addrSpace;
+        this->bti = bti;
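+        // INC/DEC only take the address; CMPXCHG takes the address, the
+        // compare value and the new value; every other atomic takes the
+        // address plus one operand.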
+        srcNum = 2;
+        if((atomicOp == ATOMIC_OP_INC) ||
+          (atomicOp == ATOMIC_OP_DEC))
+          srcNum = 1;
+        if(atomicOp == ATOMIC_OP_CMPXCHG)
+          srcNum = 3;
+      }
+      INLINE AddressSpace getAddressSpace(void) const { return this->addrSpace; }
+      INLINE BTI getBTI(void) const { return bti; }
+      INLINE AtomicOps getAtomicOpcode(void) const { return this->atomicOp; }
+      INLINE bool wellFormed(const Function &fn, std::string &whyNot) const;
+      INLINE void out(std::ostream &out, const Function &fn) const;
+      Register dst[1];
+      Tuple src;
+      AddressSpace addrSpace; //!< Address space
+      BTI bti;               //!< bti
+      uint8_t srcNum:2;     //!<Source Number
+      AtomicOps atomicOp:6;     //!< Atomic operation
+    };
+
+    class ALIGNED_INSTRUCTION BranchInstruction :
+      public BasePolicy,
+      public NDstPolicy<BranchInstruction, 0>
+    {
+    public:
+      INLINE BranchInstruction(Opcode op, LabelIndex labelIndex, Register predicate) {
+        GBE_ASSERT(op == OP_BRA);
+        this->opcode = op;
+        this->predicate = predicate;
+        this->labelIndex = labelIndex;
+        this->hasPredicate = true;
+        this->hasLabel = true;
+      }
+      INLINE BranchInstruction(Opcode op, LabelIndex labelIndex) {
+        GBE_ASSERT(op == OP_BRA);
+        this->opcode = OP_BRA;
+        this->labelIndex = labelIndex;
+        this->hasPredicate = false;
+        this->hasLabel = true;
+      }
+      INLINE BranchInstruction(Opcode op) {
+        GBE_ASSERT(op == OP_RET);
+        this->opcode = OP_RET;
+        this->hasPredicate = false;
+        this->hasLabel = false;
+      }
+      INLINE LabelIndex getLabelIndex(void) const {
+        GBE_ASSERTM(hasLabel, "No target label for this branch instruction");
+        return labelIndex;
+      }
+      INLINE uint32_t getSrcNum(void) const { return hasPredicate ? 1 : 0; }
+      INLINE Register getSrc(const Function &fn, uint32_t ID) const {
+        GBE_ASSERTM(hasPredicate, "No source for unpredicated branches");
+        GBE_ASSERTM(ID == 0, "Only one source for the branch instruction");
+        return predicate;
+      }
+      INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
+        GBE_ASSERTM(hasPredicate, "No source for unpredicated branches");
+        GBE_ASSERTM(ID == 0, "Only one source for the branch instruction");
+        predicate = reg;
+      }
+      INLINE bool isPredicated(void) const { return hasPredicate; }
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const;
+      Register predicate;    //!< Predication means conditional branch
+      LabelIndex labelIndex; //!< Index of the label the branch targets
+      bool hasPredicate:1;   //!< Is it predicated?
+      bool hasLabel:1;       //!< Is there any target label?
+      Register dst[0];       //!< No destination
+    };
+
+    class ALIGNED_INSTRUCTION LoadInstruction :
+      public BasePolicy,
+      public NSrcPolicy<LoadInstruction, 1>
+    {
+    public:
+      LoadInstruction(Type type,
+                      Tuple dstValues,
+                      Register offset,
+                      AddressSpace addrSpace,
+                      uint32_t valueNum,
+                      bool dwAligned,
+                      BTI bti)
+      {
+        GBE_ASSERT(valueNum < 128);
+        this->opcode = OP_LOAD;
+        this->type = type;
+        this->offset = offset;
+        this->values = dstValues;
+        this->addrSpace = addrSpace;
+        this->valueNum = valueNum;
+        this->dwAligned = dwAligned ? 1 : 0;
+        this->bti = bti;
+      }
+      INLINE Register getDst(const Function &fn, uint32_t ID) const {
+        GBE_ASSERTM(ID < valueNum, "Out-of-bound destination register");
+        return fn.getRegister(values, ID);
+      }
+      INLINE void setDst(Function &fn, uint32_t ID, Register reg) {
+        GBE_ASSERTM(ID < valueNum, "Out-of-bound destination register");
+        fn.setRegister(values, ID, reg);
+      }
+      INLINE uint32_t getDstNum(void) const { return valueNum; }
+      INLINE Type getValueType(void) const { return type; }
+      INLINE uint32_t getValueNum(void) const { return valueNum; }
+      INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
+      INLINE BTI getBTI(void) const { return bti; }
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const;
+      INLINE bool isAligned(void) const { return !!dwAligned; }
+      Type type;              //!< Type to store
+      Register src[0];        //!< Address where to load from
+      Register offset;        //!< Alias to make it similar to store
+      Tuple values;           //!< Values to load
+      AddressSpace addrSpace; //!< Where to load
+      BTI bti;
+      uint8_t valueNum:7;     //!< Number of values to load
+      uint8_t dwAligned:1;    //!< DWORD aligned is what matters with GEN
+    };
+
+    class ALIGNED_INSTRUCTION StoreInstruction :
+      public BasePolicy, public NDstPolicy<StoreInstruction, 0>
+    {
+    public:
+      StoreInstruction(Type type,
+                       Tuple values,
+                       Register offset,
+                       AddressSpace addrSpace,
+                       uint32_t valueNum,
+                       bool dwAligned,
+                       BTI bti)
+      {
+        GBE_ASSERT(valueNum < 255);
+        this->opcode = OP_STORE;
+        this->type = type;
+        this->offset = offset;
+        this->values = values;
+        this->addrSpace = addrSpace;
+        this->valueNum = valueNum;
+        this->dwAligned = dwAligned ? 1 : 0;
+        this->bti = bti;
+      }
+      INLINE Register getSrc(const Function &fn, uint32_t ID) const {
+        GBE_ASSERTM(ID < valueNum + 1u, "Out-of-bound source register for store");
+        if (ID == 0u)
+          return offset;
+        else
+          return fn.getRegister(values, ID - 1);
+      }
+      INLINE void setSrc(Function &fn, uint32_t ID, Register reg) {
+        GBE_ASSERTM(ID < valueNum + 1u, "Out-of-bound source register for store");
+        if (ID == 0u)
+          offset = reg;
+        else
+          fn.setRegister(values, ID - 1, reg);
+      }
+      INLINE uint32_t getSrcNum(void) const { return valueNum + 1u; }
+      INLINE uint32_t getValueNum(void) const { return valueNum; }
+      INLINE Type getValueType(void) const { return type; }
+      INLINE AddressSpace getAddressSpace(void) const { return addrSpace; }
+      INLINE BTI getBTI(void) const { return bti; }
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const;
+      INLINE bool isAligned(void) const { return !!dwAligned; }
+      Type type;              //!< Type to store
+      Register offset;        //!< First source is the offset where to store
+      Tuple values;           //!< Values to store
+      AddressSpace addrSpace; //!< Where to store
+      BTI bti;                //!< Which btis need access
+      uint8_t valueNum:7;     //!< Number of values to store
+      uint8_t dwAligned:1;    //!< DWORD aligned is what matters with GEN
+      Register dst[0];        //!< No destination
+    };
+
+    class ALIGNED_INSTRUCTION SampleInstruction : // TODO
+      public BasePolicy,
+      public TupleSrcPolicy<SampleInstruction>,
+      public TupleDstPolicy<SampleInstruction>
+    {
+    public:
+      SampleInstruction(uint8_t imageIdx, Tuple dstTuple, Tuple srcTuple, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset) {
+        this->opcode = OP_SAMPLE;
+        this->dst = dstTuple;
+        this->src = srcTuple;
+        this->dstIsFloat = dstIsFloat;
+        this->srcIsFloat = srcIsFloat;
+        this->samplerIdx = sampler;
+        this->imageIdx = imageIdx;
+        this->samplerOffset = samplerOffset;
+      }
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const {
+        this->outOpcode(out);
+        out << "." << this->getDstType()
+            << "." << this->getSrcType()
+            << " surface id " << (int)this->getImageIndex()
+            << " coord u %" << this->getSrc(fn, 0)
+            << " coord v %" << this->getSrc(fn, 1)
+            << " coord w %" << this->getSrc(fn, 2)
+            << " %" << this->getDst(fn, 0)
+            << " %" << this->getDst(fn, 1)
+            << " %" << this->getDst(fn, 2)
+            << " %" << this->getDst(fn, 3)
+            << " sampler idx " << (int)this->getSamplerIndex();
+      }
+      Tuple src;
+      Tuple dst;
+
+      INLINE const uint8_t getImageIndex(void) const { return this->imageIdx; }
+      INLINE Type getSrcType(void) const { return this->srcIsFloat ? TYPE_FLOAT : TYPE_S32; }
+      INLINE Type getDstType(void) const { return this->dstIsFloat ? TYPE_FLOAT : TYPE_U32; }
+      INLINE const uint8_t getSamplerIndex(void) const { return this->samplerIdx; }
+      INLINE const uint8_t getSamplerOffset(void) const { return this->samplerOffset; }
+      uint8_t srcIsFloat:1;
+      uint8_t dstIsFloat:1;
+      uint8_t samplerIdx:4;
+      uint8_t samplerOffset:2;
+      uint8_t imageIdx;
+      static const uint32_t srcNum = 3;
+      static const uint32_t dstNum = 4;
+    };
+
+    class ALIGNED_INSTRUCTION TypedWriteInstruction : // TODO
+      public BasePolicy,
+      public TupleSrcPolicy<TypedWriteInstruction>,
+      public NDstPolicy<TypedWriteInstruction, 0>
+    {
+    public:
+
+      INLINE TypedWriteInstruction(uint8_t imageIdx, Tuple srcTuple, Type srcType, Type coordType) {
+        this->opcode = OP_TYPED_WRITE;
+        this->src = srcTuple;
+        this->coordType = coordType;
+        this->srcType = srcType;
+        this->imageIdx = imageIdx;
+      }
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const {
+        this->outOpcode(out);
+        out << "." << this->getSrcType()
+            << " surface id " << (int)this->getImageIndex()
+            << " coord u %" << this->getSrc(fn, 0)
+            << " coord v %" << this->getSrc(fn, 1)
+            << " coord w %" << this->getSrc(fn, 2)
+            << " %" << this->getSrc(fn, 3)
+            << " %" << this->getSrc(fn, 4)
+            << " %" << this->getSrc(fn, 5)
+            << " %" << this->getSrc(fn, 6);
+      }
+
+      Tuple src;
+      uint8_t srcType;
+      uint8_t coordType;
+      uint8_t imageIdx;
+
+      INLINE const uint8_t getImageIndex(void) const { return this->imageIdx; }
+      INLINE Type getSrcType(void) const { return (Type)this->srcType; }
+      INLINE Type getCoordType(void) const { return (Type)this->coordType; }
+      // u, v, w coordinates followed by 4 data elements
+      static const uint32_t srcNum = 7;
+      Register dst[0];               //!< No dest register
+    };
+
+    class ALIGNED_INSTRUCTION GetImageInfoInstruction :
+      public BasePolicy,
+      public NSrcPolicy<GetImageInfoInstruction, 1>,
+      public NDstPolicy<GetImageInfoInstruction, 1>
+    {
+    public:
+      GetImageInfoInstruction( int type,
+                               Register dst,
+                               uint8_t imageIdx,
+                               Register infoReg)
+      {
+        this->opcode = OP_GET_IMAGE_INFO;
+        this->infoType = type;
+        this->dst[0] = dst;
+        this->src[0] = infoReg;
+        this->imageIdx = imageIdx;
+      }
+
+      INLINE uint32_t getInfoType(void) const { return infoType; }
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const {
+        this->outOpcode(out);
+        out << "." << this->getInfoType()
+            << " %" << this->getDst(fn, 0)
+            << " surface id " << (int)this->getImageIndex()
+            << " info reg %" << this->getSrc(fn, 0);
+      }
+
+      INLINE const uint8_t getImageIndex(void) const { return imageIdx; }
+
+      uint8_t infoType;                 //!< Type of the requested information.
+      uint8_t imageIdx;                //!< surface index.
+      Register src[1];                  //!< surface info register.
+      Register dst[1];                  //!< dest register to put the information.
+      static const uint32_t dstNum = 1;
+    };
+
+    class ALIGNED_INSTRUCTION LoadImmInstruction :
+      public BasePolicy,
+      public NSrcPolicy<LoadImmInstruction, 0>,
+      public NDstPolicy<LoadImmInstruction, 1>
+    {
+    public:
+      INLINE LoadImmInstruction(Type type, Register dst, ImmediateIndex index)
+      {
+        this->dst[0] = dst;
+        this->opcode = OP_LOADI;
+        this->immediateIndex = index;
+        this->type = type;
+      }
+      INLINE Immediate getImmediate(const Function &fn) const {
+        return fn.getImmediate(immediateIndex);
+      }
+      INLINE Type getType(void) const { return this->type; }
+      bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const;
+      Register dst[1];               //!< RegisterData to store into
+      Register src[0];               //!< No source register
+      ImmediateIndex immediateIndex; //!< Index in the vector of immediates
+      Type type;                     //!< Type of the immediate
+    };
+
+    class ALIGNED_INSTRUCTION SyncInstruction :
+      public BasePolicy,
+      public NSrcPolicy<SyncInstruction, 0>,
+      public NDstPolicy<SyncInstruction, 0>
+    {
+    public:
+      INLINE SyncInstruction(uint32_t parameters) {
+        this->opcode = OP_SYNC;
+        this->parameters = parameters;
+      }
+      INLINE uint32_t getParameters(void) const { return this->parameters; }
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const;
+      uint32_t parameters;
+      Register dst[0], src[0];
+    };
+
+    class ALIGNED_INSTRUCTION LabelInstruction :
+      public BasePolicy,
+      public NSrcPolicy<LabelInstruction, 0>,
+      public NDstPolicy<LabelInstruction, 0>
+    {
+    public:
+      INLINE LabelInstruction(LabelIndex labelIndex) {
+        this->opcode = OP_LABEL;
+        this->labelIndex = labelIndex;
+      }
+      INLINE LabelIndex getLabelIndex(void) const { return labelIndex; }
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const;
+      LabelIndex labelIndex;  //!< Index of the label
+      Register dst[0], src[0];
+    };
+
+#undef ALIGNED_INSTRUCTION
+
+    /////////////////////////////////////////////////////////////////////////
+    // Implements all the wellFormed methods
+    /////////////////////////////////////////////////////////////////////////
+
+    /*! All Nary instruction registers must be of the same family and properly
+     *  defined (i.e. not out-of-bound)
+     */
+    static INLINE bool checkRegisterData(RegisterFamily family,
+                                         const Register &ID,
+                                         const Function &fn,
+                                         std::string &whyNot)
+    {
+      if (UNLIKELY(uint16_t(ID) >= fn.regNum())) {
+        whyNot = "Out-of-bound destination register index";
+        return false;
+      }
+      const RegisterData reg = fn.getRegisterData(ID);
+      if (UNLIKELY(reg.family != family)) {
+        whyNot = "Destination family does not match instruction type";
+        return false;
+      }
+      return true;
+    }
+
+    /*! Special registers are *not* writeable */
+    static INLINE bool checkSpecialRegForWrite(const Register &reg,
+                                               const Function &fn,
+                                               std::string &whyNot)
+    {
+      if (fn.isSpecialReg(reg) == true && reg != ir::ocl::stackptr) {
+        whyNot = "Non stack pointer special registers are not writeable";
+        return false;
+      }
+      return true;
+    }
+
+    /*! We check that the given type belongs to the provided type family */
+    static INLINE bool checkTypeFamily(const Type &type,
+                                       const Type *family,
+                                       uint32_t typeNum,
+                                       std::string &whyNot)
+    {
+      uint32_t typeID = 0;
+      for (; typeID < typeNum; ++typeID)
+        if (family[typeID] == type)
+          break;
+      if (typeID == typeNum) {
+        whyNot = "Type is not supported by the instruction";
+        return false;
+      }
+      return true;
+    }
+
+#define CHECK_TYPE(TYPE, FAMILY) \
+  do { \
+    if (UNLIKELY(checkTypeFamily(TYPE, FAMILY, FAMILY##Num, whyNot) == false)) \
+      return false; \
+  } while (0)
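+
+    /* A rough expansion sketch of the macro above: a call such as
+     *   CHECK_TYPE(this->type, logicalType);
+     * expands approximately to
+     *   do {
+     *     if (UNLIKELY(checkTypeFamily(this->type, logicalType, logicalTypeNum, whyNot) == false))
+     *       return false;
+     *   } while (0);
+     * the FAMILY##Num token pasting picks up the matching *Num constant
+     * defined just below.
+     */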
+
+    static const Type madType[] = {TYPE_FLOAT};
+    static const uint32_t madTypeNum = ARRAY_ELEM_NUM(madType);
+
+    // TODO add support for 64 bits values
+    static const Type allButBool[] = {TYPE_S8,  TYPE_U8,
+                                      TYPE_S16, TYPE_U16,
+                                      TYPE_S32, TYPE_U32,
+                                      TYPE_S64, TYPE_U64,
+                                      TYPE_FLOAT, TYPE_DOUBLE};
+    static const uint32_t allButBoolNum = ARRAY_ELEM_NUM(allButBool);
+
+    // TODO add support for 64 bits values
+    static const Type logicalType[] = {TYPE_S8,  TYPE_U8,
+                                       TYPE_S16, TYPE_U16,
+                                       TYPE_S32, TYPE_U32,
+                                       TYPE_S64, TYPE_U64,
+                                       TYPE_BOOL};
+    static const uint32_t logicalTypeNum = ARRAY_ELEM_NUM(logicalType);
+
+    // Unary and binary instructions share the same rules
+    template <uint32_t srcNum>
+    INLINE bool NaryInstruction<srcNum>::wellFormed(const Function &fn, std::string &whyNot) const
+    {
+      const RegisterFamily family = getFamily(this->type);
+      if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+        return false;
+      if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false))
+        return false;
+      for (uint32_t srcID = 0; srcID < srcNum; ++srcID)
+        if (UNLIKELY(checkRegisterData(family, src[srcID], fn, whyNot) == false))
+          return false;
+      // We actually support logical operations on boolean values for AND, OR,
+      // and XOR
+      switch (this->opcode) {
+        case OP_OR:
+        case OP_XOR:
+        case OP_AND:
+          CHECK_TYPE(this->type, logicalType);
+          break;
+        default:
+          CHECK_TYPE(this->type, allButBool);
+          break;
+        case OP_MOV:
+          break;
+        case OP_POW:
+        case OP_COS:
+        case OP_SIN:
+        case OP_RCP:
+        case OP_ABS:
+        case OP_RSQ:
+        case OP_SQR:
+        case OP_RNDD:
+        case OP_RNDE:
+        case OP_RNDU:
+        case OP_RNDZ:
+          const Type fp = TYPE_FLOAT;
+          if (UNLIKELY(checkTypeFamily(this->type, &fp, 1, whyNot) == false))
+            return false;
+          break;
+      }
+      return true;
+    }
+
+    // The first source must be a boolean. The others must match the destination type
+    INLINE bool SelectInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+    {
+      const RegisterFamily family = getFamily(this->type);
+      if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+        return false;
+      if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false))
+        return false;
+      if (UNLIKELY(src + 3u > fn.tupleNum())) {
+        whyNot = "Out-of-bound index for ternary instruction";
+        return false;
+      }
+      const Register regID = fn.getRegister(src, 0);
+      if (UNLIKELY(checkRegisterData(FAMILY_BOOL, regID, fn, whyNot) == false))
+        return false;
+      for (uint32_t srcID = 1; srcID < 3; ++srcID) {
+        const Register regID = fn.getRegister(src, srcID);
+        if (UNLIKELY(checkRegisterData(family, regID, fn, whyNot) == false))
+          return false;
+      }
+      CHECK_TYPE(this->type, allButBool);
+      return true;
+    }
+
+    // Pretty similar to binary instruction. Only the destination is of type
+    // boolean
+    INLINE bool CompareInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+    {
+      if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+        return false;
+      if (UNLIKELY(checkRegisterData(FAMILY_BOOL, dst[0], fn, whyNot) == false))
+        return false;
+      const RegisterFamily family = getFamily(this->type);
+      for (uint32_t srcID = 0; srcID < 2; ++srcID)
+        if (UNLIKELY(checkRegisterData(family, src[srcID], fn, whyNot) == false))
+          return false;
+      CHECK_TYPE(this->type, allButBool);
+      return true;
+    }
+
+    // The total bit sizes of src and dst must be identical. Booleans are not supported for now (this still needs a double check).
+    INLINE bool BitCastInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+    {
+      for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+        if (UNLIKELY(checkSpecialRegForWrite(getDst(fn, dstID), fn, whyNot) == false))
+          return false;
+        if (UNLIKELY(checkRegisterData((RegisterFamily)dstFamily, getDst(fn, dstID), fn, whyNot) == false))
+          return false;
+      }
+      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+        if (UNLIKELY(checkRegisterData((RegisterFamily)srcFamily, getSrc(fn, srcID), fn, whyNot) == false))
+          return false;
+      }
+
+      CHECK_TYPE(getType((RegisterFamily)dstFamily), allButBool);
+      CHECK_TYPE(getType((RegisterFamily)srcFamily), allButBool);
+
+      uint32_t dstBytes = 0, srcBytes = 0;
+      dstBytes = dstNum * getFamilySize((RegisterFamily)dstFamily);
+      srcBytes = srcNum * getFamilySize((RegisterFamily)srcFamily);
+
+      if (dstBytes != srcBytes) {
+        whyNot = "The bit sizes of src and dst are not identical.";
+        return false;
+      }
+
+      return true;
+    }
+
+    // We can convert anything to anything, but types and families must match
+    INLINE bool ConvertInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+    {
+      const RegisterFamily dstFamily = getFamily(dstType);
+      const RegisterFamily srcFamily = getFamily(srcType);
+      if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+        return false;
+      if (UNLIKELY(checkRegisterData(dstFamily, dst[0], fn, whyNot) == false))
+        return false;
+      if (UNLIKELY(checkRegisterData(srcFamily, src[0], fn, whyNot) == false))
+        return false;
+      CHECK_TYPE(this->dstType, allButBool);
+      CHECK_TYPE(this->srcType, allButBool);
+      return true;
+    }
+
+    // Atomic sources and the destination must all be of dword family
+    INLINE bool AtomicInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+    {
+      if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+        return false;
+      if (UNLIKELY(checkRegisterData(FAMILY_DWORD, dst[0], fn, whyNot) == false))
+        return false;
+      for (uint32_t srcID = 0; srcID < srcNum; ++srcID)
+        if (UNLIKELY(checkRegisterData(FAMILY_DWORD, getSrc(fn, srcID), fn, whyNot) == false))
+          return false;
+
+      return true;
+    }
+
+    INLINE bool TernaryInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+    {
+      const RegisterFamily family = getFamily(this->type);
+      if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+        return false;
+      if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false))
+        return false;
+      if (UNLIKELY(src + 3u > fn.tupleNum())) {
+        whyNot = "Out-of-bound index for ternary instruction";
+        return false;
+      }
+      for (uint32_t srcID = 0; srcID < 3; ++srcID) {
+        const Register regID = fn.getRegister(src, srcID);
+        if (UNLIKELY(checkRegisterData(family, regID, fn, whyNot) == false))
+          return false;
+      }
+      return true;
+    }
+
+    /*! Loads and stores follow the same restrictions */
+    template <typename T>
+    INLINE bool wellFormedLoadStore(const T &insn, const Function &fn, std::string &whyNot)
+    {
+      if (UNLIKELY(insn.offset >= fn.regNum())) {
+        whyNot = "Out-of-bound offset register index";
+        return false;
+      }
+      if (UNLIKELY(insn.values + insn.valueNum > fn.tupleNum())) {
+        whyNot = "Out-of-bound tuple index";
+        return false;
+      }
+      // Check all registers
+      const RegisterFamily family = getFamily(insn.type);
+      for (uint32_t valueID = 0; valueID < insn.valueNum; ++valueID) {
+        const Register regID = fn.getRegister(insn.values, valueID);
+        if (UNLIKELY(checkRegisterData(family, regID, fn, whyNot) == false))
+          return false;
+      }
+      return true;
+    }
+
+    INLINE bool LoadInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+    {
+      const uint32_t dstNum = this->getDstNum();
+      for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+        const Register reg = this->getDst(fn, dstID);
+        const bool isOK = checkSpecialRegForWrite(reg, fn, whyNot);
+        if (UNLIKELY(isOK == false)) return false;
+      }
+      if (UNLIKELY(dstNum > Instruction::MAX_DST_NUM)) {
+        whyNot = "Too many destinations for load instruction";
+        return false;
+      }
+      return wellFormedLoadStore(*this, fn, whyNot);
+    }
+
+    INLINE bool StoreInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+    {
+      const uint32_t srcNum = this->getSrcNum();
+      if (UNLIKELY(srcNum > Instruction::MAX_SRC_NUM)) {
+        whyNot = "Too many source for store instruction";
+        return false;
+      }
+      return wellFormedLoadStore(*this, fn, whyNot);
+    }
+
+    // TODO
+    INLINE bool SampleInstruction::wellFormed(const Function &fn, std::string &why) const
+    { return true; }
+    INLINE bool TypedWriteInstruction::wellFormed(const Function &fn, std::string &why) const
+    { return true; }
+    INLINE bool GetImageInfoInstruction::wellFormed(const Function &fn, std::string &why) const
+    { return true; }
+
+
+    // Ensure that types and register family match
+    INLINE bool LoadImmInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+    {
+      if (UNLIKELY(immediateIndex >= fn.immediateNum())) {
+        whyNot = "Out-of-bound immediate value index";
+        return false;
+      }
+      const ir::Type immType = fn.getImmediate(immediateIndex).getType();
+      if (UNLIKELY(type != immType)) {
+        whyNot = "Inconsistant type for the immediate value to load";
+        return false;
+      }
+      const RegisterFamily family = getFamily(type);
+      if (UNLIKELY(checkSpecialRegForWrite(dst[0], fn, whyNot) == false))
+        return false;
+      if (UNLIKELY(checkRegisterData(family, dst[0], fn, whyNot) == false))
+        return false;
+      // All immediate types are supported, so the type check is disabled
+      //CHECK_TYPE(this->type, allButBool);
+      return true;
+    }
+
+    INLINE bool SyncInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+    {
+      const uint32_t maxParams = SYNC_WORKGROUP_EXEC |
+                                 SYNC_LOCAL_READ_FENCE |
+                                 SYNC_LOCAL_WRITE_FENCE |
+                                 SYNC_GLOBAL_READ_FENCE |
+                                 SYNC_GLOBAL_WRITE_FENCE;
+      if (UNLIKELY(this->parameters > maxParams)) {
+        whyNot = "Invalid parameters for sync instruction";
+        return false;
+      } else if (UNLIKELY(this->parameters == 0)) {
+        whyNot = "Missing parameters for sync instruction";
+        return false;
+      }
+      return true;
+    }
+
+    // Only a label index is required
+    INLINE bool LabelInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+    {
+      if (UNLIKELY(labelIndex >= fn.labelNum())) {
+        whyNot = "Out-of-bound label index";
+        return false;
+      }
+      return true;
+    }
+
+    // The label must exist and the predicate register must be of boolean family
+    INLINE bool BranchInstruction::wellFormed(const Function &fn, std::string &whyNot) const {
+      if (hasLabel)
+        if (UNLIKELY(labelIndex >= fn.labelNum())) {
+          whyNot = "Out-of-bound label index";
+          return false;
+        }
+      if (hasPredicate)
+        if (UNLIKELY(checkRegisterData(FAMILY_BOOL, predicate, fn, whyNot) == false))
+          return false;
+      return true;
+    }
+
+#undef CHECK_TYPE
+
+    /////////////////////////////////////////////////////////////////////////
+    // Implements all the output stream methods
+    /////////////////////////////////////////////////////////////////////////
+    template <uint32_t srcNum>
+    INLINE void NaryInstruction<srcNum>::out(std::ostream &out, const Function &fn) const {
+      this->outOpcode(out);
+      out << "." << this->getType()
+          << " %" << this->getDst(fn, 0);
+      for (uint32_t i = 0; i < srcNum; ++i)
+        out << " %" << this->getSrc(fn, i);
+    }
+
+    template <typename T>
+    static void ternaryOrSelectOut(const T &insn, std::ostream &out, const Function &fn) {
+      insn.outOpcode(out);
+      out << "." << insn.getType()
+          << " %" << insn.getDst(fn, 0)
+          << " %" << insn.getSrc(fn, 0)
+          << " %" << insn.getSrc(fn, 1)
+          << " %" << insn.getSrc(fn, 2);
+    }
+
+    INLINE void SelectInstruction::out(std::ostream &out, const Function &fn) const {
+      ternaryOrSelectOut(*this, out, fn);
+    }
+
+    INLINE void TernaryInstruction::out(std::ostream &out, const Function &fn) const {
+      ternaryOrSelectOut(*this, out, fn);
+    }
+
+    INLINE void AtomicInstruction::out(std::ostream &out, const Function &fn) const {
+      this->outOpcode(out);
+      out << "." << addrSpace;
+      out << " %" << this->getDst(fn, 0);
+      out << " {" << "%" << this->getSrc(fn, 0) << "}";
+      for (uint32_t i = 1; i < srcNum; ++i)
+        out << " %" << this->getSrc(fn, i);
+      out << " bti";
+      for (uint32_t i = 0; i < bti.count; ++i)
+        out << ": " << (int)bti.bti[i];
+    }
+
+
+    INLINE void BitCastInstruction::out(std::ostream &out, const Function &fn) const {
+      this->outOpcode(out);
+      out << "." << this->getDstType()
+          << "." << this->getSrcType();
+      out << " {";
+      for (uint32_t i = 0; i < dstNum; ++i)
+        out << "%" << this->getDst(fn, i) << (i != (dstNum-1u) ? " " : "");
+      out << "}";
+      out << " {";
+      for (uint32_t i = 0; i < srcNum; ++i)
+        out << "%" << this->getSrc(fn, i) << (i != (srcNum-1u) ? " " : "");
+      out << "}";
+    }
+
+
+    INLINE void ConvertInstruction::out(std::ostream &out, const Function &fn) const {
+      this->outOpcode(out);
+      out << "." << this->getDstType()
+          << "." << this->getSrcType()
+          << " %" << this->getDst(fn, 0)
+          << " %" << this->getSrc(fn, 0);
+    }
+
+    INLINE void LoadInstruction::out(std::ostream &out, const Function &fn) const {
+      this->outOpcode(out);
+      out << "." << type << "." << addrSpace << (dwAligned ? "." : ".un") << "aligned";
+      out << " {";
+      for (uint32_t i = 0; i < valueNum; ++i)
+        out << "%" << this->getDst(fn, i) << (i != (valueNum-1u) ? " " : "");
+      out << "}";
+      out << " %" << this->getSrc(fn, 0);
+      out << " bti";
+      for (uint32_t i = 0; i < bti.count; ++i)
+        out << ": " << (int)bti.bti[i];
+    }
+
+    INLINE void StoreInstruction::out(std::ostream &out, const Function &fn) const {
+      this->outOpcode(out);
+      out << "." << type << "." << addrSpace << (dwAligned ? "." : ".un") << "aligned";
+      out << " %" << this->getSrc(fn, 0) << " {";
+      for (uint32_t i = 0; i < valueNum; ++i)
+        out << "%" << this->getSrc(fn, i+1) << (i != (valueNum-1u) ? " " : "");
+      out << "}";
+      out << " bti";
+      for (uint32_t i = 0; i < bti.count; ++i)
+        out << ": " << (int)bti.bti[i];
+    }
+
+    INLINE void LabelInstruction::out(std::ostream &out, const Function &fn) const {
+      this->outOpcode(out);
+      out << " $" << labelIndex;
+    }
+
+    INLINE void BranchInstruction::out(std::ostream &out, const Function &fn) const {
+      this->outOpcode(out);
+      if (hasPredicate)
+        out << "<%" << this->getSrc(fn, 0) << ">";
+      if (hasLabel) out << " -> label$" << labelIndex;
+    }
+
+    INLINE void LoadImmInstruction::out(std::ostream &out, const Function &fn) const {
+      this->outOpcode(out);
+      out << "." << type;
+      out << " %" << this->getDst(fn,0) << " ";
+      fn.outImmediate(out, immediateIndex);
+    }
+
+    static const char *syncStr[syncFieldNum] = {
+      "workgroup", "local_read", "local_write", "global_read", "global_write"
+    };
+
+    INLINE void SyncInstruction::out(std::ostream &out, const Function &fn) const {
+      this->outOpcode(out);
+      for (uint32_t field = 0; field < syncFieldNum; ++field)
+        if (this->parameters & (1 << field))
+          out << "." << syncStr[field];
+    }
+
+
+  } /* namespace internal */
+
+  std::ostream &operator<< (std::ostream &out, AddressSpace addrSpace) {
+    switch (addrSpace) {
+      case MEM_GLOBAL: return out << "global";
+      case MEM_LOCAL: return out << "local";
+      case MEM_CONSTANT: return out << "constant";
+      case MEM_PRIVATE: return out << "private";
+      case IMAGE: return out << "image";
+      case MEM_INVALID: return out << "invalid";
+    };
+    return out;
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Implements the various introspection functions
+  ///////////////////////////////////////////////////////////////////////////
+  template <typename T, typename U> struct HelperIntrospection {
+    enum { value = 0 };
+  };
+  template <typename T> struct HelperIntrospection<T,T> {
+    enum { value = 1 };
+  };
+
+  RegisterData Instruction::getDstData(uint32_t ID) const {
+    const Function &fn = this->getFunction();
+    return fn.getRegisterData(this->getDst(ID));
+  }
+  RegisterData Instruction::getSrcData(uint32_t ID) const {
+    const Function &fn = this->getFunction();
+    return fn.getRegisterData(this->getSrc(ID));
+  }
+
+#define DECL_INSN(OPCODE, CLASS) \
+  case OP_##OPCODE: \
+  return HelperIntrospection<CLASS, RefClass>::value == 1;
+
+#define START_INTROSPECTION(CLASS) \
+  static_assert(sizeof(internal::CLASS) == (sizeof(uint64_t)*2), \
+                "Bad instruction size"); \
+  static_assert(offsetof(internal::CLASS, opcode) == 0, \
+                "Bad opcode offset"); \
+  bool CLASS::isClassOf(const Instruction &insn) { \
+    const Opcode op = insn.getOpcode(); \
+    typedef CLASS RefClass; \
+    switch (op) {
+
+#define END_INTROSPECTION(CLASS) \
+      default: return false; \
+    }; \
+  }
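+
+/* A rough sketch of what one START/END_INTROSPECTION pair below generates,
+ * taking LabelInstruction as an example; instruction.hxx provides one
+ * DECL_INSN(OPCODE, CLASS) entry per opcode, so the result is approximately
+ *   bool LabelInstruction::isClassOf(const Instruction &insn) {
+ *     const Opcode op = insn.getOpcode();
+ *     typedef LabelInstruction RefClass;
+ *     switch (op) {
+ *       case OP_LABEL: return HelperIntrospection<LabelInstruction, RefClass>::value == 1; // true
+ *       case OP_BRA:   return HelperIntrospection<BranchInstruction, RefClass>::value == 1; // false
+ *       // ... one case per opcode declared in instruction.hxx ...
+ *       default: return false;
+ *     };
+ *   }
+ */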
+
+START_INTROSPECTION(UnaryInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(UnaryInstruction)
+
+START_INTROSPECTION(BinaryInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(BinaryInstruction)
+
+START_INTROSPECTION(CompareInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(CompareInstruction)
+
+START_INTROSPECTION(BitCastInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(BitCastInstruction)
+
+START_INTROSPECTION(ConvertInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(ConvertInstruction)
+
+START_INTROSPECTION(AtomicInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(AtomicInstruction)
+
+START_INTROSPECTION(SelectInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(SelectInstruction)
+
+START_INTROSPECTION(TernaryInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(TernaryInstruction)
+
+START_INTROSPECTION(BranchInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(BranchInstruction)
+
+START_INTROSPECTION(SampleInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(SampleInstruction)
+
+START_INTROSPECTION(TypedWriteInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(TypedWriteInstruction)
+
+START_INTROSPECTION(GetImageInfoInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(GetImageInfoInstruction)
+
+START_INTROSPECTION(LoadImmInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(LoadImmInstruction)
+
+START_INTROSPECTION(LoadInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(LoadInstruction)
+
+START_INTROSPECTION(StoreInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(StoreInstruction)
+
+START_INTROSPECTION(SyncInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(SyncInstruction)
+
+START_INTROSPECTION(LabelInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(LabelInstruction)
+
+#undef END_INTROSPECTION
+#undef START_INTROSPECTION
+#undef DECL_INSN
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Implements the function dispatching from public to internal with some
+  // macro horrors
+  ///////////////////////////////////////////////////////////////////////////
+
+#define DECL_INSN(OPCODE, CLASS) \
+  case OP_##OPCODE: return reinterpret_cast<const internal::CLASS*>(this)->CALL;
+
+#define START_FUNCTION(CLASS, RET, PROTOTYPE) \
+  RET CLASS::PROTOTYPE const { \
+    const Opcode op = this->getOpcode(); \
+    switch (op) {
+
+#define END_FUNCTION(CLASS, RET) \
+      case OP_INVALID: return RET(); \
+    }; \
+    return RET(); \
+  }
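+
+/* Sketch of the generated dispatch, using getSrcNum as an example: with CALL
+ * defined as getSrcNum(), the START/END_FUNCTION pair below expands to
+ * approximately
+ *   uint32_t Instruction::getSrcNum(void) const {
+ *     const Opcode op = this->getOpcode();
+ *     switch (op) {
+ *       case OP_LABEL: return reinterpret_cast<const internal::LabelInstruction*>(this)->getSrcNum();
+ *       // ... one case per opcode declared in instruction.hxx ...
+ *       case OP_INVALID: return uint32_t();
+ *     };
+ *     return uint32_t();
+ *   }
+ */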
+
+#define CALL getSrcNum()
+START_FUNCTION(Instruction, uint32_t, getSrcNum(void))
+#include "ir/instruction.hxx"
+END_FUNCTION(Instruction, uint32_t)
+#undef CALL
+
+#define CALL getDstNum()
+START_FUNCTION(Instruction, uint32_t, getDstNum(void))
+#include "ir/instruction.hxx"
+END_FUNCTION(Instruction, uint32_t)
+#undef CALL
+
+#undef DECL_INSN
+
+#define DECL_INSN(OPCODE, CLASS) \
+  case OP_##OPCODE: \
+  { \
+    const Function &fn = this->getFunction(); \
+    return reinterpret_cast<const internal::CLASS*>(this)->CALL; \
+  }
+
+#define CALL wellFormed(fn, whyNot)
+START_FUNCTION(Instruction, bool, wellFormed(std::string &whyNot))
+#include "ir/instruction.hxx"
+END_FUNCTION(Instruction, bool)
+#undef CALL
+
+#define CALL getDst(fn, ID)
+START_FUNCTION(Instruction, Register, getDst(uint32_t ID))
+#include "ir/instruction.hxx"
+END_FUNCTION(Instruction, Register)
+#undef CALL
+
+#define CALL getSrc(fn, ID)
+START_FUNCTION(Instruction, Register, getSrc(uint32_t ID))
+#include "ir/instruction.hxx"
+END_FUNCTION(Instruction, Register)
+#undef CALL
+
+#undef DECL_INSN
+#undef END_FUNCTION
+#undef START_FUNCTION
+
+  void Instruction::setSrc(uint32_t srcID, Register reg) {
+    Function &fn = this->getFunction();
+#if GBE_DEBUG
+    const RegisterData oldData = this->getSrcData(srcID);
+    const RegisterData newData = fn.getRegisterData(reg);
+    GBE_ASSERT(oldData.family == newData.family);
+#endif /* GBE_DEBUG */
+    const Opcode op = this->getOpcode();
+    switch (op) {
+#define DECL_INSN(OP, FAMILY)\
+      case OP_##OP:\
+        reinterpret_cast<internal::FAMILY*>(this)->setSrc(fn, srcID, reg);\
+      break;
+#include "instruction.hxx"
+#undef DECL_INSN
+      case OP_INVALID: NOT_SUPPORTED; break;
+    };
+  }
+
+  void Instruction::setDst(uint32_t dstID, Register reg) {
+    Function &fn = this->getFunction();
+#if GBE_DEBUG
+    const RegisterData oldData = this->getDstData(dstID);
+    const RegisterData newData = fn.getRegisterData(reg);
+    GBE_ASSERT(oldData.family == newData.family);
+#endif /* GBE_DEBUG */
+    const Opcode op = this->getOpcode();
+    switch (op) {
+#define DECL_INSN(OP, FAMILY)\
+      case OP_##OP:\
+        reinterpret_cast<internal::FAMILY*>(this)->setDst(fn, dstID, reg);\
+      break;
+#include "instruction.hxx"
+#undef DECL_INSN
+      case OP_INVALID: NOT_SUPPORTED; break;
+    };
+  }
+
+  const Function &Instruction::getFunction(void) const {
+    const BasicBlock *bb = this->getParent();
+    GBE_ASSERT(bb != NULL);
+    return bb->getParent();
+  }
+  Function &Instruction::getFunction(void) {
+    BasicBlock *bb = this->getParent();
+    GBE_ASSERT(bb != NULL);
+    return bb->getParent();
+  }
+
+  void Instruction::replace(Instruction *other) const {
+    Function &fn = other->getFunction();
+    Instruction *insn = fn.newInstruction(*this);
+    intrusive_list_node *prev = other->prev;
+    insn->parent = other->parent;
+    other->remove();
+    append(insn, prev);
+  }
+
+  void Instruction::remove(void) {
+    Function &fn = this->getFunction();
+    unlink(this);
+    fn.deleteInstruction(this);
+  }
+
+  void Instruction::insert(Instruction *prev, Instruction ** new_ins) {
+    Function &fn = prev->getFunction();
+    Instruction *insn = fn.newInstruction(*this);
+    insn->parent = prev->parent;
+    append(insn, prev);
+    if (new_ins)
+      *new_ins = insn;
+  }
+
+  bool Instruction::hasSideEffect(void) const {
+    return opcode == OP_STORE ||
+           opcode == OP_TYPED_WRITE ||
+           opcode == OP_SYNC ||
+           opcode == OP_ATOMIC;
+  }
+
+#define DECL_MEM_FN(CLASS, RET, PROTOTYPE, CALL) \
+  RET CLASS::PROTOTYPE const { \
+    return reinterpret_cast<const internal::CLASS*>(this)->CALL; \
+  }
+
+DECL_MEM_FN(UnaryInstruction, Type, getType(void), getType())
+DECL_MEM_FN(BinaryInstruction, Type, getType(void), getType())
+DECL_MEM_FN(BinaryInstruction, bool, commutes(void), commutes())
+DECL_MEM_FN(SelectInstruction, Type, getType(void), getType())
+DECL_MEM_FN(TernaryInstruction, Type, getType(void), getType())
+DECL_MEM_FN(CompareInstruction, Type, getType(void), getType())
+DECL_MEM_FN(BitCastInstruction, Type, getSrcType(void), getSrcType())
+DECL_MEM_FN(BitCastInstruction, Type, getDstType(void), getDstType())
+DECL_MEM_FN(ConvertInstruction, Type, getSrcType(void), getSrcType())
+DECL_MEM_FN(ConvertInstruction, Type, getDstType(void), getDstType())
+DECL_MEM_FN(AtomicInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(AtomicInstruction, BTI, getBTI(void), getBTI())
+DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void), getAtomicOpcode())
+DECL_MEM_FN(StoreInstruction, Type, getValueType(void), getValueType())
+DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum())
+DECL_MEM_FN(StoreInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(StoreInstruction, BTI, getBTI(void), getBTI())
+DECL_MEM_FN(StoreInstruction, bool, isAligned(void), isAligned())
+DECL_MEM_FN(LoadInstruction, Type, getValueType(void), getValueType())
+DECL_MEM_FN(LoadInstruction, uint32_t, getValueNum(void), getValueNum())
+DECL_MEM_FN(LoadInstruction, AddressSpace, getAddressSpace(void), getAddressSpace())
+DECL_MEM_FN(LoadInstruction, BTI, getBTI(void), getBTI())
+DECL_MEM_FN(LoadInstruction, bool, isAligned(void), isAligned())
+DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType())
+DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
+DECL_MEM_FN(BranchInstruction, bool, isPredicated(void), isPredicated())
+DECL_MEM_FN(BranchInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
+DECL_MEM_FN(SyncInstruction, uint32_t, getParameters(void), getParameters())
+DECL_MEM_FN(SampleInstruction, Type, getSrcType(void), getSrcType())
+DECL_MEM_FN(SampleInstruction, Type, getDstType(void), getDstType())
+DECL_MEM_FN(SampleInstruction, const uint8_t, getSamplerIndex(void), getSamplerIndex())
+DECL_MEM_FN(SampleInstruction, const uint8_t, getSamplerOffset(void), getSamplerOffset())
+DECL_MEM_FN(SampleInstruction, const uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(TypedWriteInstruction, Type, getSrcType(void), getSrcType())
+DECL_MEM_FN(TypedWriteInstruction, Type, getCoordType(void), getCoordType())
+DECL_MEM_FN(TypedWriteInstruction, const uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
+DECL_MEM_FN(GetImageInfoInstruction, const uint8_t, getImageIndex(void), getImageIndex())
+
+#undef DECL_MEM_FN
+
+  Immediate LoadImmInstruction::getImmediate(void) const {
+    const Function &fn = this->getFunction();
+    return reinterpret_cast<const internal::LoadImmInstruction*>(this)->getImmediate(fn);
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Implements the emission functions
+  ///////////////////////////////////////////////////////////////////////////
+
+  // For all unary functions with given opcode
+  Instruction ALU1(Opcode opcode, Type type, Register dst, Register src) {
+    return internal::UnaryInstruction(opcode, type, dst, src).convert();
+  }
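+
+  /* For instance, MOV(TYPE_FLOAT, dst, src) generated just below is shorthand
+   * for ALU1(OP_MOV, TYPE_FLOAT, dst, src); dst and src stand for registers
+   * supplied by the caller. */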
+
+  // All unary functions
+#define DECL_EMIT_FUNCTION(NAME) \
+  Instruction NAME(Type type, Register dst, Register src) { \
+    return ALU1(OP_##NAME, type, dst, src);\
+  }
+
+  DECL_EMIT_FUNCTION(MOV)
+  DECL_EMIT_FUNCTION(FBH)
+  DECL_EMIT_FUNCTION(FBL)
+  DECL_EMIT_FUNCTION(COS)
+  DECL_EMIT_FUNCTION(SIN)
+  DECL_EMIT_FUNCTION(LOG)
+  DECL_EMIT_FUNCTION(SQR)
+  DECL_EMIT_FUNCTION(RSQ)
+  DECL_EMIT_FUNCTION(RNDD)
+  DECL_EMIT_FUNCTION(RNDE)
+  DECL_EMIT_FUNCTION(RNDU)
+  DECL_EMIT_FUNCTION(RNDZ)
+
+#undef DECL_EMIT_FUNCTION
+
+  // All binary functions
+#define DECL_EMIT_FUNCTION(NAME) \
+  Instruction NAME(Type type, Register dst,  Register src0, Register src1) { \
+    return internal::BinaryInstruction(OP_##NAME, type, dst, src0, src1).convert(); \
+  }
+
+  DECL_EMIT_FUNCTION(POW)
+  DECL_EMIT_FUNCTION(MUL)
+  DECL_EMIT_FUNCTION(ADD)
+  DECL_EMIT_FUNCTION(ADDSAT)
+  DECL_EMIT_FUNCTION(SUB)
+  DECL_EMIT_FUNCTION(SUBSAT)
+  DECL_EMIT_FUNCTION(MUL_HI)
+  DECL_EMIT_FUNCTION(I64_MUL_HI)
+  DECL_EMIT_FUNCTION(UPSAMPLE_SHORT)
+  DECL_EMIT_FUNCTION(UPSAMPLE_INT)
+  DECL_EMIT_FUNCTION(UPSAMPLE_LONG)
+  DECL_EMIT_FUNCTION(DIV)
+  DECL_EMIT_FUNCTION(REM)
+  DECL_EMIT_FUNCTION(SHL)
+  DECL_EMIT_FUNCTION(SHR)
+  DECL_EMIT_FUNCTION(ASR)
+  DECL_EMIT_FUNCTION(BSF)
+  DECL_EMIT_FUNCTION(BSB)
+  DECL_EMIT_FUNCTION(OR)
+  DECL_EMIT_FUNCTION(XOR)
+  DECL_EMIT_FUNCTION(AND)
+  DECL_EMIT_FUNCTION(HADD)
+  DECL_EMIT_FUNCTION(RHADD)
+  DECL_EMIT_FUNCTION(I64HADD)
+  DECL_EMIT_FUNCTION(I64RHADD)
+
+#undef DECL_EMIT_FUNCTION
+
+  // SEL
+  Instruction SEL(Type type, Register dst, Tuple src) {
+    return internal::SelectInstruction(type, dst, src).convert();
+  }
+
+  Instruction I64MADSAT(Type type, Register dst, Tuple src) {
+    return internal::TernaryInstruction(OP_I64MADSAT, type, dst, src).convert();
+  }
+
+  Instruction MAD(Type type, Register dst, Tuple src) {
+    return internal::TernaryInstruction(OP_MAD, type, dst, src).convert();
+  }
+  // All compare functions
+#define DECL_EMIT_FUNCTION(NAME) \
+  Instruction NAME(Type type, Register dst,  Register src0, Register src1) { \
+    const internal::CompareInstruction insn(OP_##NAME, type, dst, src0, src1); \
+    return insn.convert(); \
+  }
+
+  DECL_EMIT_FUNCTION(EQ)
+  DECL_EMIT_FUNCTION(NE)
+  DECL_EMIT_FUNCTION(LE)
+  DECL_EMIT_FUNCTION(LT)
+  DECL_EMIT_FUNCTION(GE)
+  DECL_EMIT_FUNCTION(GT)
+  DECL_EMIT_FUNCTION(ORD)
+
+#undef DECL_EMIT_FUNCTION
+
+  // BITCAST
+  Instruction BITCAST(Type dstType, Type srcType, Tuple dst, Tuple src, uint8_t dstNum, uint8_t srcNum) {
+    return internal::BitCastInstruction(dstType, srcType, dst, src, dstNum, srcNum).convert();
+  }
+
+  // CVT
+  Instruction CVT(Type dstType, Type srcType, Register dst, Register src) {
+    return internal::ConvertInstruction(OP_CVT, dstType, srcType, dst, src).convert();
+  }
+
+  // saturated convert
+  Instruction SAT_CVT(Type dstType, Type srcType, Register dst, Register src) {
+    return internal::ConvertInstruction(OP_SAT_CVT, dstType, srcType, dst, src).convert();
+  }
+
+  // half-to-float conversion
+  Instruction F16TO32(Type dstType, Type srcType, Register dst, Register src) {
+    return internal::ConvertInstruction(OP_F16TO32, dstType, srcType, dst, src).convert();
+  }
+
+  // float-to-half conversion
+  Instruction F32TO16(Type dstType, Type srcType, Register dst, Register src) {
+    return internal::ConvertInstruction(OP_F32TO16, dstType, srcType, dst, src).convert();
+  }
+
+  // ATOMIC
+  Instruction ATOMIC(AtomicOps atomicOp, Register dst, AddressSpace space, BTI bti, Tuple src) {
+    return internal::AtomicInstruction(atomicOp, dst, space, bti, src).convert();
+  }
+
+  // BRA
+  Instruction BRA(LabelIndex labelIndex) {
+    return internal::BranchInstruction(OP_BRA, labelIndex).convert();
+  }
+  Instruction BRA(LabelIndex labelIndex, Register pred) {
+    return internal::BranchInstruction(OP_BRA, labelIndex, pred).convert();
+  }
+
+  // RET
+  Instruction RET(void) {
+    return internal::BranchInstruction(OP_RET).convert();
+  }
+
+  // LOADI
+  Instruction LOADI(Type type, Register dst, ImmediateIndex value) {
+    return internal::LoadImmInstruction(type, dst, value).convert();
+  }
+
+  // LOAD and STORE
+#define DECL_EMIT_FUNCTION(NAME, CLASS) \
+  Instruction NAME(Type type, \
+                   Tuple tuple, \
+                   Register offset, \
+                   AddressSpace space, \
+                   uint32_t valueNum, \
+                   bool dwAligned, \
+                   BTI bti) \
+  { \
+    return internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,bti).convert(); \
+  }
+
+  DECL_EMIT_FUNCTION(LOAD, LoadInstruction)
+  DECL_EMIT_FUNCTION(STORE, StoreInstruction)
+
+#undef DECL_EMIT_FUNCTION
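+
+  /* Usage sketch with assumed, caller-provided operands (a destination tuple
+   * values, an address register addr and a BTI descriptor bti): loading four
+   * dword values from global memory would look roughly like
+   *   Instruction insn = LOAD(TYPE_U32, values, addr, MEM_GLOBAL, 4, true, bti);
+   */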
+
+  // SYNC (fences and barrier)
+  Instruction SYNC(uint32_t parameters) {
+    return internal::SyncInstruction(parameters).convert();
+  }
+
+  // LABEL
+  Instruction LABEL(LabelIndex labelIndex) {
+    return internal::LabelInstruction(labelIndex).convert();
+  }
+
+  // SAMPLE
+  Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset) {
+    return internal::SampleInstruction(imageIndex, dst, src, dstIsFloat, srcIsFloat, sampler, samplerOffset).convert();
+  }
+
+  Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, Type srcType, Type coordType) {
+    return internal::TypedWriteInstruction(imageIndex, src, srcType, coordType).convert();
+  }
+
+  Instruction GET_IMAGE_INFO(int infoType, Register dst, uint8_t imageIndex, Register infoReg) {
+    return internal::GetImageInfoInstruction(infoType, dst, imageIndex, infoReg).convert();
+  }
+
+  std::ostream &operator<< (std::ostream &out, const Instruction &insn) {
+    const Function &fn = insn.getFunction();
+    switch (insn.getOpcode()) {
+#define DECL_INSN(OPCODE, CLASS) \
+      case OP_##OPCODE: \
+        reinterpret_cast<const internal::CLASS&>(insn).out(out, fn); \
+        break;
+#include "instruction.hxx"
+#undef DECL_INSN
+      case OP_INVALID: NOT_SUPPORTED; break;
+    };
+    return out;
+  }
+
+} /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
new file mode 100644
index 0000000..a75a441
--- /dev/null
+++ b/backend/src/ir/instruction.hpp
@@ -0,0 +1,687 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file instruction.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_INSTRUCTION_HPP__
+#define __GBE_IR_INSTRUCTION_HPP__
+
+#include "ir/register.hpp"
+#include "ir/immediate.hpp"
+#include "ir/type.hpp"
+#include "sys/platform.hpp"
+#include "sys/intrusive_list.hpp"
+
+#include <ostream>
+#define MAX_MIXED_POINTER 4
+
+namespace gbe {
+namespace ir {
+  struct BTI {
+    uint8_t bti[MAX_MIXED_POINTER];
+    uint8_t count;
+    BTI() : count(0) {
+      memset(bti, 0, MAX_MIXED_POINTER);
+    }
+    ~BTI() {}
+  };
+
+  /*! All opcodes */
+  enum Opcode : uint8_t {
+#define DECL_INSN(INSN, FAMILY) OP_##INSN,
+#include "ir/instruction.hxx"
+#undef DECL_INSN
+    OP_INVALID
+  };
+
+  /*! Different memory spaces */
+  enum AddressSpace : uint8_t {
+    MEM_GLOBAL = 0, //!< Global memory (a la OCL)
+    MEM_LOCAL,      //!< Local memory (thread group memory)
+    MEM_CONSTANT,   //!< Immutable global memory
+    MEM_PRIVATE,    //!< Per thread private memory
+    IMAGE,          //!< For texture image.
+    MEM_INVALID
+  };
+
+  enum AtomicOps {
+    ATOMIC_OP_AND       = 1,
+    ATOMIC_OP_OR        = 2,
+    ATOMIC_OP_XOR       = 3,
+    ATOMIC_OP_XCHG      = 4,
+    ATOMIC_OP_INC       = 5,
+    ATOMIC_OP_DEC       = 6,
+    ATOMIC_OP_ADD       = 7,
+    ATOMIC_OP_SUB       = 8,
+    ATOMIC_OP_IMAX      = 10,
+    ATOMIC_OP_IMIN      = 11,
+    ATOMIC_OP_UMAX      = 12,
+    ATOMIC_OP_UMIN      = 13,
+    ATOMIC_OP_CMPXCHG   = 14,
+    ATOMIC_OP_INVALID
+  };
+
+  /* Vote function per hardware thread */
+  enum VotePredicate : uint8_t {
+    VOTE_ALL = 0,
+    VOTE_ANY
+  };
+
+  /*! Output the memory space */
+  std::ostream &operator<< (std::ostream &out, AddressSpace addrSpace);
+
+  /*! A label is identified with an unsigned short */
+  TYPE_SAFE(LabelIndex, uint16_t)
+
+  /*! Function class contains the register file and the register tuple. Any
+   *  information related to the registers may therefore require a function
+   */
+  class Function;
+
+  /*! Contains the stream of instructions */
+  class BasicBlock;
+
+  ///////////////////////////////////////////////////////////////////////////
+  /// All public instruction classes as manipulated by all public classes
+  ///////////////////////////////////////////////////////////////////////////
+
+  /*! Stores instruction internal data and opcode */
+  class ALIGNED(sizeof(uint64_t)*2) InstructionBase
+  {
+  public:
+    /*! Initialize the instruction from a 16-byte stream */
+    INLINE InstructionBase(const char *stream) {
+      opcode = Opcode(stream[0]);
+      for (uint32_t byte = 0; byte < opaqueSize; ++byte)
+        opaque[byte] = stream[byte+1];
+    }
+    /*! Uninitialized instruction */
+    INLINE InstructionBase(void) {}
+    /*! Get the instruction opcode */
+    INLINE Opcode getOpcode(void) const { return opcode; }
+  protected:
+    enum { opaqueSize = sizeof(uint64_t)*2-sizeof(uint8_t) };
+    Opcode opcode;               //!< Identifies the instruction
+    char opaque[opaqueSize];     //!< Remainder of it
+    GBE_CLASS(InstructionBase);  //!< Use internal allocators
+  };
+
+  /*! Store the instruction description in 32 bytes */
+  class Instruction : public InstructionBase, public intrusive_list_node
+  {
+  public:
+    /*! Initialize the instruction from a 16-byte stream */
+    INLINE Instruction(const char *stream) : InstructionBase(stream) {
+      parent = NULL;
+    }
+    /*! Copy the private fields and give it the same parent */
+    INLINE Instruction(const Instruction &other) :
+      InstructionBase(reinterpret_cast<const char*>(&other.opcode)) {
+      parent = other.parent;
+    }
+  private:
+    /*! To be consistent with the copy constructor */
+    INLINE Instruction &operator= (const Instruction &other) { return *this; }
+  public:
+    /*! Nothing to do here */
+    INLINE ~Instruction(void) {}
+    /*! Uninitialized instruction */
+    INLINE Instruction(void) {}
+    /*! Get the number of sources for this instruction  */
+    uint32_t getSrcNum(void) const;
+    /*! Get the number of destinations for this instruction */
+    uint32_t getDstNum(void) const;
+    /*! Get the register index of the given source */
+    Register getSrc(uint32_t ID = 0u) const;
+    /*! Get the register index of the given destination */
+    Register getDst(uint32_t ID = 0u) const;
+    /*! Get the register data of the given destination */
+    RegisterData getDstData(uint32_t ID = 0u) const;
+    /*! Get the register data of the given source */
+    RegisterData getSrcData(uint32_t ID = 0u) const;
+    /*! Set a register in src srcID */
+    void setSrc(uint32_t srcID, Register reg);
+    /*! Set a register in dst dstID */
+    void setDst(uint32_t dstID, Register reg);
+    /*! Is there any side effect in the memory sub-system? */
+    bool hasSideEffect(void) const;
+    /*! Get / set the parent basic block */
+    BasicBlock *getParent(void) { return parent; }
+    const BasicBlock *getParent(void) const { return parent; }
+    void setParent(BasicBlock *block) { this->parent = block; }
+    /*! Get the function from the parent basic block */
+    const Function &getFunction(void) const;
+    Function &getFunction(void);
+    /*! Check that the instruction is well formed (types properly match,
+     *  registers not out of bounds and so on). If not well formed, provide the
+     *  reason in the string why
+     */
+    bool wellFormed(std::string &why) const;
+    /*! Replace other by this instruction */
+    void replace(Instruction *other) const;
+    /*! Remove the instruction from the instruction stream */
+    void remove(void);
+    /* Insert the instruction after the previous one. */
+    void insert(Instruction *prev, Instruction ** new_ins = NULL);
+    /*! Indicates if the instruction belongs to instruction type T. Typically, T
+     *  can be BinaryInstruction, UnaryInstruction, LoadInstruction and so on
+     */
+    template <typename T> INLINE bool isMemberOf(void) const {
+      return T::isClassOf(*this);
+    }
+    /*! max_src for store instruction (vec16 + addr) */
+    static const uint32_t MAX_SRC_NUM = 17;
+    static const uint32_t MAX_DST_NUM = 16;
+  protected:
+    BasicBlock *parent;      //!< The basic block containing the instruction
+    GBE_CLASS(Instruction);  //!< Use internal allocators
+  };
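+
+  /*! A minimal usage sketch of the interface above (the instruction insn is an
+   *  assumption made for illustration):
+   *    std::string why;
+   *    if (insn.wellFormed(why) == false)
+   *      std::cerr << why << std::endl;
+   *    for (uint32_t i = 0; i < insn.getSrcNum(); ++i)
+   *      std::cout << " %" << insn.getSrc(i);
+   *    if (insn.isMemberOf<LoadInstruction>())
+   *      std::cout << insn; // uses the operator<< declared below
+   */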
+
+  /*! Output the instruction string in the given stream */
+  std::ostream &operator<< (std::ostream &out, const Instruction &proxy);
+
+  /*! Unary instructions are typed. dst and sources share the same type */
+  class UnaryInstruction : public Instruction {
+  public:
+    /*! Get the type manipulated by the instruction */
+    Type getType(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
+  /*! Binary instructions are typed. dst and sources share the same type */
+  class BinaryInstruction : public Instruction {
+  public:
+    /*! Get the type manipulated by the instruction */
+    Type getType(void) const;
+    /*! Commutative instructions can allow better optimizations */
+    bool commutes(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
+  /*! Ternary instructions are typed. dst and sources share the same type */
+  class TernaryInstruction : public Instruction {
+   public:
+    Type getType(void) const;
+    static bool isClassOf(const Instruction &insn);
+  };
+
+  /*! A select instruction writes src0 to dst if cond is true. Otherwise, it
+   *  writes src1
+   */
+  class SelectInstruction : public Instruction {
+  public:
+    /*! Predicate is in slot 0, so the first source to select is in slot 1 */
+    static const uint32_t src0Index = 1;
+    /*! Second source to select is in slot 2 */
+    static const uint32_t src1Index = 2;
+    /*! Get the predicate of the selection instruction */
+    INLINE Register getPredicate(void) const { return this->getSrc(0); }
+    /*! Get the type of both sources */
+    Type getType(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
+  /*! Compare instructions compare anything from the same type and return a
+   *  boolean value
+   */
+  class CompareInstruction : public Instruction {
+  public:
+    /*! Get the type of the source registers */
+    Type getType(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
+  /*! BitCast instruction converts from one type to another */
+  class BitCastInstruction : public Instruction {
+  public:
+    /*! Get the type of the source */
+    Type getSrcType(void) const;
+    /*! Get the type of the destination */
+    Type getDstType(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
+  /*! Conversion instruction converts from one type to another */
+  class ConvertInstruction : public Instruction {
+  public:
+    /*! Get the type of the source */
+    Type getSrcType(void) const;
+    /*! Get the type of the destination */
+    Type getDstType(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
+  /*! Atomic instruction */
+  class AtomicInstruction : public Instruction {
+  public:
+    /*! Where the address register goes */
+    static const uint32_t addressIndex = 0;
+    /*! Address space that is manipulated here */
+    AddressSpace getAddressSpace(void) const;
+    BTI getBTI(void) const;
+    /*! Return the atomic function code */
+    AtomicOps getAtomicOpcode(void) const;
+    /*! Return the register that contains the addresses */
+    INLINE Register getAddress(void) const { return this->getSrc(addressIndex); }
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
+  /*! Store instruction. First source is the address. Next sources are the
+   *  values to store contiguously at the given address
+   */
+  class StoreInstruction : public Instruction {
+  public:
+    /*! Where the address register goes */
+    static const uint32_t addressIndex = 0;
+    /*! Return the types of the values to store */
+    Type getValueType(void) const;
+    /*! Give the number of values the instruction is storing (srcNum-1) */
+    uint32_t getValueNum(void) const;
+    BTI getBTI(void) const;
+    /*! Address space that is manipulated here */
+    AddressSpace getAddressSpace(void) const;
+    /*! DWORD aligned means untyped read for Gen. That is what matters */
+    bool isAligned(void) const;
+    /*! Return the register that contains the addresses */
+    INLINE Register getAddress(void) const { return this->getSrc(addressIndex); }
+    /*! Return the register that contains the value valueID */
+    INLINE Register getValue(uint32_t valueID) const {
+      GBE_ASSERT(valueID < this->getValueNum());
+      return this->getSrc(valueID + 1u);
+    }
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
+  /*! Load instruction. The source is simply the address where to get the data.
+   *  The multiple destinations are the contiguous values loaded at the given
+   *  address
+   */
+  class LoadInstruction : public Instruction {
+  public:
+    /*! Type of the loaded values (i.e. the type of all the destinations) */
+    Type getValueType(void) const;
+    /*! Number of values loaded (i.e. the number of destinations) */
+    uint32_t getValueNum(void) const;
+    /*! Address space that is manipulated here */
+    AddressSpace getAddressSpace(void) const;
+    /*! DWORD aligned means untyped read for Gen. That is what matters */
+    bool isAligned(void) const;
+    /*! Return the register that contains the addresses */
+    INLINE Register getAddress(void) const { return this->getSrc(0u); }
+    BTI getBTI(void) const;
+    /*! Return the register that contains the value valueID */
+    INLINE Register getValue(uint32_t valueID) const {
+      return this->getDst(valueID);
+    }
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
+  /*! Load immediate instruction loads a typed immediate value into the given
+   *  register. Since double and uint64_t values will not fit into an
+   *  instruction, the immediates themselves are stored in the function core.
+   *  Contrary to regular load instructions, there is only one destination
+   *  possible
+   */
+  class LoadImmInstruction : public Instruction {
+  public:
+    /*! Return the value stored in the instruction */
+    Immediate getImmediate(void) const;
+    /*! Return the type of the stored value */
+    Type getType(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
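+
+  /*! Illustrative sketch only: assuming an ImmediateIndex immIndex already
+   *  registered in the owning Function and a destination register dst, the
+   *  matching emission helper defined in instruction.cpp is used as
+   *    Instruction insn = LOADI(TYPE_S32, dst, immIndex);
+   */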
+
+  /*! Store data into a texture */
+  class TypedWriteInstruction : public Instruction {
+  public:
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+    const uint8_t getImageIndex() const;
+    Type getSrcType(void) const;
+    Type getCoordType(void) const;
+  };
+
+  /*! Load texels from a texture */
+  class SampleInstruction : public Instruction {
+  public:
+    const uint8_t getImageIndex() const;
+    const uint8_t getSamplerIndex(void) const;
+    const uint8_t getSamplerOffset(void) const;
+    Type getSrcType(void) const;
+    Type getDstType(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
+  typedef union _ImageInfoKey{
+    _ImageInfoKey(uint8_t i, uint8_t t) : index(i), type(t) {};
+    struct {
+     uint8_t index; /*! the allocated image index */
+     uint8_t  type;  /*! the information type */
+    };
+    uint16_t data;
+  } ImageInfoKey;
+
+  /*! Get image information */
+  class GetImageInfoInstruction : public Instruction {
+  public:
+    enum {
+     WIDTH = 0,
+     HEIGHT = 1,
+     DEPTH = 2,
+     CHANNEL_DATA_TYPE = 3,
+     CHANNEL_ORDER = 4,
+    };
+
+    static INLINE uint32_t getDstNum4Type(int infoType) {
+      switch (infoType) {
+        case WIDTH:
+        case HEIGHT:
+        case DEPTH:
+        case CHANNEL_DATA_TYPE:
+        case CHANNEL_ORDER:
+          return 1;
+        break;
+        default:
+          GBE_ASSERT(0);
+     }
+     return 0;
+   }
+
+    const uint8_t getImageIndex() const;
+    uint32_t getInfoType() const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
+  /*! Branch instruction is the unified way to branch (with or without
+   *  predicate)
+   */
+  class BranchInstruction : public Instruction {
+  public:
+    /*! Indicate if the branch is predicated */
+    bool isPredicated(void) const;
+    /*! Return the predicate register (if predicated) */
+    RegisterData getPredicate(void) const {
+      GBE_ASSERTM(this->isPredicated() == true, "Branch is not predicated");
+      return this->getSrcData(0);
+    }
+    /*! Return the predicate register index (if predicated) */
+    Register getPredicateIndex(void) const {
+      GBE_ASSERTM(this->isPredicated() == true, "Branch is not predicated");
+      return this->getSrc(0);
+    }
+    /*! Return the label index pointed by the branch */
+    LabelIndex getLabelIndex(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
+  /*! Label instructions are actual no-ops but are referenced by branches as their
+   *  targets
+   */
+  class LabelInstruction : public Instruction {
+  public:
+    /*! Return the label index of the instruction */
+    LabelIndex getLabelIndex(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
+  /*! Texture instructions are used for any texture mapping request */
+  class TextureInstruction : public Instruction {
+  public:
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
+  /*! Mapped to OpenCL (mem_fence, read_mem_fence, write_mem_fence, barrier) */
+  enum {
+    SYNC_WORKGROUP_EXEC     = 1<<0,
+    SYNC_LOCAL_READ_FENCE   = 1<<1,
+    SYNC_LOCAL_WRITE_FENCE  = 1<<2,
+    SYNC_GLOBAL_READ_FENCE  = 1<<3,
+    SYNC_GLOBAL_WRITE_FENCE = 1<<4,
+    SYNC_INVALID            = 1<<5
+  };
+
+  /*! 5 bits to encode all possible synchronization capabilities */
+  static const uint32_t syncFieldNum = 5u;
+
+  /*! When barrier(CLK_LOCAL_MEM_FENCE) is issued */
+  static const uint32_t syncLocalBarrier = SYNC_WORKGROUP_EXEC | SYNC_LOCAL_WRITE_FENCE | SYNC_LOCAL_READ_FENCE;
+
+  /*! When barrier(CLK_GLOBAL_MEM_FENCE) is issued */
+  static const uint32_t syncGlobalBarrier = SYNC_WORKGROUP_EXEC | SYNC_GLOBAL_WRITE_FENCE | SYNC_GLOBAL_READ_FENCE;
+
+  /*! Sync instructions are used to order loads and stores for a given memory
+   *  space and/or to serialize threads at a given point in the program
+   */
+  class SyncInstruction : public Instruction {
+  public:
+    /*! Get the parameters (bitfields) of the sync instructions (see above) */
+    uint32_t getParameters(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
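+
+  /*! For illustration: barrier(CLK_LOCAL_MEM_FENCE) corresponds to the
+   *  syncLocalBarrier mask defined above, so the matching emission call (the
+   *  SYNC helper defined in instruction.cpp) would be roughly
+   *    Instruction insn = SYNC(syncLocalBarrier);
+   */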
+
+  /*! Specialize the instruction. Also performs type checking based on the
+   *  opcode: the pointer overloads return NULL on a mismatch, the reference
+   *  overloads assert
+   */
+  template <typename T>
+  INLINE T *cast(Instruction *insn) {
+    if(insn->isMemberOf<T>())
+      return reinterpret_cast<T*>(insn);
+    else
+      return NULL;
+  }
+  template <typename T>
+  INLINE const T *cast(const Instruction *insn) {
+    if(insn->isMemberOf<T>())
+      return reinterpret_cast<const T*>(insn);
+    else
+      return NULL;
+  }
+  template <typename T>
+  INLINE T &cast(Instruction &insn) {
+    GBE_ASSERTM(insn.isMemberOf<T>() == true, "Invalid instruction type");
+    return reinterpret_cast<T&>(insn);
+  }
+  template <typename T>
+  INLINE const T &cast(const Instruction &insn) {
+    GBE_ASSERTM(insn.isMemberOf<T>() == true, "Invalid instruction type");
+    return reinterpret_cast<const T&>(insn);
+  }
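+
+  /*! A small usage sketch (insn is an assumed Instruction pointer): the
+   *  pointer overloads return NULL on a type mismatch, so a typical pattern is
+   *    if (const LoadInstruction *load = cast<LoadInstruction>(insn))
+   *      std::cout << load->getValueNum() << std::endl;
+   */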
+
+  /*! Indicates if the given opcode belongs to the given instruction family */
+  template <typename T, typename U> struct EqualType {enum {value = false};};
+  template <typename T> struct EqualType<T,T> { enum {value = true};};
+  template <typename T>
+  INLINE bool isOpcodeFrom(Opcode op) {
+    switch (op) {
+#define DECL_INSN(OPCODE, FAMILY) \
+      case OP_##OPCODE: return EqualType<T, FAMILY>::value;
+#include "instruction.hxx"
+#undef DECL_INSN
+      default: NOT_SUPPORTED; return false;
+    }
+  }
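+
+  /*! For example, isOpcodeFrom<BinaryInstruction>(OP_ADD) is expected to be
+   *  true, assuming instruction.hxx maps the ADD opcode to the
+   *  BinaryInstruction family. */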
+
+  ///////////////////////////////////////////////////////////////////////////
+  /// All emission functions
+  ///////////////////////////////////////////////////////////////////////////
+
+  /*! alu1.type dst src */
+  Instruction ALU1(Opcode opcode, Type type, Register dst, Register src);
+  /*! mov.type dst src */
+  Instruction MOV(Type type, Register dst, Register src);
+  /*! cos.type dst src */
+  Instruction COS(Type type, Register dst, Register src);
+  /*! sin.type dst src */
+  Instruction SIN(Type type, Register dst, Register src);
+  /*! mul_hi.type dst src0 src1 */
+  Instruction MUL_HI(Type type, Register dst, Register src0, Register src1);
+  /*! i64_mul_hi.type dst src0 src1 */
+  Instruction I64_MUL_HI(Type type, Register dst, Register src0, Register src1);
+  /*! i64madsat.type dst src */
+  Instruction I64MADSAT(Type type, Register dst, Tuple src);
+  /*! mad.type dst src */
+  Instruction MAD(Type type, Register dst, Tuple src);
+  /*! upsample_short.type dst src0 src1 */
+  Instruction UPSAMPLE_SHORT(Type type, Register dst, Register src0, Register src1);
+  /*! upsample_int.type dst src0 src1 */
+  Instruction UPSAMPLE_INT(Type type, Register dst, Register src0, Register src1);
+  /*! upsample_long.type dst src0 src1 */
+  Instruction UPSAMPLE_LONG(Type type, Register dst, Register src0, Register src1);
+  /*! fbh.type dst src */
+  Instruction FBH(Type type, Register dst, Register src);
+  /*! fbl.type dst src */
+  Instruction FBL(Type type, Register dst, Register src);
+  /*! hadd.type dst src0 src1 */
+  Instruction HADD(Type type, Register dst, Register src0, Register src1);
+  /*! rhadd.type dst src0 src1 */
+  Instruction RHADD(Type type, Register dst, Register src0, Register src1);
+  /*! i64hadd.type dst src0 src1 */
+  Instruction I64HADD(Type type, Register dst, Register src0, Register src1);
+  /*! i64rhadd.type dst src0 src1 */
+  Instruction I64RHADD(Type type, Register dst, Register src0, Register src1);
+  /*! rcp.type dst src */
+  Instruction RCP(Type type, Register dst, Register src);
+  /*! abs.type dst src */
+  Instruction ABS(Type type, Register dst, Register src);
+  /*! simd_all.type dst src */
+  Instruction SIMD_ALL(Type type, Register dst, Register src);
+  /*! simd_any.type dst src */
+  Instruction SIMD_ANY(Type type, Register dst, Register src);
+  /*! log.type dst src */
+  Instruction LOG(Type type, Register dst, Register src);
+  /*! exp.type dst src */
+  Instruction EXP(Type type, Register dst, Register src);
+  /*! sqr.type dst src */
+  Instruction SQR(Type type, Register dst, Register src);
+  /*! rsq.type dst src */
+  Instruction RSQ(Type type, Register dst, Register src);
+  /*! rndd.type dst src */
+  Instruction RNDD(Type type, Register dst, Register src);
+  /*! rnde.type dst src */
+  Instruction RNDE(Type type, Register dst, Register src);
+  /*! rndu.type dst src */
+  Instruction RNDU(Type type, Register dst, Register src);
+  /*! rndz.type dst src */
+  Instruction RNDZ(Type type, Register dst, Register src);
+  /*! pow.type dst src0 src1 */
+  Instruction POW(Type type, Register dst, Register src0, Register src1);
+  /*! mul.type dst src0 src1 */
+  Instruction MUL(Type type, Register dst, Register src0, Register src1);
+  /*! add.type dst src0 src1 */
+  Instruction ADD(Type type, Register dst, Register src0, Register src1);
+  /*! addsat.type dst src0 src1 */
+  Instruction ADDSAT(Type type, Register dst, Register src0, Register src1);
+  /*! sub.type dst src0 src1 */
+  Instruction SUB(Type type, Register dst, Register src0, Register src1);
+  /*! subsat.type dst src0 src1 */
+  Instruction SUBSAT(Type type, Register dst, Register src0, Register src1);
+  /*! div.type dst src0 src1 */
+  Instruction DIV(Type type, Register dst, Register src0, Register src1);
+  /*! rem.type dst src0 src1 */
+  Instruction REM(Type type, Register dst, Register src0, Register src1);
+  /*! shl.type dst src0 src1 */
+  Instruction SHL(Type type, Register dst, Register src0, Register src1);
+  /*! shr.type dst src0 src1 */
+  Instruction SHR(Type type, Register dst, Register src0, Register src1);
+  /*! asr.type dst src0 src1 */
+  Instruction ASR(Type type, Register dst, Register src0, Register src1);
+  /*! bsf.type dst src0 src1 */
+  Instruction BSF(Type type, Register dst, Register src0, Register src1);
+  /*! bsb.type dst src0 src1 */
+  Instruction BSB(Type type, Register dst, Register src0, Register src1);
+  /*! or.type dst src0 src1 */
+  Instruction OR(Type type, Register dst, Register src0, Register src1);
+  /*! xor.type dst src0 src1 */
+  Instruction XOR(Type type, Register dst, Register src0, Register src1);
+  /*! and.type dst src0 src1 */
+  Instruction AND(Type type, Register dst, Register src0, Register src1);
+  /*! sel.type dst {cond, src0, src1} (== src) */
+  Instruction SEL(Type type, Register dst, Tuple src);
+  /*! eq.type dst src0 src1 */
+  Instruction EQ(Type type, Register dst, Register src0, Register src1);
+  /*! ne.type dst src0 src1 */
+  Instruction NE(Type type, Register dst, Register src0, Register src1);
+  /*! le.type dst src0 src1 */
+  Instruction LE(Type type, Register dst, Register src0, Register src1);
+  /*! lt.type dst src0 src1 */
+  Instruction LT(Type type, Register dst, Register src0, Register src1);
+  /*! ge.type dst src0 src1 */
+  Instruction GE(Type type, Register dst, Register src0, Register src1);
+  /*! gt.type dst src0 src1 */
+  Instruction GT(Type type, Register dst, Register src0, Register src1);
+  /*! ord.type dst src0 src1 */
+  Instruction ORD(Type type, Register dst, Register src0, Register src1);
+  /*! BITCAST.{dstType <- srcType} dst src */
+  Instruction BITCAST(Type dstType, Type srcType, Tuple dst, Tuple src, uint8_t dstNum, uint8_t srcNum);
+  /*! cvt.{dstType <- srcType} dst src */
+  Instruction CVT(Type dstType, Type srcType, Register dst, Register src);
+  /*! sat_cvt.{dstType <- srcType} dst src */
+  Instruction SAT_CVT(Type dstType, Type srcType, Register dst, Register src);
+  /*! F16TO32.{dstType <- srcType} dst src */
+  Instruction F16TO32(Type dstType, Type srcType, Register dst, Register src);
+  /*! F32TO16.{dstType <- srcType} dst src */
+  Instruction F32TO16(Type dstType, Type srcType, Register dst, Register src);
+  /*! atomic dst addr.space {src1 {src2}} */
+  Instruction ATOMIC(AtomicOps opcode, Register dst, AddressSpace space, BTI bti, Tuple src);
+  /*! bra labelIndex */
+  Instruction BRA(LabelIndex labelIndex);
+  /*! (pred) bra labelIndex */
+  Instruction BRA(LabelIndex labelIndex, Register pred);
+  /*! ret */
+  Instruction RET(void);
+  /*! load.type.space {dst1,...,dst_valueNum} offset value */
+  Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, BTI bti);
+  /*! store.type.space offset {src1,...,src_valueNum} value */
+  Instruction STORE(Type type, Tuple src, Register offset, AddressSpace space, uint32_t valueNum, bool dwAligned, BTI bti);
+  /*! loadi.type dst value */
+  Instruction LOADI(Type type, Register dst, ImmediateIndex value);
+  /*! sync.params... (see Sync instruction) */
+  Instruction SYNC(uint32_t parameters);
+  /*! typed write */
+  Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, Type srcType, Type coordType);
+  /*! sample textures */
+  Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset);
+  /*! get image information, such as width/height/depth/... */
+  Instruction GET_IMAGE_INFO(int infoType, Register dst, uint8_t imageIndex, Register infoReg);
+  /*! label labelIndex */
+  Instruction LABEL(LabelIndex labelIndex);
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_INSTRUCTION_HPP__ */
+
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
new file mode 100644
index 0000000..587517b
--- /dev/null
+++ b/backend/src/ir/instruction.hxx
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file instruction.hxx
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+DECL_INSN(MOV, UnaryInstruction)
+DECL_INSN(COS, UnaryInstruction)
+DECL_INSN(SIN, UnaryInstruction)
+DECL_INSN(LOG, UnaryInstruction)
+DECL_INSN(EXP, UnaryInstruction)
+DECL_INSN(SQR, UnaryInstruction)
+DECL_INSN(RSQ, UnaryInstruction)
+DECL_INSN(RCP, UnaryInstruction)
+DECL_INSN(ABS, UnaryInstruction)
+DECL_INSN(RNDD, UnaryInstruction)
+DECL_INSN(RNDE, UnaryInstruction)
+DECL_INSN(RNDU, UnaryInstruction)
+DECL_INSN(RNDZ, UnaryInstruction)
+DECL_INSN(SIMD_ANY, UnaryInstruction)
+DECL_INSN(SIMD_ALL, UnaryInstruction)
+DECL_INSN(POW, BinaryInstruction)
+DECL_INSN(MUL, BinaryInstruction)
+DECL_INSN(ADD, BinaryInstruction)
+DECL_INSN(ADDSAT, BinaryInstruction)
+DECL_INSN(SUB, BinaryInstruction)
+DECL_INSN(SUBSAT, BinaryInstruction)
+DECL_INSN(DIV, BinaryInstruction)
+DECL_INSN(REM, BinaryInstruction)
+DECL_INSN(SHL, BinaryInstruction)
+DECL_INSN(SHR, BinaryInstruction)
+DECL_INSN(ASR, BinaryInstruction)
+DECL_INSN(BSF, BinaryInstruction)
+DECL_INSN(BSB, BinaryInstruction)
+DECL_INSN(OR, BinaryInstruction)
+DECL_INSN(XOR, BinaryInstruction)
+DECL_INSN(AND, BinaryInstruction)
+DECL_INSN(SEL, SelectInstruction)
+DECL_INSN(EQ, CompareInstruction)
+DECL_INSN(NE, CompareInstruction)
+DECL_INSN(LE, CompareInstruction)
+DECL_INSN(LT, CompareInstruction)
+DECL_INSN(GE, CompareInstruction)
+DECL_INSN(GT, CompareInstruction)
+DECL_INSN(ORD, CompareInstruction)
+DECL_INSN(BITCAST, BitCastInstruction)
+DECL_INSN(CVT, ConvertInstruction)
+DECL_INSN(SAT_CVT, ConvertInstruction)
+DECL_INSN(F16TO32, ConvertInstruction)
+DECL_INSN(F32TO16, ConvertInstruction)
+DECL_INSN(ATOMIC, AtomicInstruction)
+DECL_INSN(BRA, BranchInstruction)
+DECL_INSN(RET, BranchInstruction)
+DECL_INSN(LOADI, LoadImmInstruction)
+DECL_INSN(LOAD, LoadInstruction)
+DECL_INSN(STORE, StoreInstruction)
+DECL_INSN(TYPED_WRITE, TypedWriteInstruction)
+DECL_INSN(SAMPLE, SampleInstruction)
+DECL_INSN(SYNC, SyncInstruction)
+DECL_INSN(LABEL, LabelInstruction)
+DECL_INSN(GET_IMAGE_INFO, GetImageInfoInstruction)
+DECL_INSN(MUL_HI, BinaryInstruction)
+DECL_INSN(I64_MUL_HI, BinaryInstruction)
+DECL_INSN(FBH, UnaryInstruction)
+DECL_INSN(FBL, UnaryInstruction)
+DECL_INSN(HADD, BinaryInstruction)
+DECL_INSN(RHADD, BinaryInstruction)
+DECL_INSN(I64HADD, BinaryInstruction)
+DECL_INSN(I64RHADD, BinaryInstruction)
+DECL_INSN(UPSAMPLE_SHORT, BinaryInstruction)
+DECL_INSN(UPSAMPLE_INT, BinaryInstruction)
+DECL_INSN(UPSAMPLE_LONG, BinaryInstruction)
+DECL_INSN(I64MADSAT, TernaryInstruction)
+DECL_INSN(MAD, TernaryInstruction)
diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp
new file mode 100644
index 0000000..afed476
--- /dev/null
+++ b/backend/src/ir/liveness.cpp
@@ -0,0 +1,240 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file liveness.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "ir/liveness.hpp"
+#include <sstream>
+
+namespace gbe {
+namespace ir {
+
+  Liveness::Liveness(Function &fn) : fn(fn) {
+    // Initialize UEVar and VarKill for each block
+    fn.foreachBlock([this](const BasicBlock &bb) {
+      this->initBlock(bb);
+      // If the bb ends with a ret instruction, add it to the work list set.
+      const Instruction *lastInsn = bb.getLastInstruction();
+      const ir::Opcode op = lastInsn->getOpcode();
+      struct BlockInfo * info = liveness[&bb];
+      if (op == OP_RET) {
+        workSet.insert(info);
+        info->liveOut.insert(ocl::retVal);
+      }
+    });
+    // Now with iterative analysis, we compute liveout and livein sets
+    this->computeLiveInOut();
+    // extend register (def in loop, use out-of-loop) liveness to the whole loop
+    set<Register> extentRegs;
+    this->computeExtraLiveInOut(extentRegs);
+    // Analyze uniform values. extentRegs contains all the values that are
+    // defined in a loop and used out of the loop; such values cannot be uniform.
+    // The reason is that each re-entry of the loop may activate different lanes,
+    // so re-entering several times may leave different values in different lanes.
+    this->analyzeUniform(&extentRegs);
+  }
+
+  Liveness::~Liveness(void) {
+    for (auto &pair : liveness) GBE_SAFE_DELETE(pair.second);
+  }
+
+  void Liveness::analyzeUniform(set<Register> *extentRegs) {
+    fn.foreachBlock([this, extentRegs](const BasicBlock &bb) {
+      const_cast<BasicBlock&>(bb).foreach([this, extentRegs](const Instruction &insn) {
+        const uint32_t srcNum = insn.getSrcNum();
+        const uint32_t dstNum = insn.getDstNum();
+        bool uniform = true;
+        for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+          const Register reg = insn.getSrc(srcID);
+          if (!fn.isUniformRegister(reg))
+            uniform = false;
+        }
+        // A destination is a killed value
+        for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+          const Register reg = insn.getDst(dstID);
+          int opCode = insn.getOpcode();
+          // FIXME, ADDSAT and uniform vector should be supported.
+          if (uniform &&
+              fn.getRegisterFamily(reg) != ir::FAMILY_QWORD &&
+              !insn.getParent()->definedPhiRegs.contains(reg) &&
+              opCode != ir::OP_ATOMIC &&
+              opCode != ir::OP_MUL_HI &&
+              opCode != ir::OP_HADD &&
+              opCode != ir::OP_RHADD &&
+              opCode != ir::OP_ADDSAT &&
+              (dstNum == 1 || insn.getOpcode() != ir::OP_LOAD) &&
+              !extentRegs->contains(reg)
+             )
+            fn.setRegisterUniform(reg, true);
+        }
+      });
+    });
+  }
+
+  void Liveness::initBlock(const BasicBlock &bb) {
+    GBE_ASSERT(liveness.contains(&bb) == false);
+    BlockInfo *info = GBE_NEW(BlockInfo, bb);
+    // Traverse all instructions to handle UEVar and VarKill
+    const_cast<BasicBlock&>(bb).foreach([this, info](const Instruction &insn) {
+      this->initInstruction(*info, insn);
+    });
+    liveness[&bb] = info;
+  }
+
+  void Liveness::initInstruction(BlockInfo &info, const Instruction &insn) {
+    const uint32_t srcNum = insn.getSrcNum();
+    const uint32_t dstNum = insn.getDstNum();
+    // First look for used before killed
+    for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+      const Register reg = insn.getSrc(srcID);
+      // Not killed -> it is really an upward use
+      if (info.varKill.contains(reg) == false)
+        info.upwardUsed.insert(reg);
+    }
+    // A destination is a killed value
+    for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+      const Register reg = insn.getDst(dstID);
+      info.varKill.insert(reg);
+    }
+  }
+
+// Use simple backward data flow analysis to solve the liveness problem.
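+// With upwardUsed playing the role of liveIn, the fixed point computed below is
+//   liveIn(B)  = UEVar(B) U (liveOut(B) \ VarKill(B))
+//   liveOut(B) = U over all successors S of B of liveIn(S)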
+  void Liveness::computeLiveInOut(void) {
+    while(!workSet.empty()) {
+      auto currInfo = *workSet.begin();
+      workSet.erase(currInfo);
+      for (auto currOutVar : currInfo->liveOut)
+        if (!currInfo->varKill.contains(currOutVar))
+          currInfo->upwardUsed.insert(currOutVar);
+      bool isChanged = false;
+      for (auto prev : currInfo->bb.getPredecessorSet()) {
+        BlockInfo *prevInfo = liveness[prev];
+        for (auto currInVar : currInfo->upwardUsed) {
+          if (!prevInfo->bb.undefPhiRegs.contains(currInVar)) {
+            auto changed = prevInfo->liveOut.insert(currInVar);
+            if (changed.second) isChanged = true;
+          }
+        }
+        if (isChanged )
+          workSet.insert(prevInfo);
+      }
+    };
+#if 0
+    fn.foreachBlock([this](const BasicBlock &bb){
+      printf("label %d:\n", bb.getLabelIndex());
+      BlockInfo *info = liveness[&bb];
+      auto &outVarSet = info->liveOut;
+      auto &inVarSet = info->upwardUsed;
+      printf("\n\tin Lives: ");
+      for (auto inVar : inVarSet) {
+        printf("%d ", inVar);
+      }
+      printf("\n");
+      printf("\tout Lives: ");
+      for (auto outVar : outVarSet) {
+        printf("%d ", outVar);
+      }
+      printf("\n");
+
+    });
+#endif
+   }
+/*
+  We run in SIMD mode with a predication mask that indicates the active lanes.
+  If a vreg is defined in a loop and there are uses of the vreg out of the loop,
+  the definition point may be executed several times under *different* predication masks.
+  For these kinds of vreg, we must extend the vreg liveness to the whole loop.
+  If we don't, the vreg's liveness ends before its definition point inside the loop.
+  If the vreg's corresponding physical reg is assigned to another vreg during that
+  dead interval, and the instructions before the kill point are re-executed with a
+  different mask, the inactive lanes of the vreg may be overwritten and the
+  out-of-loop use will read wrong data.
+*/
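+/*
+  A sketch of the problematic shape, in pseudo IR:
+    LABEL loop
+      %x = ...          ; defined under the loop's predication mask
+      (pred) BRA loop
+    ... = %x            ; used after the loop by lanes that exited earlier
+  Without the extension, %x is not live at the top of the loop body before its
+  definition, so its physical register could be reused there and clobber the
+  lanes that already left the loop.
+*/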
+  void Liveness::computeExtraLiveInOut(set<Register> &extentRegs) {
+    const vector<Loop *> &loops = fn.getLoops();
+    extentRegs.clear();
+    if(loops.size() == 0) return;
+
+    for (auto l : loops) {
+      for (auto x : l->exits) {
+        const BasicBlock &a = fn.getBlock(x.first);
+        const BasicBlock &b = fn.getBlock(x.second);
+        BlockInfo * exiting = liveness[&a];
+        BlockInfo * exit = liveness[&b];
+        std::vector<Register> toExtend;
+
+        if(b.getPredecessorSet().size() > 1) {
+          for (auto p : exit->upwardUsed)
+            toExtend.push_back(p);
+        } else {
+          std::set_intersection(exiting->liveOut.begin(), exiting->liveOut.end(), exit->upwardUsed.begin(), exit->upwardUsed.end(), std::back_inserter(toExtend));
+        }
+        if (toExtend.size() == 0) continue;
+        for(auto r : toExtend)
+          extentRegs.insert(r);
+        for (auto bb : l->bbs) {
+          BlockInfo * bI = liveness[&fn.getBlock(bb)];
+          for(auto r : toExtend) {
+            if(!bI->upwardUsed.contains(r))
+              bI->upwardUsed.insert(r);
+            bI->liveOut.insert(r);
+          }
+        }
+      }
+    }
+#if 0
+    fn.foreachBlock([this](const BasicBlock &bb){
+      printf("label %d:\n", bb.getLabelIndex());
+      BlockInfo *info = liveness[&bb];
+      auto &outVarSet = info->liveOut;
+      auto &inVarSet = info->upwardUsed;
+      printf("\n\tLive Ins: ");
+      for (auto inVar : inVarSet) {
+        printf("%d ", inVar);
+      }
+      printf("\n");
+      printf("\tLive outs: ");
+      for (auto outVar : outVarSet) {
+        printf("%d ", outVar);
+      }
+      printf("\n");
+
+    });
+#endif
+   }
+
+
+  /*! To pretty print the liveness info */
+  static const uint32_t prettyInsnStrSize = 48;
+  static const uint32_t prettyRegStrSize = 5;
+
+  /*! Describe how the register is used */
+  static const uint32_t USE_NONE    = 0;
+  static const uint32_t USE_READ    = 1 << 0;
+  static const uint32_t USE_WRITTEN = 1 << 1;
+
+  enum UsePosition {
+    POS_BEFORE = 0,
+    POS_HERE = 1,
+    POS_AFTER = 2
+  };
+} /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/liveness.hpp b/backend/src/ir/liveness.hpp
new file mode 100644
index 0000000..d55e00d
--- /dev/null
+++ b/backend/src/ir/liveness.hpp
@@ -0,0 +1,148 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file liveness.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_LIVENESS_HPP__
+#define __GBE_IR_LIVENESS_HPP__
+
+#include <list>
+#include "sys/map.hpp"
+#include "sys/set.hpp"
+#include "ir/register.hpp"
+#include "ir/function.hpp"
+
+namespace gbe {
+namespace ir {
+
+  // Liveness is computed per function
+  class Function;
+
+  /*! To choose the iteration direction, we either look at predecessors or
+   *  successors
+   */
+  enum DataFlowDirection {
+    DF_PRED = 0,
+    DF_SUCC = 1
+  };
+
+  /*! Compute liveness of each register */
+  class Liveness : public NonCopyable
+  {
+  public:
+    Liveness(Function &fn);
+    ~Liveness(void);
+    /*! Set of variables used upwards in the block (before a definition) */
+    typedef set<Register> UEVar;
+    /*! Set of variables alive at the exit of the block */
+    typedef set<Register> LiveOut;
+    /*! Set of variables actually killed in each block */
+    typedef set<Register> VarKill;
+    /*! Per-block info */
+    struct BlockInfo : public NonCopyable {
+      BlockInfo(const BasicBlock &bb) : bb(bb) {}
+      const BasicBlock &bb;
+      INLINE bool inUpwardUsed(Register reg) const {
+        return upwardUsed.contains(reg);
+      }
+      INLINE bool inLiveOut(Register reg) const {
+        return liveOut.contains(reg);
+      }
+      INLINE bool inVarKill(Register reg) const {
+        return varKill.contains(reg);
+      }
+      UEVar upwardUsed;
+      LiveOut liveOut;
+      VarKill varKill;
+    };
+    /*! Gives for each block the variables alive at entry / exit */
+    typedef map<const BasicBlock*, BlockInfo*> Info;
+    /*! Return the complete liveness info */
+    INLINE const Info &getLivenessInfo(void) const { return liveness; }
+    /*! Return the complete block info */
+    INLINE const BlockInfo &getBlockInfo(const BasicBlock *bb) const {
+      auto it = liveness.find(bb);
+      GBE_ASSERT(it != liveness.end() && it->second != NULL);
+      return *it->second;
+    }
+    /*! Get the set of registers alive at the end of the block */
+    const LiveOut &getLiveOut(const BasicBlock *bb) const {
+      const BlockInfo &info = this->getBlockInfo(bb);
+      return info.liveOut;
+    }
+    /*! Get the set of registers alive at the beginning of the block */
+    const UEVar &getLiveIn(const BasicBlock *bb) const {
+      const BlockInfo &info = this->getBlockInfo(bb);
+      return info.upwardUsed;
+    }
+
+    /*! Return the function the liveness was computed on */
+    INLINE const Function &getFunction(void) const { return fn; }
+    /*! Apply the functor to each (block, successor) or (block, predecessor) pair of *all* blocks */
+    template <DataFlowDirection dir, typename T>
+    void foreach(const T &functor) {
+      // Iterate on all blocks
+      for (const auto &pair : liveness) {
+        BlockInfo &info = *pair.second;
+        const BasicBlock &bb = info.bb;
+        const BlockSet *set = NULL;
+        if (dir == DF_SUCC)
+          set = &bb.getSuccessorSet();
+        else
+          set = &bb.getPredecessorSet();
+        // Iterate over all successors (or predecessors, depending on dir)
+        for (auto other : *set) {
+          auto otherInfo = liveness.find(other);
+          GBE_ASSERT(otherInfo != liveness.end() && otherInfo->second != NULL);
+          functor(info, *otherInfo->second);
+        }
+      }
+    }
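+    /*! For instance, one backward propagation step over successors can be
+     *  written roughly as:
+     *    liveness.foreach<DF_SUCC>([](BlockInfo &info, BlockInfo &succ) {
+     *      // merge succ.upwardUsed into info.liveOut here
+     *    });
+     */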
+  private:
+    /*! Store the liveness of all blocks */
+    Info liveness;
+    /*! Compute the liveness for this function */
+    Function &fn;
+    /*! Initialize UEVar and VarKill per block */
+    void initBlock(const BasicBlock &bb);
+    /*! Initialize UEVar and VarKill per instruction */
+    void initInstruction(BlockInfo &info, const Instruction &insn);
+    /*! Now really compute LiveOut based on UEVar and VarKill */
+    void computeLiveInOut(void);
+    void computeExtraLiveInOut(set<Register> &extentRegs);
+    void analyzeUniform(set<Register> *extentRegs);
+    /*! Set of work list block which has exit(return) instruction */
+    typedef set <struct BlockInfo*> WorkSet;
+    WorkSet workSet;
+
+    /*! Use custom allocators */
+    GBE_CLASS(Liveness);
+
+  };
+
+  /*! Output a nice ASCII reprensation of the liveness */
+  std::ostream &operator<< (std::ostream &out, const Liveness &liveness);
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_LIVENESS_HPP__ */
+
diff --git a/backend/src/ir/lowering.cpp b/backend/src/ir/lowering.cpp
new file mode 100644
index 0000000..f71fd72
--- /dev/null
+++ b/backend/src/ir/lowering.cpp
@@ -0,0 +1,396 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file lowering.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "ir/context.hpp"
+#include "ir/value.hpp"
+#include "ir/liveness.hpp"
+#include "sys/set.hpp"
+
+namespace gbe {
+namespace ir {
+
+  /*! Small helper class to lower return instructions */
+  class ContextReturn : public Context
+  {
+  public:
+    /*! Initialize a context dedicated to return instruction lowering */
+    ContextReturn(Unit &unit) : Context(unit) {
+      this->usedLabels = GBE_NEW_NO_ARG(vector<uint8_t>);
+    }
+    /*! Lower the return instruction to gotos for the given function */
+    void lower(const std::string &functionName);
+  };
+
+  void ContextReturn::lower(const std::string &functionName) {
+    if ((this->fn = unit.getFunction(functionName)) == NULL)
+      return;
+
+    // Append a new block at the end of the function with a return instruction:
+    // the only one we are going to have
+    this->bb = &this->fn->getBottomBlock();
+    const LabelIndex index = this->label();
+    this->LABEL(index);
+    const BasicBlock *lastBlock = this->bb;
+    this->RET();
+
+    // Now traverse all instructions and replace all returns by GOTO index
+    fn->foreachInstruction([&](Instruction &insn) {
+      if (insn.getParent() == lastBlock) return; // This is the last block
+      if (insn.getOpcode() != OP_RET) return;
+      const Instruction bra = ir::BRA(index);
+      bra.replace(&insn);
+    });
+  }
+
+  void lowerReturn(Unit &unit, const std::string &functionName) {
+    ContextReturn ctx(unit);
+    ctx.lower(functionName);
+  }
+
+  /*! Characterizes how the argument is used (directly read, indirectly read,
+   *  written)
+   */
+  enum ArgUse {
+    ARG_DIRECT_READ = 0,
+    ARG_INDIRECT_READ = 1,
+    ARG_WRITTEN = 2
+  };
+
+  /*! Book-keeping for the sequence of instructions that directly load an input
+   *  argument
+   */
+  struct LoadAddImm {
+    Instruction *load;    //!< Load from the argument
+    Instruction *add;     //!< Can be NULL if we only have load(arg)
+    Instruction *loadImm; //!< Can also be NULL
+    uint64_t offset;      //!< Offset where to load in the structure
+    uint32_t argID;       //!< Associated function argument
+  };
+
+  /*! List of direct loads */
+  typedef vector<LoadAddImm> LoadAddImmSeq;
+
+  /*! Helper class to lower function arguments if required */
+  class FunctionArgumentLowerer : public Context
+  {
+  public:
+    /*! Build the helper structure */
+    FunctionArgumentLowerer(Unit &unit);
+    /*! Free everything we needed */
+    virtual ~FunctionArgumentLowerer(void);
+    /*! Perform all function argument substitutions if needed */
+    void lower(const std::string &name);
+    /*! Lower the given function argument accesses */
+    void lower(uint32_t argID);
+    /*! Build the constant push for the function */
+    void buildConstantPush(void);
+    /*! Inspect the given function argument to see how it is used. If it is only
+     *  read through direct loads, we also output the list of instructions used
+     *  for each load
+     */
+    ArgUse getArgUse(uint32_t argID);
+    /*! Recursively look if there is a store in the given use */
+    bool useStore(const ValueDef &def, set<const Instruction*> &visited);
+    /*! Look if the pointer use only load with immediate offsets */
+    bool matchLoadAddImm(uint32_t argID);
+    Liveness *liveness; //!< To compute the function graph
+    FunctionDAG *dag;   //!< Contains complete dependency information
+    LoadAddImmSeq seq;  //!< All the direct loads
+  };
+
+  INLINE uint64_t getOffsetFromImm(const Immediate &imm) {
+    switch (imm.getType()) {
+      // bit-cast these ones
+      case TYPE_DOUBLE:
+      case TYPE_FLOAT: NOT_SUPPORTED; return 0;
+      case TYPE_S64:
+      case TYPE_U64:
+      case TYPE_U32:
+      case TYPE_U16:
+      case TYPE_U8:
+      // sign extend these ones
+      case TYPE_S32:
+      case TYPE_S16:
+      case TYPE_S8: return imm.getIntegerValue();
+      case TYPE_BOOL:
+      case TYPE_HALF: NOT_SUPPORTED; return 0;
+      default:
+        GBE_ASSERT(0 && "Unsupported imm type.\n");
+    }
+    return 0;
+  }
+
+  bool matchLoad(Instruction *insn,
+                 Instruction *add,
+                 Instruction *loadImm,
+                 uint64_t offset,
+                 uint32_t argID,
+                 LoadAddImm &loadAddImm)
+  {
+    const Opcode opcode = insn->getOpcode();
+
+    if (opcode == OP_LOAD) {
+      LoadInstruction *load = cast<LoadInstruction>(insn);
+      if (load->getAddressSpace() != MEM_PRIVATE)
+        return false;
+      loadAddImm.load = insn;
+      loadAddImm.add = add;
+      loadAddImm.loadImm = loadImm;
+      loadAddImm.offset = offset;
+      loadAddImm.argID = argID;
+      return true;
+    } else
+      return false;
+  }
+
+
+  FunctionArgumentLowerer::FunctionArgumentLowerer(Unit &unit) :
+    Context(unit), liveness(NULL), dag(NULL) {}
+  FunctionArgumentLowerer::~FunctionArgumentLowerer(void) {
+    GBE_SAFE_DELETE(dag);
+    GBE_SAFE_DELETE(liveness);
+  }
+
+  void FunctionArgumentLowerer::lower(const std::string &functionName) {
+    if ((this->fn = unit.getFunction(functionName)) == NULL)
+      return;
+    GBE_SAFE_DELETE(dag);
+    GBE_SAFE_DELETE(liveness);
+    this->liveness = GBE_NEW(ir::Liveness, *fn);
+    this->dag = GBE_NEW(ir::FunctionDAG, *this->liveness);
+
+    // Process all structure arguments and find all the direct loads we can
+    // replace
+    const uint32_t argNum = fn->argNum();
+    for (uint32_t argID = 0; argID < argNum; ++argID) {
+      FunctionArgument &arg = fn->getArg(argID);
+      if (arg.type != FunctionArgument::STRUCTURE) continue;
+      this->lower(argID);
+    }
+
+    // Build the constant push description and remove the instructions that
+    // therefore become useless
+    this->buildConstantPush();
+  }
+
+// Remove all the given instructions from the stream (if dead)
+#define REMOVE_INSN(WHICH) \
+  for (const auto &loadAddImm : seq) { \
+    Instruction *WHICH = loadAddImm.WHICH; \
+    if (WHICH == NULL) continue; \
+    const UseSet &useSet = dag->getUse(WHICH, 0); \
+    bool isDead = true; \
+    for (auto use : useSet) { \
+      if (dead.contains(use->getInstruction()) == false) { \
+        isDead = false; \
+        break; \
+      } \
+    } \
+    if (isDead && !dead.contains(WHICH)) { \
+      dead.insert(WHICH); \
+      WHICH->remove(); \
+    } \
+  }
+
+  void FunctionArgumentLowerer::buildConstantPush(void)
+  {
+    if (seq.size() == 0)
+      return;
+
+    // Track instructions we remove to recursively kill them properly
+    set<const Instruction*> dead;
+
+    // The argument locations we have already pushed (the same argument location
+    // can be used several times)
+    set<PushLocation> inserted;
+    for (const auto &loadAddImm : seq) {
+      LoadInstruction *load = cast<LoadInstruction>(loadAddImm.load);
+      const uint32_t valueNum = load->getValueNum();
+      bool replaced = false;
+      Instruction *ins_after = load; // the instruction to insert after.
+      for (uint32_t valueID = 0; valueID < valueNum; ++valueID) {
+        const Type type = load->getValueType();
+        const RegisterFamily family = getFamily(type);
+        const uint32_t size = getFamilySize(family);
+        const uint32_t offset = loadAddImm.offset + valueID * size;
+        const PushLocation argLocation(*fn, loadAddImm.argID, offset);
+        Register pushed;
+        const Register reg = load->getValue(valueID);
+        if (offset != 0) {
+          if(inserted.contains(argLocation)) {
+            pushed = argLocation.getRegister();
+          } else {
+            // pushed register should be uniform register.
+            pushed = fn->newRegister(family, true);
+            this->appendPushedConstant(pushed, argLocation);
+            inserted.insert(argLocation);
+          }
+        } else {
+          pushed = fn->getArg(loadAddImm.argID).reg;
+        }
+
+        // TODO the MOV instruction can most of the time be avoided if the
+        // register is never written. We must however support register
+        // replacement in the instruction interface to be able to patch all the
+        // instructions that use "reg"
+        Instruction mov = ir::MOV(type, reg, pushed);
+        mov.insert(ins_after, &ins_after);
+        replaced = true;
+      }
+
+      if (replaced)
+        dead.insert(load);
+    }
+
+    REMOVE_INSN(load)
+    REMOVE_INSN(add)
+    REMOVE_INSN(loadImm)
+  }
+
+#undef REMOVE_INSN
+
+  bool FunctionArgumentLowerer::useStore(const ValueDef &def, set<const Instruction*> &visited)
+  {
+    const UseSet &useSet = dag->getUse(def);
+    for (const auto &use : useSet) {
+      const Instruction *insn = use->getInstruction();
+      const uint32_t srcID = use->getSrcID();
+      const Opcode opcode = insn->getOpcode();
+      if (visited.contains(insn)) continue;
+      visited.insert(insn);
+      if (opcode == OP_STORE && srcID == StoreInstruction::addressIndex)
+        return true;
+      if (insn->isMemberOf<UnaryInstruction>() == false &&
+          insn->isMemberOf<BinaryInstruction>() == false)
+        continue;
+      else {
+        const uint32_t dstNum = insn->getDstNum();
+        for (uint32_t dstID = 0; dstID < dstNum; ++dstID)
+          if (this->useStore(ValueDef(insn, dstID), visited) == true)
+            return true;
+      }
+    }
+    return false;
+  }
+
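+  // The pattern recognized below is, in IR terms, either
+  //   LOAD dst arg                                (offset 0)
+  // or
+  //   LOADI imm ; ADD ptr arg imm ; LOAD dst ptr  (constant offset)
+  // where imm, ptr and dst stand for the registers involved.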
+  bool FunctionArgumentLowerer::matchLoadAddImm(uint32_t argID)
+  {
+    const FunctionArgument &arg = fn->getArg(argID);
+    LoadAddImmSeq tmpSeq;
+
+    // Inspect all uses of the function argument pointer
+    const UseSet &useSet = dag->getUse(&arg);
+    for (auto use : useSet) {
+      Instruction *insn = const_cast<Instruction*>(use->getInstruction());
+      const Opcode opcode = insn->getOpcode();
+
+      // load dst arg
+      LoadAddImm loadAddImm;
+      if (matchLoad(insn, NULL, NULL, 0, argID, loadAddImm)) {
+        tmpSeq.push_back(loadAddImm);
+        continue;
+      }
+
+      // add.ptr_type dst ptr other
+      if (opcode != OP_ADD) return false;
+      BinaryInstruction *add = cast<BinaryInstruction>(insn);
+      const Type addType = add->getType();
+      const RegisterFamily family = getFamily(addType);
+      if (family != unit.getPointerFamily()) return false;
+      if (addType == TYPE_FLOAT) return false;
+
+      // step 1 -> check that the other source comes from a load immediate
+      const uint32_t srcID = use->getSrcID();
+      const uint32_t otherID = srcID ^ 1;
+      const DefSet &defSet = dag->getDef(insn, otherID);
+      const uint32_t defNum = defSet.size();
+      if (defNum == 0 || defNum > 1) continue; // undefined or more than one def
+      const ValueDef *otherDef = *defSet.begin();
+      if (otherDef->getType() != ValueDef::DEF_INSN_DST) return false;
+      Instruction *otherInsn = const_cast<Instruction*>(otherDef->getInstruction());
+      if (otherInsn->getOpcode() != OP_LOADI) return false;
+      LoadImmInstruction *loadImm = cast<LoadImmInstruction>(otherInsn);
+      const Immediate imm = loadImm->getImmediate();
+      const uint64_t offset = getOffsetFromImm(imm);
+
+      // step 2 -> check that the results of the add are loads from private
+      // memory
+      const UseSet &addUseSet = dag->getUse(add, 0);
+      for (auto addUse : addUseSet) {
+        Instruction *insn = const_cast<Instruction*>(addUse->getInstruction());
+
+        // We finally find something like load dst arg+imm
+        LoadAddImm loadAddImm;
+        if (matchLoad(insn, add, loadImm, offset, argID, loadAddImm)) {
+          tmpSeq.push_back(loadAddImm);
+          continue;
+        }
+      }
+    }
+
+    // OK, the argument only needs direct loads. We can now append all the
+    // direct load definitions we found
+    for (const auto &loadImmSeq : tmpSeq)
+      seq.push_back(loadImmSeq);
+    return true;
+  }
+
+  ArgUse FunctionArgumentLowerer::getArgUse(uint32_t argID)
+  {
+    FunctionArgument &arg = fn->getArg(argID);
+
+    // case 1 - we may store something to the structure argument
+    set<const Instruction*> visited;
+    if (this->useStore(ValueDef(&arg), visited))
+      return ARG_WRITTEN;
+
+    // case 2 - we look for the patterns: LOAD(ptr) or LOAD(ptr+imm)
+    if (this->matchLoadAddImm(argID))
+      return ARG_DIRECT_READ;
+
+    // case 3 - LOAD(ptr+runtime_value)
+    return ARG_INDIRECT_READ;
+  }
+
+  void FunctionArgumentLowerer::lower(uint32_t argID) {
+    IF_DEBUG(const ArgUse argUse = )this->getArgUse(argID);
+#if GBE_DEBUG
+    GBE_ASSERTM(argUse != ARG_WRITTEN,
+                "TODO A store to a structure argument "
+                "(i.e. not a char/short/int/float argument) has been found. "
+                "This is not supported yet");
+    GBE_ASSERTM(argUse != ARG_INDIRECT_READ,
+                "TODO Only direct loads of structure arguments are "
+                "supported now");
+#endif /* GBE_DEBUG */
+  }
+
+  void lowerFunctionArguments(Unit &unit, const std::string &functionName) {
+    FunctionArgumentLowerer lowerer(unit);
+    lowerer.lower(functionName);
+  }
+
+} /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/lowering.hpp b/backend/src/ir/lowering.hpp
new file mode 100644
index 0000000..ba0c87b
--- /dev/null
+++ b/backend/src/ir/lowering.hpp
@@ -0,0 +1,94 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file lowering.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *  Lower instructions that are not supported properly. Typical example is
+ *  handling returns or unsupported vector scatters / gathers
+ */
+
+#ifndef __GBE_IR_LOWERING_HPP__
+#define __GBE_IR_LOWERING_HPP__
+
+namespace gbe {
+namespace ir {
+
+  // Structure to update
+  class Unit;
+
+  /*! Remove all return instructions and replace them with forward branches that
+   *  point to the only return instruction, placed in a dedicated basic block at
+   *  the end of the function.
+   *  Typically this code:
+   *
+   *  dst[x] = 1;
+   *  if (x > 4) return;
+   *  dst[x] = 3;
+   *
+   *  will be replaced by:
+   *
+   *  dst[x] = 1;
+   *  if (x > 4) goto end;
+   *  dst[x] = 3;
+   *  end:
+   *  return;
+   *
+   *  There will be only one return at the end of the function. This return will
+   *  be simply encoded as an End-of-thread instruction (EOT)
+   */
+  void lowerReturn(Unit &unit, const std::string &functionName);
+
+  /*! Function arguments are a bit tricky since we must implement the proper C
+   *  semantics: we can therefore address the function arguments as we want and
+   *  even modify them. This leads to interesting challenges. We identify
+   *  several cases:
+   *
+   *  case 1:
+   *  int f (__global int *dst, int x[16], int y) {
+   *    dst[get_global_id(0)] = x[16] + y;
+   *  }
+   *  Here x and y will be pushed to registers using the Curbe. No problem, we
+   *  can directly use the pushed registers
+   *
+   *  case 2:
+   *  int f (__global int *dst, int x[16], int y) {
+   *    dst[get_global_id(0)] = x[get_local_id(0)] + y;
+   *  }
+   *  Here x is indirectly accessed. We need to perform a gather from memory. We
+   *  can simply gather it from the curbe in memory
+   *
+   *  case 3:
+   *  int f (__global int *dst, int x[16], int y) {
+   *    x[get_local_id(0)] = y + 1;
+   *    int *ptr = get_local_id(0) % 2 ? x[0] : x[1];
+   *    dst[get_global_id(0)] = *ptr;
+   *  }
+   *  Here we modify the function argument since it is valid C. The problem is
+   *  that we are running in SIMD mode while the data is scalar (in both memory
+   *  and registers). In that case, we just spill everything to memory (using
+   *  the stack) and reload it from there when needed.
+   */
+  void lowerFunctionArguments(Unit &unit, const std::string &functionName);
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_LOWERING_HPP__ */
+
diff --git a/backend/src/ir/printf.cpp b/backend/src/ir/printf.cpp
new file mode 100644
index 0000000..9d60402
--- /dev/null
+++ b/backend/src/ir/printf.cpp
@@ -0,0 +1,222 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file printf.cpp
+ *
+ */
+
+#include <stdarg.h>
+#include "printf.hpp"
+
+namespace gbe
+{
+  namespace ir
+  {
+
+    pthread_mutex_t PrintfSet::lock = PTHREAD_MUTEX_INITIALIZER;
+
+    uint32_t PrintfSet::append(PrintfFmt* fmt, Unit& unit)
+    {
+      fmts.push_back(*fmt);
+
+      for (auto &f : fmts.back()) {
+        if (f.type == PRINTF_SLOT_TYPE_STRING)
+          continue;
+
+        slots.push_back(&f);
+      }
+
+      /* Update the accumulated data size of all slots (per work-item). */
+      if (slots.size() > 0)
+        sizeOfSize = slots.back()->state->out_buf_sizeof_offset
+                     + getPrintfBufferElementSize(slots.size() - 1);
+
+      return (uint32_t)fmts.size();
+    }
+
+    static void generatePrintfFmtString(PrintfState& state, std::string& str)
+    {
+      char num_str[16];
+      str += "%";
+
+      if (state.left_justified) {
+        str += "-";
+      }
+
+      if (state.sign_symbol == 1) {
+        str += "+";
+      } else if (state.sign_symbol == 2) {
+        str += " ";
+      }
+
+      if (state.alter_form) {
+        str += "#";
+      }
+
+      if (state.zero_padding) {
+        str += "0";
+      }
+
+      if (state.min_width >= 0) {
+        snprintf(num_str, 16, "%d", state.min_width);
+        str += num_str;
+      }
+
+      if (state.precision >= 0) {
+        str += ".";
+        snprintf(num_str, 16, "%d", state.precision);
+        str += num_str;
+      }
+
+      switch (state.length_modifier) {
+        case PRINTF_LM_HH:
+          str += "hh";
+          break;
+        case PRINTF_LM_H:
+          str += "h";
+          break;
+        case PRINTF_LM_L:
+          str += "l";
+          break;
+        case PRINTF_LM_HL:
+          str += "";
+          break;
+        default:
+          assert(state.length_modifier == PRINTF_LM_NONE);
+      }
+    }
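+    // For a state with left_justified set, min_width 8 and precision 2, the
+    // string built above is "%-8.2"; the conversion character itself (e.g. 'f')
+    // is appended later, when the slot is actually printed.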
+
+#define PRINT_SOMETHING(target_ty, conv)  do {                          \
+      if (!vec_i)                                                       \
+        pf_str = pf_str + std::string(#conv);                           \
+      printf(pf_str.c_str(),                                            \
+             ((target_ty *)((char *)buf_addr + slot.state->out_buf_sizeof_offset * \
+                            global_wk_sz0 * global_wk_sz1 * global_wk_sz2)) \
+             [(k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i) * vec_num + vec_i]);\
+    } while (0)
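+// The value printed above is fetched assuming a slot-major buffer layout: each
+// slot occupies (element size * total work-item count) bytes starting at
+// out_buf_sizeof_offset * total work-item count, and within a slot the values
+// are indexed by work-item id (times vec_num for vector arguments).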
+
+
+    void PrintfSet::outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0,
+                                 size_t global_wk_sz1, size_t global_wk_sz2)
+    {
+      LockOutput lock;
+      size_t i, j, k;
+      std::string pf_str;
+      int stmt = 0;
+
+      for (auto &pf : fmts) {
+        for (i = 0; i < global_wk_sz0; i++) {
+          for (j = 0; j < global_wk_sz1; j++) {
+            for (k = 0; k < global_wk_sz2; k++) {
+
+              int flag = ((int *)index_addr)[stmt*global_wk_sz0*global_wk_sz1*global_wk_sz2
+                                             + k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i];
+              if (flag) {
+                for (auto &slot : pf) {
+                  pf_str = "";
+                  int vec_num;
+
+                  if (slot.type == PRINTF_SLOT_TYPE_STRING) {
+                    printf("%s", slot.str);
+                    continue;
+                  }
+                  assert(slot.type == PRINTF_SLOT_TYPE_STATE);
+
+                  generatePrintfFmtString(*slot.state, pf_str);
+
+                  vec_num = slot.state->vector_n > 0 ? slot.state->vector_n : 1;
+
+                  for (int vec_i = 0; vec_i < vec_num; vec_i++) {
+                    if (vec_i)
+                      printf(",");
+
+                    switch (slot.state->conversion_specifier) {
+                      case PRINTF_CONVERSION_D:
+                      case PRINTF_CONVERSION_I:
+                        PRINT_SOMETHING(int, d);
+                        break;
+
+                      case PRINTF_CONVERSION_O:
+                        PRINT_SOMETHING(int, o);
+                        break;
+                      case PRINTF_CONVERSION_U:
+                        PRINT_SOMETHING(int, u);
+                        break;
+                      case PRINTF_CONVERSION_X:
+                        PRINT_SOMETHING(int, X);
+                        break;
+                      case PRINTF_CONVERSION_x:
+                        PRINT_SOMETHING(int, x);
+                        break;
+
+                      case PRINTF_CONVERSION_C:
+                        PRINT_SOMETHING(char, c);
+                        break;
+
+                      case PRINTF_CONVERSION_F:
+                        PRINT_SOMETHING(float, F);
+                        break;
+                      case PRINTF_CONVERSION_f:
+                        PRINT_SOMETHING(float, f);
+                        break;
+                      case PRINTF_CONVERSION_E:
+                        PRINT_SOMETHING(float, E);
+                        break;
+                      case PRINTF_CONVERSION_e:
+                        PRINT_SOMETHING(float, e);
+                        break;
+                      case PRINTF_CONVERSION_G:
+                        PRINT_SOMETHING(float, G);
+                        break;
+                      case PRINTF_CONVERSION_g:
+                        PRINT_SOMETHING(float, g);
+                        break;
+                      case PRINTF_CONVERSION_A:
+                        PRINT_SOMETHING(float, A);
+                        break;
+                      case PRINTF_CONVERSION_a:
+                        PRINT_SOMETHING(float, a);
+                        break;
+                      case PRINTF_CONVERSION_P:
+                        PRINT_SOMETHING(int, p);
+                        break;
+
+                      case PRINTF_CONVERSION_S:
+                        pf_str = pf_str + "s";
+                        printf(pf_str.c_str(), slot.state->str.c_str());
+                        break;
+
+                      default:
+                        assert(0);
+                        return;
+                    }
+                  }
+
+                  pf_str = "";
+                }
+              }
+            }
+          }
+        }
+        stmt++;
+      }
+    }
+  } /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/printf.hpp b/backend/src/ir/printf.hpp
new file mode 100644
index 0000000..4db7245
--- /dev/null
+++ b/backend/src/ir/printf.hpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file printf.hpp
+ *
+ */
+#ifndef __GBE_IR_PRINTF_HPP__
+#define __GBE_IR_PRINTF_HPP__
+
+#include <string.h>
+#include "sys/map.hpp"
+#include "sys/vector.hpp"
+#include "unit.hpp"
+
+namespace gbe
+{
+  namespace ir
+  {
+
+    /* Things about printf info. */
+    enum {
+      PRINTF_LM_NONE,
+      PRINTF_LM_HH,
+      PRINTF_LM_H,
+      PRINTF_LM_L,
+      PRINTF_LM_HL,
+    };
+
+    enum {
+      PRINTF_CONVERSION_INVALID,
+      PRINTF_CONVERSION_D,
+      PRINTF_CONVERSION_I,
+      PRINTF_CONVERSION_O,
+      PRINTF_CONVERSION_U,
+      PRINTF_CONVERSION_X,
+      PRINTF_CONVERSION_x,
+      PRINTF_CONVERSION_F,
+      PRINTF_CONVERSION_f,
+      PRINTF_CONVERSION_E,
+      PRINTF_CONVERSION_e,
+      PRINTF_CONVERSION_G,
+      PRINTF_CONVERSION_g,
+      PRINTF_CONVERSION_A,
+      PRINTF_CONVERSION_a,
+      PRINTF_CONVERSION_C,
+      PRINTF_CONVERSION_S,
+      PRINTF_CONVERSION_P
+    };
+
+    struct PrintfState {
+      char left_justified;
+      char sign_symbol; //0 for nothing, 1 for sign, 2 for space.
+      char alter_form;
+      char zero_padding;
+      char vector_n;
+      int min_width;
+      int precision;
+      int length_modifier;
+      char conversion_specifier;
+      int out_buf_sizeof_offset;  // Multiply by the total work-item count to get the full offset.
+      std::string str;            // For %s, the string is stored here.
+    };
+
+    enum {
+      PRINTF_SLOT_TYPE_NONE,
+      PRINTF_SLOT_TYPE_STRING,
+      PRINTF_SLOT_TYPE_STATE
+    };
+
+    struct PrintfSlot {
+      int type;
+      union {
+        char* str;
+        PrintfState* state;
+        void *ptr;
+      };
+
+      PrintfSlot(void) {
+        type = PRINTF_SLOT_TYPE_NONE;
+        ptr = NULL;
+      }
+
+      PrintfSlot(const char * s) {
+        type = PRINTF_SLOT_TYPE_STRING;
+        int len = strlen(s);
+        str = (char*)malloc((len + 1) * sizeof(char));
+        memcpy(str, s, (len + 1) * sizeof(char));
+        str[len] = 0;
+      }
+
+      PrintfSlot(PrintfState * st) {
+        type = PRINTF_SLOT_TYPE_STATE;
+        state = (PrintfState *)malloc(sizeof(PrintfState));
+        memcpy(state, st, sizeof(PrintfState));
+      }
+
+      PrintfSlot(const PrintfSlot & other) {
+        if (other.type == PRINTF_SLOT_TYPE_STRING) {
+          int len = strlen(other.str);
+          str = (char*)malloc((len + 1) * sizeof(char));
+          memcpy(str, other.str, (len + 1) * sizeof(char));
+          str[len] = 0;
+          type = PRINTF_SLOT_TYPE_STRING;
+        } else if (other.type == PRINTF_SLOT_TYPE_STATE) {
+          type = PRINTF_SLOT_TYPE_STATE;
+          state = (PrintfState *)malloc(sizeof(PrintfState));
+          memcpy(state, other.state, sizeof(PrintfState));
+        } else {
+          type = PRINTF_SLOT_TYPE_NONE;
+          ptr = NULL;
+        }
+      }
+
+      PrintfSlot(PrintfSlot && other) {
+        // Steal the payload and leave the source empty so that its destructor
+        // does not free the transferred pointer (our own ptr is still
+        // uninitialized here, so it must not be handed back to other).
+        type = other.type;
+        ptr = other.ptr;
+        other.type = PRINTF_SLOT_TYPE_NONE;
+        other.ptr = NULL;
+      }
+
+      ~PrintfSlot(void) {
+        if (ptr)
+          free(ptr);
+      }
+    };
+
+    class Context;
+
+    class PrintfSet //: public Serializable
+    {
+    public:
+      PrintfSet(const PrintfSet& other) {
+        for (auto &f : other.fmts) {
+          fmts.push_back(f);
+        }
+
+        for (auto &s : other.slots) {
+          slots.push_back(s);
+        }
+
+        sizeOfSize = other.sizeOfSize;
+        btiBuf = other.btiBuf;
+        btiIndexBuf = other.btiIndexBuf;
+      }
+
+      PrintfSet(void) = default;
+
+      struct LockOutput {
+        LockOutput(void) {
+          pthread_mutex_lock(&lock);
+        }
+
+        ~LockOutput(void) {
+          pthread_mutex_unlock(&lock);
+        }
+      };
+
+      typedef vector<PrintfSlot> PrintfFmt;
+      uint32_t append(PrintfFmt* fmt, Unit &unit);
+
+      uint32_t getPrintfNum(void) const {
+        return fmts.size();
+      }
+
+      uint32_t getPrintfSizeOfSize(void) const {
+        return sizeOfSize;
+      }
+
+      void setBufBTI(uint8_t b)      { btiBuf = b; }
+      void setIndexBufBTI(uint8_t b) { btiIndexBuf = b; }
+      uint8_t getBufBTI() const      { return btiBuf; }
+      uint8_t getIndexBufBTI() const { return btiIndexBuf; }
+
+      uint32_t getPrintfBufferElementSize(uint32_t i) {
+        PrintfSlot* slot = slots[i];
+        int vec_num = 1;
+        if (slot->state->vector_n > 0) {
+          vec_num = slot->state->vector_n;
+        }
+
+        assert(vec_num > 0 && vec_num <= 16);
+
+        switch (slot->state->conversion_specifier) {
+          case PRINTF_CONVERSION_I:
+          case PRINTF_CONVERSION_D:
+          case PRINTF_CONVERSION_O:
+          case PRINTF_CONVERSION_U:
+          case PRINTF_CONVERSION_X:
+          case PRINTF_CONVERSION_x:
+          case PRINTF_CONVERSION_P:
+          /* Char will be aligned to sizeof(int) here. */
+          case PRINTF_CONVERSION_C:
+            return (uint32_t)(sizeof(int) * vec_num);
+          case PRINTF_CONVERSION_E:
+          case PRINTF_CONVERSION_e:
+          case PRINTF_CONVERSION_F:
+          case PRINTF_CONVERSION_f:
+          case PRINTF_CONVERSION_G:
+          case PRINTF_CONVERSION_g:
+          case PRINTF_CONVERSION_A:
+          case PRINTF_CONVERSION_a:
+            return (uint32_t)(sizeof(float) * vec_num);
+          case PRINTF_CONVERSION_S:
+            return (uint32_t)0;
+          default:
+            break;
+        }
+        assert(0);
+        return 0;
+      }
+
+      void outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0,
+                        size_t global_wk_sz1, size_t global_wk_sz2);
+
+    private:
+      vector<PrintfFmt> fmts;
+      vector<PrintfSlot*> slots;
+      uint32_t sizeOfSize; // Total data size of all slots, per work-item.
+      friend struct LockOutput;
+      uint8_t btiBuf;
+      uint8_t btiIndexBuf;
+      static pthread_mutex_t lock;
+      GBE_CLASS(PrintfSet);
+    };
+  } /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_PRINTF_HPP__ */
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
new file mode 100644
index 0000000..fc69367
--- /dev/null
+++ b/backend/src/ir/profile.cpp
@@ -0,0 +1,106 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file profile.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "ir/profile.hpp"
+#include "ir/function.hpp"
+#include "sys/platform.hpp"
+
+namespace gbe {
+namespace ir {
+
+  namespace ocl
+  {
+    const char *specialRegMean[] = {
+        "local_id_0", "local_id_1", "local_id_2",
+        "group_id_0", "group_id_1", "group_id_2",
+        "num_groups_0", "num_groups_1", "num_groups_2",
+        "local_size_0", "local_size_1", "local_size_2",
+        "global_size_0", "global_size_1", "global_size_2",
+        "global_offset_0", "global_offset_1", "global_offset_2",
+        "stack_pointer", "stack_buffer",
+        "block_ip",
+        "barrier_id", "thread_number", "work_dimension",
+        "zero", "one",
+        "retVal", "slm_offset",
+        "printf_buffer_pointer", "printf_index_buffer_pointer",
+        "invalid"
+    };
+
+#if GBE_DEBUG
+#define DECL_NEW_REG(FAMILY, REG, UNIFORM) \
+   r = fn.newRegister(FAMILY_DWORD, UNIFORM); \
+   GBE_ASSERT(r == REG);
+#else
+#define DECL_NEW_REG(FAMILY, REG, UNIFORM) \
+   fn.newRegister(FAMILY_DWORD, UNIFORM);
+#endif /* GBE_DEBUG */
+    static void init(Function &fn) {
+      IF_DEBUG(Register r);
+      DECL_NEW_REG(FAMILY_DWORD, lid0, 0);
+      DECL_NEW_REG(FAMILY_DWORD, lid1, 0);
+      DECL_NEW_REG(FAMILY_DWORD, lid2, 0);
+      DECL_NEW_REG(FAMILY_DWORD, groupid0, 1);
+      DECL_NEW_REG(FAMILY_DWORD, groupid1, 1);
+      DECL_NEW_REG(FAMILY_DWORD, groupid2, 1);
+      DECL_NEW_REG(FAMILY_DWORD, numgroup0, 1);
+      DECL_NEW_REG(FAMILY_DWORD, numgroup1, 1);
+      DECL_NEW_REG(FAMILY_DWORD, numgroup2, 1);
+      DECL_NEW_REG(FAMILY_DWORD, lsize0, 1);
+      DECL_NEW_REG(FAMILY_DWORD, lsize1, 1);
+      DECL_NEW_REG(FAMILY_DWORD, lsize2, 1);
+      DECL_NEW_REG(FAMILY_DWORD, gsize0, 1);
+      DECL_NEW_REG(FAMILY_DWORD, gsize1, 1);
+      DECL_NEW_REG(FAMILY_DWORD, gsize2, 1);
+      DECL_NEW_REG(FAMILY_DWORD, goffset0, 1);
+      DECL_NEW_REG(FAMILY_DWORD, goffset1, 1);
+      DECL_NEW_REG(FAMILY_DWORD, goffset2, 1);
+      DECL_NEW_REG(FAMILY_DWORD, stackptr, 0);
+      DECL_NEW_REG(FAMILY_DWORD, stackbuffer, 1);
+      DECL_NEW_REG(FAMILY_WORD,  blockip, 0);
+      DECL_NEW_REG(FAMILY_DWORD, barrierid, 1);
+      DECL_NEW_REG(FAMILY_DWORD, threadn, 1);
+      DECL_NEW_REG(FAMILY_DWORD, workdim, 1);
+      DECL_NEW_REG(FAMILY_DWORD, zero, 1);
+      DECL_NEW_REG(FAMILY_DWORD, one, 1);
+      DECL_NEW_REG(FAMILY_WORD, retVal, 1);
+      DECL_NEW_REG(FAMILY_WORD, slmoffset, 1);
+      DECL_NEW_REG(FAMILY_DWORD, printfbptr, 1);
+      DECL_NEW_REG(FAMILY_DWORD, printfiptr, 1);
+      DECL_NEW_REG(FAMILY_DWORD, invalid, 1);
+    }
+#undef DECL_NEW_REG
+
+  } /* namespace ocl */
+
+  void initProfile(Function &fn) {
+    const Profile profile = fn.getProfile();
+    switch (profile) {
+      case PROFILE_C: GBE_ASSERTM(false, "Unsupported profile"); break;
+      case PROFILE_OCL: ocl::init(fn);
+    };
+  }
+
+} /* namespace ir */
+} /* namespace gbe */
+
+
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
new file mode 100644
index 0000000..4e89bdd
--- /dev/null
+++ b/backend/src/ir/profile.hpp
@@ -0,0 +1,86 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file profile.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_PROFILE_HPP__
+#define __GBE_IR_PROFILE_HPP__
+
+#include "ir/register.hpp"
+
+namespace gbe {
+namespace ir {
+
+  /*! Profile is defined *per-function* and mostly consists of predefined registers */
+  enum Profile : uint32_t {
+    PROFILE_C = 0,  // Not used now
+    PROFILE_OCL = 1
+  };
+
+  // Will be pre-initialized based on its profile
+  class Function;
+
+  /*! Registers used for ocl */
+  namespace ocl
+  {
+    static const Register lid0 = Register(0);      // get_local_id(0)
+    static const Register lid1 = Register(1);      // get_local_id(1)
+    static const Register lid2 = Register(2);      // get_local_id(2)
+    static const Register groupid0 = Register(3);  // get_group_id(0)
+    static const Register groupid1 = Register(4);  // get_group_id(1)
+    static const Register groupid2 = Register(5);  // get_group_id(2)
+    static const Register numgroup0 = Register(6); // get_num_groups(0)
+    static const Register numgroup1 = Register(7); // get_num_groups(1)
+    static const Register numgroup2 = Register(8); // get_num_groups(2)
+    static const Register lsize0 = Register(9);    // get_local_size(0)
+    static const Register lsize1 = Register(10);   // get_local_size(1)
+    static const Register lsize2 = Register(11);   // get_local_size(2)
+    static const Register gsize0 = Register(12);   // get_global_size(0)
+    static const Register gsize1 = Register(13);   // get_global_size(1)
+    static const Register gsize2 = Register(14);   // get_global_size(2)
+    static const Register goffset0 = Register(15); // get_global_offset(0)
+    static const Register goffset1 = Register(16); // get_global_offset(1)
+    static const Register goffset2 = Register(17); // get_global_offset(2)
+    static const Register stackptr = Register(18); // stack pointer
+    static const Register stackbuffer = Register(19); // stack buffer base address.
+    static const Register blockip = Register(20);  // blockip
+    static const Register barrierid = Register(21);// barrierid
+    static const Register threadn = Register(22);  // number of threads
+    static const Register workdim = Register(23);  // work dimension.
+    static const Register zero = Register(24);     // scalar register that holds zero.
+    static const Register one = Register(25);      // scalar register that holds one.
+    static const Register retVal = Register(26);   // helper register to do data flow analysis.
+    static const Register slmoffset = Register(27); // group's SLM offset within the total 64KB SLM
+    static const Register printfbptr = Register(28); // printf buffer address.
+    static const Register printfiptr = Register(29); // printf index buffer address.
+    static const Register invalid = Register(30);  // used for validity comparison.
+    static const uint32_t regNum = 31;             // number of special registers
+    extern const char *specialRegMean[];           // special register name.
+  } /* namespace ocl */
+
+  /*! Initialize the profile of the given function */
+  void initProfile(Function &fn);
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_PROFILE_HPP__ */
+
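
The predefined registers above are, by construction, the first ocl::regNum entries of every OCL function's register file (initProfile in profile.cpp allocates them in exactly this order). A minimal sketch that walks the name table declared here:

    #include "ir/profile.hpp"
    #include <cstdint>
    #include <iostream>

    // Print the meaning of every predefined OpenCL special register, using
    // the name table defined in profile.cpp (ocl::specialRegMean).
    static void dumpSpecialRegisters(void) {
      using namespace gbe::ir;
      for (uint32_t i = 0; i < ocl::regNum; ++i)
        std::cout << "%" << i << " -> " << ocl::specialRegMean[i] << std::endl;
    }
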
diff --git a/backend/src/ir/register.cpp b/backend/src/ir/register.cpp
new file mode 100644
index 0000000..471bfbd
--- /dev/null
+++ b/backend/src/ir/register.cpp
@@ -0,0 +1,67 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file register.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "ir/profile.hpp"
+#include "ir/register.hpp"
+
+namespace gbe {
+namespace ir {
+
+  std::ostream &operator<< (std::ostream &out, const RegisterData &regData)
+  {
+    switch (regData.family) {
+      case FAMILY_BOOL: return out << "bool";
+      case FAMILY_BYTE: return out << "byte";
+      case FAMILY_WORD: return out << "word";
+      case FAMILY_DWORD: return out << "dword";
+      case FAMILY_QWORD: return out << "qword";
+    };
+    return out;
+  }
+
+  std::ostream &operator<< (std::ostream &out, const RegisterFile &file)
+  {
+    out << "## " << file.regNum() << " register"
+        << (file.regNum() ? "s" : "") << " ##" << std::endl;
+    for (uint32_t i = 0; i < file.regNum(); ++i) {
+      const RegisterData reg = file.get(Register(i));
+      out << ".decl." << reg << " %" << i;
+      if (i < ocl::regNum)
+        out << " " << ocl::specialRegMean[i];
+      out << std::endl;
+    }
+    return out;
+  }
+
+  Tuple RegisterFile::appendArrayTuple(const Register *reg, uint32_t regNum) {
+    const Tuple index = Tuple(regTuples.size());
+    for (uint32_t regID = 0; regID < regNum; ++regID) {
+      GBE_ASSERTM(reg[regID] < this->regNum(), "Out-of-bound register");
+      regTuples.push_back(reg[regID]);
+    }
+    return index;
+  }
+
+} /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/register.hpp b/backend/src/ir/register.hpp
new file mode 100644
index 0000000..7bd4f6e
--- /dev/null
+++ b/backend/src/ir/register.hpp
@@ -0,0 +1,170 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file register.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_REGISTER_HPP__
+#define __GBE_IR_REGISTER_HPP__
+
+#include "sys/vector.hpp"
+#include "sys/platform.hpp"
+
+namespace gbe {
+namespace ir {
+
+  /*! Defines the size of the pointers. All the functions from the unit will
+   *  use the same pointer size as the unit they belong to
+   */
+  enum PointerSize {
+    POINTER_32_BITS = 32,
+    POINTER_64_BITS = 64
+  };
+
+  /*! Basically provides the size of the register */
+  enum RegisterFamily : uint8_t {
+    FAMILY_BOOL  = 0,
+    FAMILY_BYTE  = 1,
+    FAMILY_WORD  = 2,
+    FAMILY_DWORD = 3,
+    FAMILY_QWORD = 4
+  };
+
+  INLINE char getFamilyName(RegisterFamily family) {
+    static char registerFamilyName[] = {'b', 'B', 'W', 'D', 'Q'};
+    return registerFamilyName[family];
+  }
+
+  INLINE uint32_t getFamilySize(RegisterFamily family) {
+    switch (family) {
+      case FAMILY_BYTE: return 1;
+      case FAMILY_WORD: return 2;
+      case FAMILY_DWORD: return 4;
+      case FAMILY_QWORD: return 8;
+      default: NOT_SUPPORTED;
+    };
+    return 0;
+  }
+
+  /*! A register can be either a byte, a word, a dword or a qword. We store this
+   *  information in a RegisterData entry; the register file is the collection of these entries.
+   */
+  class RegisterData
+  {
+  public:
+    /*! Build a register. All fields will be immutable */
+    INLINE RegisterData(RegisterFamily family,
+                        bool uniform = false) : family(family), uniform(uniform) {}
+    /*! Copy constructor */
+    INLINE RegisterData(const RegisterData &other) : family(other.family), uniform(other.uniform) {}
+    /*! Copy operator */
+    INLINE RegisterData &operator= (const RegisterData &other) {
+      this->family = other.family;
+      this->uniform = other.uniform;
+      return *this;
+    }
+    /*! Nothing really happens here */
+    INLINE ~RegisterData(void) {}
+    RegisterFamily family;            //!< Register size (or whether it is a flag)
+    INLINE bool isUniform() const { return uniform; }
+    INLINE void setUniform(bool uni) { uniform = uni; }
+  private:
+    bool uniform;
+    GBE_CLASS(RegisterData);
+  };
+
+  /*! Output the register file string in the given stream */
+  std::ostream &operator<< (std::ostream &out, const RegisterData &regData);
+
+  /*! Register is the index of the register data in the register file.
+   *  We enforce type safety with this class
+   */
+  TYPE_SAFE(Register, uint16_t)
+  INLINE bool operator< (const Register &r0, const Register &r1) {
+    return r0.value() < r1.value();
+  }
+
+  /*! Tuple is the position of the first register in the tuple vector. We
+   *  enforce type safety with this class
+   */
+  TYPE_SAFE(Tuple, uint16_t)
+
+  /*! A register file allocates and destroys registers. Basically, we will have
+   *  one register file per function
+   */
+  class RegisterFile
+  {
+  public:
+    /*! Return the index of a newly allocated register */
+    INLINE Register append(RegisterFamily family, bool uniform = false) {
+      GBE_ASSERTM(regNum() < MAX_INDEX,
+                  "Too many defined registers (only 65535 are supported)");
+      const uint16_t index = regNum();
+      const RegisterData reg(family, uniform);
+      regs.push_back(reg);
+      return Register(index);
+    }
+    /*! Make a tuple from an array of registers */
+    Tuple appendArrayTuple(const Register *reg, uint32_t regNum);
+    /*! Make a tuple and return the index to the first element of the tuple */
+    template <typename First, typename... Rest>
+    INLINE Tuple appendTuple(First first, Rest... rest) {
+      const Tuple index = Tuple(regTuples.size());
+      GBE_ASSERTM(first < regNum(), "Out-of-bound register");
+      regTuples.push_back(first);
+      appendTuple(rest...);
+      return index;
+    }
+    /*! To terminate variadic recursion */
+    INLINE void appendTuple(void) {}
+    /*! Return a copy of the register at index */
+    INLINE RegisterData get(Register index) const { return regs[index]; }
+    /*! Return true if the specified register is uniform type. */
+    INLINE bool isUniform(Register index) { return regs[index].isUniform(); }
+    /*! Set a register to uniform or varying data type */
+    INLINE void setUniform(Register index, bool uniform) { regs[index].setUniform(uniform); }
+    /*! Get the register index from the tuple */
+    INLINE Register get(Tuple index, uint32_t which) const {
+      return regTuples[uint16_t(index) + which];
+    }
+    /*! Set the register index from the tuple */
+    INLINE void set(Tuple index, uint32_t which, Register reg) {
+      regTuples[uint16_t(index) + which] = reg;
+    }
+    /*! Number of registers in the register file */
+    INLINE uint32_t regNum(void) const { return regs.size(); }
+    /*! Number of tuples in the register file */
+    INLINE uint32_t tupleNum(void) const { return regTuples.size(); }
+    /*! register and tuple indices are short */
+    enum { MAX_INDEX = 0xffff }; 
+  private:
+    vector<RegisterData> regs;   //!< All the registers together
+    vector<Register> regTuples;  //!< Tuples are used for many src / dst
+    GBE_CLASS(RegisterFile);
+  };
+
+  /*! Output the register file string in the given stream */
+  std::ostream &operator<< (std::ostream &out, const RegisterFile &file);
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_REGISTER_HPP__ */
+
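
A short usage sketch of the RegisterFile API declared above (illustrative only; inside the compiler the register file is owned by a Function rather than built standalone):

    #include "ir/register.hpp"

    static void registerFileExample(void) {
      using namespace gbe::ir;
      RegisterFile file;
      const Register a = file.append(FAMILY_DWORD);        // %0
      const Register b = file.append(FAMILY_DWORD);        // %1
      const Register c = file.append(FAMILY_QWORD, true);  // %2, uniform
      // Group the three registers into one tuple; the returned Tuple indexes
      // the first element of the group in the tuple vector.
      const Tuple t = file.appendTuple(a, b, c);
      GBE_ASSERT(file.get(t, 0).value() == a.value());
      GBE_ASSERT(file.regNum() == 3 && file.isUniform(c) == true);
    }
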
diff --git a/backend/src/ir/sampler.cpp b/backend/src/ir/sampler.cpp
new file mode 100644
index 0000000..7e8355f
--- /dev/null
+++ b/backend/src/ir/sampler.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file sampler.cpp
+ *
+ */
+#include "sampler.hpp"
+#include "context.hpp"
+#include "ocl_common_defines.h"
+
+namespace gbe {
+namespace ir {
+
+#ifdef GBE_COMPILER_AVAILABLE
+  uint8_t SamplerSet::appendReg(uint32_t key, Context *ctx) {
+    uint8_t samplerSlot = samplerMap.size();
+    samplerMap.insert(std::make_pair(key, samplerSlot));
+    return samplerSlot;
+  }
+
+  uint8_t SamplerSet::append(uint32_t samplerValue, Context *ctx)
+  {
+    auto it = samplerMap.find(samplerValue);
+    if (it != samplerMap.end())
+        return it->second;
+    // This register is just used as a key.
+    return appendReg(samplerValue, ctx);
+  }
+
+#define SAMPLER_ID(id) ((id << __CLK_SAMPLER_ARG_BASE) | __CLK_SAMPLER_ARG_KEY_BIT)
+  uint8_t SamplerSet::append(Register samplerReg, Context *ctx)
+  {
+    ir::FunctionArgument *arg =  ctx->getFunction().getArg(samplerReg);
+    GBE_ASSERT(arg != NULL);
+
+    // XXX As LLVM 3.1/3.2 do not have a dedicated data type for sampler_t, we have to fix up the
+    // argument type here. Once we switch to an LLVM version that provides the sampler_t type, we
+    // can remove this workaround.
+    arg->type = ir::FunctionArgument::SAMPLER;
+    arg->info.typeName = "sampler_t";
+    int32_t id = ctx->getFunction().getArgID(arg);
+    GBE_ASSERT(id < (1 << __CLK_SAMPLER_ARG_BITS));
+
+    auto it = samplerMap.find(SAMPLER_ID(id));
+    if (it != samplerMap.end()) {
+      return it->second;
+    }
+    return appendReg(SAMPLER_ID(id), ctx);
+  }
+#endif
+
+#define OUT_UPDATE_SZ(elt) SERIALIZE_OUT(elt, outs, ret_size)
+#define IN_UPDATE_SZ(elt) DESERIALIZE_IN(elt, ins, total_size)
+
+  /*! Implements the serialization. */
+  size_t SamplerSet::serializeToBin(std::ostream& outs) {
+    size_t ret_size = 0;
+
+    OUT_UPDATE_SZ(magic_begin);
+
+    OUT_UPDATE_SZ(samplerMap.size());
+    for (auto iter : samplerMap) {
+      OUT_UPDATE_SZ(iter.first);
+      OUT_UPDATE_SZ(iter.second);
+    }
+
+    OUT_UPDATE_SZ(magic_end);
+    OUT_UPDATE_SZ(ret_size);
+
+    return ret_size;
+  }
+
+  size_t SamplerSet::deserializeFromBin(std::istream& ins) {
+    size_t total_size = 0;
+    uint32_t magic;
+    size_t sampler_map_sz = 0;
+
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_begin)
+      return 0;
+
+    IN_UPDATE_SZ(sampler_map_sz);
+    for (size_t i = 0; i < sampler_map_sz; i++) {
+      uint32_t key;
+      uint32_t slot;
+
+      IN_UPDATE_SZ(key);
+      IN_UPDATE_SZ(slot);
+      samplerMap.insert(std::make_pair(key, slot));
+    }
+
+    IN_UPDATE_SZ(magic);
+    if (magic != magic_end)
+      return 0;
+
+    size_t total_bytes;
+    IN_UPDATE_SZ(total_bytes);
+    if (total_bytes + sizeof(total_size) != total_size)
+      return 0;
+
+    return total_size;
+  }
+
+  void SamplerSet::printStatus(int indent, std::ostream& outs) {
+    using namespace std;
+    string spaces = indent_to_str(indent);
+    string spaces_nl = indent_to_str(indent + 4);
+
+    outs << spaces << "------------ Begin SamplerSet ------------" << "\n";
+
+    outs << spaces_nl << "  SamplerSet Map: [index, sampler_reg, sampler_slot]\n";
+    outs << spaces_nl << "     samplerMap size: " << samplerMap.size() << "\n";
+
+    for (auto iter : samplerMap) {
+      outs << spaces_nl <<  "     [" << iter.first << ", "
+           << iter.second << "]\n";
+    }
+
+    outs << spaces << "------------- End SamplerSet -------------" << "\n";
+  }
+
+} /* namespace ir */
+} /* namespace gbe */
diff --git a/backend/src/ir/sampler.hpp b/backend/src/ir/sampler.hpp
new file mode 100644
index 0000000..2b51ce3
--- /dev/null
+++ b/backend/src/ir/sampler.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file sampler.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_SAMPLER_HPP__
+#define __GBE_IR_SAMPLER_HPP__
+
+#include "ir/register.hpp"
+#include "sys/map.hpp"
+
+namespace gbe {
+namespace ir {
+
+  /*! A sampler set is a set of global samplers which are defined either as constant
+   * global samplers or as variables in the outermost kernel scope. According to the spec,
+   * all such variables must have an initialized integer value and cannot be modified.
+   */
+  class Context;
+
+  class SamplerSet : public Serializable
+  {
+  public:
+    /*! Append the specified sampler and return the allocated offset.
+     *  If the specified sampler already exists, just return the previously allocated
+     *  offset without appending it again. Return -1 on failure. */
+    uint8_t append(uint32_t clkSamplerValue, Context *ctx);
+    /*! Append a sampler defined in kernel args. */
+    uint8_t append(Register samplerArg, Context *ctx);
+    size_t getDataSize(void) { return samplerMap.size(); }
+    size_t getDataSize(void) const { return samplerMap.size(); }
+    void getData(uint32_t *samplers) const {
+      for(auto &it : samplerMap)
+        samplers[it.second] = it.first;
+    }
+
+    void operator = (const SamplerSet& other) {
+      samplerMap.insert(other.samplerMap.begin(), other.samplerMap.end());
+    }
+
+    bool empty() const { return samplerMap.empty(); }
+
+    SamplerSet(const SamplerSet& other) : samplerMap(other.samplerMap.begin(), other.samplerMap.end()) { }
+    SamplerSet() {}
+
+    static const uint32_t magic_begin = TO_MAGIC('S', 'A', 'M', 'P');
+    static const uint32_t magic_end = TO_MAGIC('P', 'M', 'A', 'S');
+
+    /* format (see serializeToBin in sampler.cpp):
+       magic_begin     |
+       samplerMap_size |
+       element_1       |
+       ........        |
+       element_n       |
+       magic_end       |
+       total_size
+    */
+
+    /*! Implements the serialization. */
+    virtual size_t serializeToBin(std::ostream& outs);
+    virtual size_t deserializeFromBin(std::istream& ins);
+    virtual void printStatus(int indent, std::ostream& outs);
+
+  private:
+    uint8_t appendReg(uint32_t key, Context *ctx);
+    map<uint32_t, uint32_t> samplerMap;
+    GBE_CLASS(SamplerSet);
+  };
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_SAMPLER_HPP__ */
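
The binary layout documented in the comment above can be exercised with a simple round trip through a std::stringstream; a hedged sketch that assumes only the SamplerSet API shown in this header:

    #include "ir/sampler.hpp"
    #include <sstream>

    // Serialize a SamplerSet and read it back; deserializeFromBin returns 0
    // when a magic number or the trailing total_size does not match.
    static bool roundTripSamplers(gbe::ir::SamplerSet &set) {
      std::stringstream buf;
      const size_t written = set.serializeToBin(buf);

      gbe::ir::SamplerSet copy;
      const size_t read = copy.deserializeFromBin(buf);
      return written != 0 && read != 0 && copy.getDataSize() == set.getDataSize();
    }
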
diff --git a/backend/src/ir/type.cpp b/backend/src/ir/type.cpp
new file mode 100644
index 0000000..56f5c12
--- /dev/null
+++ b/backend/src/ir/type.cpp
@@ -0,0 +1,51 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file type.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "ir/type.hpp"
+
+namespace gbe {
+namespace ir {
+  std::ostream &operator<< (std::ostream &out, const Type &type) {
+    switch (type) {
+      case TYPE_BOOL: return out << "bool";
+      case TYPE_S8: return out << "int8";
+      case TYPE_U8: return out << "uint8";
+      case TYPE_S16: return out << "int16";
+      case TYPE_U16: return out << "uint16";
+      case TYPE_S32: return out << "int32";
+      case TYPE_U32: return out << "uint32";
+      case TYPE_S64: return out << "int64";
+      case TYPE_U64: return out << "uint64";
+      case TYPE_HALF: return out << "half";
+      case TYPE_FLOAT: return out << "float";
+      case TYPE_DOUBLE: return out << "double";
+      default :
+        GBE_ASSERT(0 && "Unsupported type\n");
+    };
+    return out;
+  }
+
+} /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/type.hpp b/backend/src/ir/type.hpp
new file mode 100644
index 0000000..8bfbdc8
--- /dev/null
+++ b/backend/src/ir/type.hpp
@@ -0,0 +1,97 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file type.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_TYPE_HPP__
+#define __GBE_IR_TYPE_HPP__
+
+#include "sys/platform.hpp"
+#include "ir/register.hpp"
+
+#include <ostream>
+
+namespace gbe {
+namespace ir {
+
+  /*! All types possibly supported by the instruction */
+  enum Type : uint8_t {
+    TYPE_BOOL = 0, //!< boolean value
+    TYPE_S8,       //!< signed 8-bit integer
+    TYPE_U8,       //!< unsigned 8-bit integer
+    TYPE_S16,      //!< signed 16-bit integer
+    TYPE_U16,      //!< unsigned 16-bit integer
+    TYPE_S32,      //!< signed 32-bit integer
+    TYPE_U32,      //!< unsigned 32-bit integer
+    TYPE_S64,      //!< signed 64-bit integer
+    TYPE_U64,      //!< unsigned 64-bit integer
+    TYPE_HALF,     //!< 16-bit floating point value
+    TYPE_FLOAT,    //!< 32-bit floating point value
+    TYPE_DOUBLE,   //!< 64-bit floating point value
+    TYPE_LARGE_INT //!< integer larger than 64 bits.
+  };
+
+  /*! Output a string for the type in the given stream */
+  std::ostream &operator<< (std::ostream &out, const Type &type);
+
+  /*! Get the register family for each type */
+  INLINE RegisterFamily getFamily(Type type) {
+    switch (type) {
+      case TYPE_BOOL:
+        return FAMILY_BOOL;
+      case TYPE_S8:
+      case TYPE_U8:
+        return FAMILY_BYTE;
+      case TYPE_S16:
+      case TYPE_U16:
+      case TYPE_HALF:
+        return FAMILY_WORD;
+      case TYPE_S32:
+      case TYPE_U32:
+      case TYPE_FLOAT:
+        return FAMILY_DWORD;
+      case TYPE_S64:
+      case TYPE_U64:
+      case TYPE_DOUBLE:
+        return FAMILY_QWORD;
+      default:
+        return FAMILY_DWORD;
+    };
+  }
+
+  /*! Return a type for each register family */
+  INLINE Type getType(RegisterFamily family) {
+    switch (family) {
+      case FAMILY_BOOL: return TYPE_BOOL;
+      case FAMILY_BYTE: return TYPE_U8;
+      case FAMILY_WORD: return TYPE_U16;
+      case FAMILY_DWORD: return TYPE_U32;
+      case FAMILY_QWORD: return TYPE_U64;
+    };
+    return TYPE_U32;
+  }
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_TYPE_HPP__ */
+
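
A tiny sketch of the Type/RegisterFamily mapping above. Note that the round trip deliberately loses signedness, since a register family only encodes a size:

    #include "ir/type.hpp"

    static void typeFamilyExample(void) {
      using namespace gbe::ir;
      GBE_ASSERT(getFamily(TYPE_FLOAT) == FAMILY_DWORD);
      GBE_ASSERT(getFamilySize(getFamily(TYPE_DOUBLE)) == 8);
      // TYPE_S16 maps to FAMILY_WORD, which maps back to the unsigned TYPE_U16.
      GBE_ASSERT(getType(getFamily(TYPE_S16)) == TYPE_U16);
    }
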
diff --git a/backend/src/ir/unit.cpp b/backend/src/ir/unit.cpp
new file mode 100644
index 0000000..4f9d740
--- /dev/null
+++ b/backend/src/ir/unit.cpp
@@ -0,0 +1,61 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file unit.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "ir/unit.hpp"
+#include "ir/function.hpp"
+
+namespace gbe {
+namespace ir {
+
+  Unit::Unit(PointerSize pointerSize) : pointerSize(pointerSize), valid(true) {}
+  Unit::~Unit(void) {
+    for (const auto &pair : functions) GBE_DELETE(pair.second);
+  }
+  Function *Unit::getFunction(const std::string &name) const {
+    auto it = functions.find(name);
+    if (it == functions.end())
+      return NULL;
+    return it->second;
+  }
+  Function *Unit::newFunction(const std::string &name) {
+    auto it = functions.find(name);
+    if (it != functions.end())
+      return NULL;
+    Function *fn = GBE_NEW(Function, name, *this);
+    functions[name] = fn;
+    return fn;
+  }
+  void Unit::newConstant(const char *data,
+                         const std::string &name,
+                         uint32_t size,
+                         uint32_t alignment)
+  {
+    constantSet.append(data, name, size, alignment);
+  }
+
+  std::ostream &operator<< (std::ostream &out, const Unit &unit) {
+    unit.apply([&out] (const Function &fn) { out << fn << std::endl; });
+    return out;
+  }
+} /* namespace ir */
+} /* namespace gbe */
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
new file mode 100644
index 0000000..adebd3f
--- /dev/null
+++ b/backend/src/ir/unit.hpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file unit.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_UNIT_HPP__
+#define __GBE_IR_UNIT_HPP__
+
+#include "ir/constant.hpp"
+#include "ir/register.hpp"
+#include "sys/hash_map.hpp"
+#include "sys/map.hpp"
+
+namespace gbe {
+namespace ir {
+
+  // A unit contains a set of functions
+  class Function;
+
+  /*! Complete unit of compilation. It contains a set of functions and a set of
+   *  constants the functions may refer to.
+   */
+  class Unit : public NonCopyable
+  {
+  public:
+    typedef hash_map<std::string, Function*> FunctionSet;
+    /*! Create an empty unit */
+    Unit(PointerSize pointerSize = POINTER_32_BITS);
+    /*! Release everything (*including* the function pointers) */
+    ~Unit(void);
+    /*! Get the set of functions defined in the unit */
+    const FunctionSet &getFunctionSet(void) const { return functions; }
+    /*! Retrieve the function by its name */
+    Function *getFunction(const std::string &name) const;
+    /*! Create a new function; return NULL if a function with this name already exists */
+    Function *newFunction(const std::string &name);
+    /*! Create a new constant in the constant set */
+    void newConstant(const char*, const std::string&, uint32_t size, uint32_t alignment);
+    /*! Apply the given functor on all the functions */
+    template <typename T>
+    INLINE void apply(const T &functor) const {
+      for (const auto &pair : functions) functor(*pair.second);
+    }
+    /*! Return the size of the pointers manipulated */
+    INLINE PointerSize getPointerSize(void) const { return pointerSize; }
+    /*! Return the family of registers that contain pointers */
+    INLINE RegisterFamily getPointerFamily(void) const {
+      if (this->getPointerSize() == POINTER_32_BITS)
+        return FAMILY_DWORD;
+      else
+        return FAMILY_QWORD;
+    }
+    /*! Return the constant set */
+    ConstantSet& getConstantSet(void) { return constantSet; }
+    /*! Return the constant set */
+    const ConstantSet& getConstantSet(void) const { return constantSet; }
+    void setValid(bool value) { valid = value; }
+    bool getValid() { return valid; }
+  private:
+    friend class ContextInterface; //!< Can freely modify the unit
+    hash_map<std::string, Function*> functions; //!< All the defined functions
+    ConstantSet constantSet; //!< All the constants defined in the unit
+    PointerSize pointerSize; //!< Size shared by all pointers
+    GBE_CLASS(Unit);
+    bool valid;
+  };
+
+  /*! Output the unit string in the given stream */
+  std::ostream &operator<< (std::ostream &out, const Unit &unit);
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_UNIT_HPP__ */
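
A brief usage sketch of the Unit API above (Function::getName is assumed from function.hpp, which is not part of this excerpt; the Unit destructor releases every function it owns, as unit.cpp shows):

    #include "ir/unit.hpp"
    #include "ir/function.hpp"
    #include <iostream>

    static void unitExample(void) {
      using namespace gbe::ir;
      Unit unit(POINTER_32_BITS);                      // pointers use FAMILY_DWORD
      Function *fn = unit.newFunction("kernel_main");  // NULL if the name exists
      GBE_ASSERT(fn != NULL && unit.getFunction("kernel_main") == fn);
      GBE_ASSERT(unit.newFunction("kernel_main") == NULL);
      // Apply a functor on every function of the unit.
      unit.apply([](const Function &f) { std::cout << f.getName() << std::endl; });
    }  // ~Unit() GBE_DELETEs every registered function
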
diff --git a/backend/src/ir/value.cpp b/backend/src/ir/value.cpp
new file mode 100644
index 0000000..a055bdf
--- /dev/null
+++ b/backend/src/ir/value.cpp
@@ -0,0 +1,607 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file value.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "ir/value.hpp"
+#include "ir/liveness.hpp"
+
+namespace gbe {
+namespace ir {
+
+  /*! To build the chains (i.e. basically the graph of values), we are going to
+   *  iterate on liveout definitions: for each block and for each variable
+   *  (ir::Register) alive at the end of the block (in Block::LiveOut), we are
+   *  computing the set of all possible value definitions. Using these value
+   *  definitions, we will finally transfer these sets to the successors to get
+   *  the ud / du chains
+   *
+   *  LiveOutSet contains the set of definitions for each basic block
+   */
+  class LiveOutSet
+  {
+  public:
+    LiveOutSet(Liveness &liveness, const FunctionDAG &dag);
+    ~LiveOutSet(void);
+    /*! One set per register */
+    typedef set<ValueDef*> RegDefSet;
+    /*! We have one map of liveout register per block */
+    typedef map<Register, RegDefSet*> BlockDefMap;
+    /*! All the block definitions map in the functions */
+    typedef map<const BasicBlock*, BlockDefMap*> FunctionDefMap;
+    /*! Performs the double look-up to get the set of defs per register */
+    RegDefSet &getDefSet(const BasicBlock *bb, Register reg);
+    /*! Build a UD-chain as the union of the predecessor chains */
+    void makeDefSet(DefSet &udChain, const BasicBlock &bb, Register reg);
+    /*! Fast per register definition set allocation */
+    DECL_POOL(RegDefSet, regDefSetPool);
+    /*! Fast register sets allocation */
+    DECL_POOL(BlockDefMap, blockDefMapPool);
+    FunctionDefMap defMap;    //!< All per-block data
+    Liveness &liveness;       //!< Contains LiveOut information
+    const FunctionDAG &dag;   //!< Structure we are building
+  private:
+    /*! Initialize liveOut with the instruction destination values */
+    void initializeInstructionDef(void);
+    /*! Initialize liveOut with the function argument, special and pushed
+     *  registers
+     */
+    void initializeOtherDef(void);
+    /*! Iterate to completely transfer the liveness and get the def sets */
+    void iterateLiveOut(void);
+    /*! Use custom allocators */
+    GBE_CLASS(LiveOutSet);
+  };
+
+  /*! Debug print of the liveout set */
+  std::ostream &operator<< (std::ostream &out, LiveOutSet &set);
+
+  LiveOutSet::LiveOutSet(Liveness &liveness, const FunctionDAG &dag) :
+    liveness(liveness), dag(dag)
+  {
+    this->initializeInstructionDef();
+    this->initializeOtherDef();
+    this->iterateLiveOut();
+  }
+
+  LiveOutSet::RegDefSet &LiveOutSet::getDefSet(const BasicBlock *bb, Register reg)
+  {
+    auto bbIt = defMap.find(bb);
+    GBE_ASSERT(bbIt != defMap.end());
+    auto defIt = bbIt->second->find(reg);
+    GBE_ASSERT(defIt != bbIt->second->end() && defIt->second != NULL);
+    return *defIt->second;
+  }
+
+  void LiveOutSet::makeDefSet(DefSet &udChain, const BasicBlock &bb, Register reg)
+  {
+    // Iterate over all the predecessors
+    const auto &preds = bb.getPredecessorSet();
+    for (const auto &pred : preds) {
+      if (pred->undefPhiRegs.contains(reg))
+        continue;
+      RegDefSet &predDef = this->getDefSet(pred, reg);
+      for (auto def : predDef) udChain.insert(def);
+    }
+
+    // If this is the top block we must take into account both function
+    // arguments and special registers
+    const Function &fn = bb.getParent();
+    if (fn.isEntryBlock(bb) == false) return;
+
+    // Is it a function input?
+    const FunctionArgument *arg = fn.getArg(reg);
+    const PushLocation *pushed = fn.getPushLocation(reg);
+
+    // Is it a pushed register?
+    if (pushed != NULL) {
+      ValueDef *def = const_cast<ValueDef*>(dag.getDefAddress(pushed));
+      udChain.insert(def);
+    }
+    // Is it a function argument?
+    else if (arg != NULL) {
+      ValueDef *def = const_cast<ValueDef*>(dag.getDefAddress(arg));
+      udChain.insert(def);
+    }
+    // Is it a special register?
+    else if (fn.isSpecialReg(reg) == true) {
+      ValueDef *def = const_cast<ValueDef*>(dag.getDefAddress(reg));
+      udChain.insert(def);
+    }
+  }
+
+  void LiveOutSet::initializeInstructionDef(void) {
+    const Function &fn = liveness.getFunction();
+
+    // Iterate over each block and initialize the liveOut data
+    fn.foreachBlock([&](const BasicBlock &bb) {
+      GBE_ASSERT(defMap.find(&bb) == defMap.end());
+
+      // Allocate a map of register definitions
+      auto blockDefMap = this->newBlockDefMap();
+      defMap.insert(std::make_pair(&bb, blockDefMap));
+
+      // We only consider liveout registers
+      const auto &info = this->liveness.getBlockInfo(&bb);
+      const auto &liveOut = info.liveOut;
+      for (auto reg : liveOut) {
+        GBE_ASSERT(blockDefMap->find(reg) == blockDefMap->end());
+        auto regDefSet = this->newRegDefSet();
+        blockDefMap->insert(std::make_pair(reg, regDefSet));
+      }
+
+      // Now traverse the blocks backwards and find the definition of each
+      // liveOut register
+      set<Register> defined;
+      for (auto it = --bb.end(); it != bb.end(); --it) {
+        const Instruction &insn = *it;
+        const uint32_t dstNum = insn.getDstNum();
+        for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+          const Register reg = insn.getDst(dstID);
+          // We only take the most recent definition
+          if (defined.contains(reg) == true) continue;
+          // Not in LiveOut, so does not matter
+          if (info.inLiveOut(reg) == false) continue;
+          defined.insert(reg);
+          // Insert the outgoing definition for this register
+          auto regDefSet = blockDefMap->find(reg);
+          ValueDef *def = const_cast<ValueDef*>(this->dag.getDefAddress(&insn, dstID));
+          GBE_ASSERT(regDefSet != blockDefMap->end() && def != NULL);
+          regDefSet->second->insert(def);
+        }
+      }
+    });
+  }
+
+  void LiveOutSet::initializeOtherDef(void) {
+    const Function &fn = liveness.getFunction();
+    const uint32_t argNum = fn.argNum();
+
+    // The first block must also transfer the function arguments
+    const BasicBlock &top = fn.getTopBlock();
+    const Liveness::BlockInfo &info = this->liveness.getBlockInfo(&top);
+    GBE_ASSERT(defMap.contains(&top) == true);
+    auto blockDefMap = defMap.find(&top)->second;
+
+    // Insert all the values that are not overwritten in the block and alive at
+    // the end of it
+    for (uint32_t argID = 0; argID < argNum; ++argID) {
+      const FunctionArgument &arg = fn.getArg(argID);
+      const Register reg = arg.reg;
+      // Do not transfer dead values
+      if (info.inLiveOut(reg) == false) continue;
+      // If we overwrite it, do not transfer the initial value
+      if (info.inVarKill(reg) == true) continue;
+      ValueDef *def = const_cast<ValueDef*>(this->dag.getDefAddress(&arg));
+      auto it = blockDefMap->find(reg);
+      GBE_ASSERT(it != blockDefMap->end());
+      it->second->insert(def);
+    }
+
+    // Now transfer the special registers that are not over-written
+    const uint32_t firstID = fn.getFirstSpecialReg();
+    const uint32_t specialNum = fn.getSpecialRegNum();
+    for (uint32_t regID = firstID; regID < firstID + specialNum; ++regID) {
+      const Register reg(regID);
+      // Do not transfer dead values
+      if (info.inLiveOut(reg) == false) continue;
+      // If we overwrite it, do not transfer the initial value
+      if (info.inVarKill(reg) == true) continue;
+      ValueDef *def = const_cast<ValueDef*>(this->dag.getDefAddress(reg));
+      auto it = blockDefMap->find(reg);
+      GBE_ASSERT(it != blockDefMap->end());
+      it->second->insert(def);
+    }
+
+    // Finally do the same thing with pushed registers
+    const Function::PushMap &pushMap = fn.getPushMap();
+    for (const auto &pushed : pushMap) {
+      const Register reg = pushed.first;
+      // Do not transfer dead values
+      if (info.inLiveOut(reg) == false) continue;
+      // If we overwrite it, do not transfer the initial value
+      if (info.inVarKill(reg) == true) continue;
+      ValueDef *def = const_cast<ValueDef*>(this->dag.getDefAddress(&pushed.second));
+      auto it = blockDefMap->find(reg);
+      GBE_ASSERT(it != blockDefMap->end());
+      it->second->insert(def);
+    }
+  }
+
+  void LiveOutSet::iterateLiveOut(void) {
+    bool changed = true;
+
+    while (changed) {
+      changed = false;
+
+      // Compute the union of the current liveout definitions with the previous
+      // ones. Do not take into account the killed values though
+      liveness.foreach<DF_PRED>([&](Liveness::BlockInfo &curr,
+                                    const Liveness::BlockInfo &pred)
+      {
+        const BasicBlock &bb = curr.bb;
+        const BasicBlock &pbb = pred.bb;
+        for (auto reg : curr.liveOut) {
+          if (pred.inLiveOut(reg) == false) continue;
+          if (curr.inVarKill(reg) == true) continue;
+          RegDefSet &currSet = this->getDefSet(&bb, reg);
+          RegDefSet &predSet = this->getDefSet(&pbb, reg);
+
+          // Transfer the values
+          for (auto def : predSet) {
+            if (currSet.contains(def)) continue;
+            changed = true;
+            currSet.insert(def);
+          }
+        }
+      });
+    }
+  }
+
+  LiveOutSet::~LiveOutSet(void) {
+    for (const auto pair : defMap) {
+      BlockDefMap *block = pair.second;
+      for (auto regSet : *block)
+        this->deleteRegDefSet(regSet.second);
+      this->deleteBlockDefMap(block);
+    }
+  }
+
+  std::ostream &operator<< (std::ostream &out, LiveOutSet &set) {
+    for (const auto &pair : set.defMap) {
+      // To recognize the block, just print its instructions
+      out << "Block:" << std::endl;
+      for (const auto &insn : *pair.first) out << insn << std::endl;
+
+      // Iterate over all alive registers to get their definitions
+      const LiveOutSet::BlockDefMap *defMap = pair.second;
+      if (defMap->size() > 0) out << "LiveSet:" << std::endl;
+      for (const auto &pair : *defMap) {
+        const Register reg = pair.first;
+        const LiveOutSet::RegDefSet *set = pair.second;
+        for (auto def : *set) {
+          const ValueDef::Type type = def->getType();
+          if (type == ValueDef::DEF_FN_ARG)
+            out << "%" << reg << ": " << "function input" << std::endl;
+          else if (type == ValueDef::DEF_FN_PUSHED)
+            out << "%" << reg << ": " << "pushed register" << std::endl;
+          else if (type == ValueDef::DEF_SPECIAL_REG)
+            out << "%" << reg << ": " << "special register" << std::endl;
+          else {
+            const Instruction *insn = def->getInstruction();
+            out << "%" << reg << ": " << insn << " " << *insn << std::endl;
+          }
+        }
+      }
+      out << std::endl;
+    }
+    return out;
+  }
+
+  FunctionDAG::FunctionDAG(Liveness &liveness) :
+    fn(liveness.getFunction())
+  {
+    // We first start with empty chains
+    udEmpty = this->newDefSet();
+    duEmpty = this->newUseSet();
+
+    // First create the chains and insert them in their respective maps
+    fn.foreachInstruction([this](const Instruction &insn) {
+      // sources == value uses
+      const uint32_t srcNum = insn.getSrcNum();
+      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+        ValueUse *valueUse = this->newValueUse(&insn, srcID);
+        useName.insert(std::make_pair(*valueUse, valueUse));
+        udGraph.insert(std::make_pair(*valueUse, udEmpty));
+      }
+      // destinations == value defs
+      const uint32_t dstNum = insn.getDstNum();
+      for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+        ValueDef *valueDef = this->newValueDef(&insn, dstID);
+        defName.insert(std::make_pair(*valueDef, valueDef));
+        duGraph.insert(std::make_pair(*valueDef, duEmpty));
+      }
+    });
+
+    // Function arguments are also value definitions
+    const uint32_t argNum = fn.argNum();
+    for (uint32_t argID = 0; argID < argNum; ++argID) {
+      const FunctionArgument &arg = fn.getArg(argID);
+      ValueDef *valueDef = this->newValueDef(&arg);
+      defName.insert(std::make_pair(*valueDef, valueDef));
+      duGraph.insert(std::make_pair(*valueDef, duEmpty));
+    }
+
+    // Special registers are also definitions
+    const uint32_t firstID = fn.getFirstSpecialReg();
+    const uint32_t specialNum = fn.getSpecialRegNum();
+    for (uint32_t regID = firstID; regID < firstID + specialNum; ++regID) {
+      const Register reg(regID);
+      ValueDef *valueDef = this->newValueDef(reg);
+      defName.insert(std::make_pair(*valueDef, valueDef));
+      duGraph.insert(std::make_pair(*valueDef, duEmpty));
+    }
+
+    // Pushed registers are also definitions
+    const Function::PushMap &pushMap = fn.getPushMap();
+    for (const auto &pushed : pushMap) {
+      ValueDef *valueDef = this->newValueDef(&pushed.second);
+      defName.insert(std::make_pair(*valueDef, valueDef));
+      duGraph.insert(std::make_pair(*valueDef, duEmpty));
+    }
+
+    // We create the liveOutSet to help us transfer the definitions
+    LiveOutSet liveOutSet(liveness, *this);
+
+    // Build UD chains traversing the blocks top to bottom
+    fn.foreachBlock([&](const BasicBlock &bb) {
+      // Track the allocated chains to be able to reuse them
+      map<Register, DefSet*> allocated;
+      // Some chains may not be used (i.e. they are dead). We track them to be
+      // able to deallocate them later
+      set<DefSet*> unused;
+
+      // For each instruction build the UD chains
+      const_cast<BasicBlock&>(bb).foreach([&](const Instruction &insn) {
+        // Instruction sources consume definitions
+        const uint32_t srcNum = insn.getSrcNum();
+        for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+          const Register src = insn.getSrc(srcID);
+          const ValueUse use(&insn, srcID);
+          auto ud = udGraph.find(use);
+          GBE_ASSERT(ud != udGraph.end());
+
+          // We already allocated the UD chain for this register
+          auto it = allocated.find(src);
+          if (it != allocated.end()) {
+            udGraph.erase(ud);
+            udGraph.insert(std::make_pair(use, it->second));
+            if (unused.contains(it->second))
+              unused.erase(it->second);
+          }
+          // Create a new one from the predecessor chains (upward used value)
+          else {
+            DefSet *udChain = this->newDefSet();
+            liveOutSet.makeDefSet(*udChain, bb, src);
+            allocated.insert(std::make_pair(src, udChain));
+            ud->second = udChain;
+          }
+        }
+
+        // Instruction destinations create new chains
+        const uint32_t dstNum = insn.getDstNum();
+        for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+          const Register dst = insn.getDst(dstID);
+          ValueDef *def = const_cast<ValueDef*>(this->getDefAddress(&insn, dstID));
+          DefSet *udChain = this->newDefSet();
+          udChain->insert(def);
+          unused.insert(udChain);
+          // Remove the previous definition if any
+          if (allocated.contains(dst) == true)
+            allocated.erase(dst);
+          allocated.insert(std::make_pair(dst, udChain));
+        }
+      });
+
+      // Deallocate unused chains
+      for (auto set : unused) this->deleteDefSet(set);
+    });
+
+    // Build the DU chains from the UD ones
+    fn.foreachInstruction([&](const Instruction &insn) {
+
+      // For each value definition of each source, we push back this use
+      const uint32_t srcNum = insn.getSrcNum();
+      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+        ValueUse *use = const_cast<ValueUse*>(getUseAddress(&insn, srcID));
+
+        // Find all definitions for this source
+        const auto &defs = this->getDef(&insn, srcID);
+        for (auto def : defs) {
+          auto uses = duGraph.find(*def);
+          GBE_ASSERT(uses != duGraph.end());
+          UseSet *du = uses->second;
+          if (du == duEmpty) {
+            duGraph.erase(*def);
+            du = this->newUseSet();
+            duGraph.insert(std::make_pair(*def, du));
+          }
+          du->insert(use);
+        }
+      }
+    });
+
+    // Allocate the set of uses and defs per register
+    const uint32_t regNum = fn.regNum();
+    for (uint32_t regID = 0; regID < regNum; ++regID) {
+      const Register reg(regID);
+      UseSet *useSet = GBE_NEW_NO_ARG(UseSet);
+      DefSet *defSet = GBE_NEW_NO_ARG(DefSet);
+      regUse.insert(std::make_pair(reg, useSet));
+      regDef.insert(std::make_pair(reg, defSet));
+    }
+
+    // Fill use sets (one per register)
+    for (auto &useSet : duGraph) {
+      for (auto use : *useSet.second) {
+        const Register reg = use->getRegister();
+        auto it = regUse.find(reg);
+        GBE_ASSERT(it != regUse.end() && it->second != NULL);
+        it->second->insert(use);
+      }
+    }
+
+    // Fill def sets (one per register)
+    for (auto &defSet : udGraph) {
+      for (auto def : *defSet.second) {
+        const Register reg = def->getRegister();
+        auto it = regDef.find(reg);
+        GBE_ASSERT(it != regDef.end() && it->second != NULL);
+        it->second->insert(def);
+      }
+    }
+  }
+
+/*! Helper to deallocate objects */
+#define PTR_RELEASE(TYPE, VAR) \
+  do { \
+    if (VAR && destroyed.contains(VAR) == false) { \
+      destroyed.insert(VAR); \
+      delete##TYPE(VAR); \
+    } \
+  } while (0)
+
+  FunctionDAG::~FunctionDAG(void) {
+
+    // We track the already destroyed pointers
+    set<void*> destroyed;
+
+    // Release the empty ud-chains and du-chains
+    PTR_RELEASE(DefSet, udEmpty);
+    PTR_RELEASE(UseSet, duEmpty);
+
+    // We free all the ud-chains
+    for (const auto &pair : udGraph) {
+      auto defs = pair.second;
+      if (destroyed.contains(defs)) continue;
+      for (auto def : *defs) PTR_RELEASE(ValueDef, def);
+      PTR_RELEASE(DefSet, defs);
+    }
+
+    // We free all the du-chains
+    for (const auto &pair : duGraph) {
+      auto uses = pair.second;
+      if (destroyed.contains(uses)) continue;
+      for (auto use : *uses) PTR_RELEASE(ValueUse, use);
+      PTR_RELEASE(UseSet, uses);
+    }
+
+    // Release all the use and definition sets per register
+    for (const auto &pair : regUse) GBE_SAFE_DELETE(pair.second);
+    for (const auto &pair : regDef) GBE_SAFE_DELETE(pair.second);
+  }
+#undef PTR_RELEASE
+
+  const UseSet &FunctionDAG::getUse(const ValueDef &def) const {
+    auto it = duGraph.find(def);
+    GBE_ASSERT(it != duGraph.end());
+    return *it->second;
+  }
+  const UseSet &FunctionDAG::getUse(const Instruction *insn, uint32_t dstID) const {
+    return this->getUse(ValueDef(insn, dstID));
+  }
+  const UseSet &FunctionDAG::getUse(const FunctionArgument *arg) const {
+    return this->getUse(ValueDef(arg));
+  }
+  const UseSet &FunctionDAG::getUse(const Register &reg) const {
+    return this->getUse(ValueDef(reg));
+  }
+  const DefSet &FunctionDAG::getDef(const ValueUse &use) const {
+    auto it = udGraph.find(use);
+    GBE_ASSERT(it != udGraph.end());
+    return *it->second;
+  }
+  const DefSet &FunctionDAG::getDef(const Instruction *insn, uint32_t srcID) const {
+    return this->getDef(ValueUse(insn, srcID));
+  }
+  const UseSet *FunctionDAG::getRegUse(const Register &reg) const {
+    auto it = regUse.find(reg);
+    GBE_ASSERT(it != regUse.end());
+    return it->second;
+  }
+  const DefSet *FunctionDAG::getRegDef(const Register &reg) const {
+    auto it = regDef.find(reg);
+    GBE_ASSERT(it != regDef.end());
+    return it->second;
+  }
+
+  const ValueDef *FunctionDAG::getDefAddress(const ValueDef &def) const {
+    auto it = defName.find(def);
+    GBE_ASSERT(it != defName.end() && it->second != NULL);
+    return it->second;
+  }
+  const ValueDef *FunctionDAG::getDefAddress(const PushLocation *pushed) const {
+    return this->getDefAddress(ValueDef(pushed));
+  }
+  const ValueDef *FunctionDAG::getDefAddress(const Instruction *insn, uint32_t dstID) const {
+    return this->getDefAddress(ValueDef(insn, dstID));
+  }
+  const ValueDef *FunctionDAG::getDefAddress(const FunctionArgument *arg) const {
+    return this->getDefAddress(ValueDef(arg));
+  }
+  const ValueDef *FunctionDAG::getDefAddress(const Register &reg) const {
+    return this->getDefAddress(ValueDef(reg));
+  }
+  const ValueUse *FunctionDAG::getUseAddress(const Instruction *insn, uint32_t srcID) const {
+    const ValueUse use(insn, srcID);
+    auto it = useName.find(use);
+    GBE_ASSERT(it != useName.end() && it->second != NULL);
+    return it->second;
+  }
+
+  std::ostream &operator<< (std::ostream &out, const FunctionDAG &dag) {
+    const Function &fn = dag.getFunction();
+
+    // Print all uses for the definitions and all definitions for each uses
+    fn.foreachInstruction([&](const Instruction &insn) {
+      out << &insn << ": " << insn << std::endl;
+
+      // Display the set of definition for each destination
+      const uint32_t dstNum = insn.getDstNum();
+      if (dstNum > 0) out << "USES:" << std::endl;
+      for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+        const Register reg = insn.getDst(dstID);
+        const auto &uses = dag.getUse(&insn, dstID);
+        for (auto use : uses) {
+          const Instruction *other = use->getInstruction();
+          out << "  %" << reg << " " << other << ": " << *other << std::endl;
+        }
+      }
+
+      // Display the set of definitions for each source
+      const uint32_t srcNum = insn.getSrcNum();
+      if (srcNum > 0) out << "DEFS:" << std::endl;
+      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+        const Register reg = insn.getSrc(srcID);
+        const auto &defs = dag.getDef(&insn, srcID);
+        for (auto def : defs) {
+          if (def->getType() == ValueDef::DEF_FN_PUSHED)
+            out << "  %" << reg << " # pushed register" << std::endl;
+          else if (def->getType() == ValueDef::DEF_FN_ARG)
+            out << "  %" << reg << " # function argument" << std::endl;
+          else if (def->getType() == ValueDef::DEF_SPECIAL_REG)
+            out << "  %" << reg << " # special register" << std::endl;
+          else {
+            const Instruction *other = def->getInstruction();
+            out << "  %" << reg << " " << other << ": " << *other << std::endl;
+          }
+        }
+      }
+      out << std::endl;
+    });
+
+    return out;
+  }
+
+} /* namespace ir */
+} /* namespace gbe */
+
diff --git a/backend/src/ir/value.hpp b/backend/src/ir/value.hpp
new file mode 100644
index 0000000..47b9048
--- /dev/null
+++ b/backend/src/ir/value.hpp
@@ -0,0 +1,266 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file value.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_VALUE_HPP__
+#define __GBE_IR_VALUE_HPP__
+
+#include "ir/instruction.hpp"
+#include "ir/function.hpp"
+#include "sys/set.hpp"
+#include "sys/map.hpp"
+
+namespace gbe {
+namespace ir {
+
+  // Make UD-Chain and DU-Chain computations faster and easier
+  class Liveness;
+
+  /*! A value definition is a destination of an instruction or a function
+   *  argument. Since we support multiple destinations, we also add the
+   *  destination ID.
+   */
+  class ValueDef
+  {
+  public:
+    /*! Discriminates the kind of values */
+    enum Type : uint32_t {
+      DEF_FN_ARG = 0,
+      DEF_FN_PUSHED = 1,
+      DEF_INSN_DST = 2,
+      DEF_SPECIAL_REG = 3
+    };
+    /*! Build a value from an instruction destination */
+    explicit ValueDef(const Instruction *insn, uint32_t dstID = 0u) :
+      type(DEF_INSN_DST)
+    {
+      this->data.insn = insn;
+      this->data.dstID = dstID;
+    }
+    /*! Build a value from a function argument */
+    explicit ValueDef(const FunctionArgument *arg) : type(DEF_FN_ARG) {
+      this->data.arg = arg;
+    }
+    /*! Build a value from a pushed register */
+    explicit ValueDef(const PushLocation *pushed) : type(DEF_FN_PUSHED) {
+      this->data.pushed = pushed;
+    }
+    /*! Build a value from a special register */
+    explicit ValueDef(const Register &reg) : type(DEF_SPECIAL_REG) {
+      this->data.regID = uint32_t(reg);
+    }
+    /*! Get the type of the value */
+    INLINE Type getType(void) const { return type; }
+    /*! Get the instruction (only if this is an instruction value) */
+    INLINE const Instruction *getInstruction(void) const {
+      GBE_ASSERT(type == DEF_INSN_DST);
+      return data.insn;
+    }
+    /*! Get the destination ID (only if this is an instruction value) */
+    INLINE uint32_t getDstID(void) const {
+      GBE_ASSERT(type == DEF_INSN_DST);
+      return data.dstID;
+    }
+    /*! Get the function input (only if this is a function argument) */
+    INLINE const FunctionArgument *getFunctionArgument(void) const {
+      GBE_ASSERT(type == DEF_FN_ARG);
+      return data.arg;
+    }
+    /*! Get the pushed location */
+    INLINE const PushLocation *getPushLocation(void) const {
+      GBE_ASSERT(type == DEF_FN_PUSHED);
+      return data.pushed;
+    }
+    /*! Get the special register */
+    INLINE Register getSpecialReg(void) const {
+      GBE_ASSERT(type == DEF_SPECIAL_REG);
+      return Register(data.regID);
+    }
+    /*! Retrieve the register associated to the definition */
+    INLINE Register getRegister(void) const {
+      if (type == DEF_SPECIAL_REG)
+        return Register(data.regID);
+      else if (type == DEF_FN_ARG)
+        return data.arg->reg;
+      else if (type == DEF_FN_PUSHED)
+        return data.pushed->getRegister();
+      else
+        return data.insn->getDst(data.dstID);
+    }
+
+  private:
+    /*! Instruction or function argument */
+    union Data {
+      /*! Instruction destination or ... */
+      struct {
+        const Instruction *insn; //<! Instruction itself
+        uint32_t dstID;          //<! Which destination we take into account
+      };
+      /*! Pushed value */
+      const PushLocation *pushed;
+      /*! ... function argument or ... */
+      const FunctionArgument *arg;
+      /*! ... special register */
+      uint32_t regID;
+    } data;
+    /*!< Function argument or instruction dst? */
+    Type type;
+    GBE_CLASS(ValueDef); // Use gbe allocators
+  };
+
+  /*! Compare two value definitions (used in maps) */
+  INLINE bool operator< (const ValueDef &def0, const ValueDef &def1) {
+    const ValueDef::Type type0 = def0.getType();
+    const ValueDef::Type type1 = def1.getType();
+    if (type0 != type1) return uint32_t(type0) < uint32_t(type1);
+    if (type0 == ValueDef::DEF_FN_ARG) {
+      const FunctionArgument *in0 = def0.getFunctionArgument();
+      const FunctionArgument *in1 = def1.getFunctionArgument();
+      return uintptr_t(in0) < uintptr_t(in1);
+    } else if (type0 == ValueDef::DEF_FN_PUSHED) {
+      const PushLocation *pushed0 = def0.getPushLocation();
+      const PushLocation *pushed1 = def1.getPushLocation();
+      return uintptr_t(pushed0) < uintptr_t(pushed1);
+    } else if (type0 == ValueDef::DEF_SPECIAL_REG) {
+      const Register reg0 = def0.getSpecialReg();
+      const Register reg1 = def1.getSpecialReg();
+      return uint32_t(reg0) < uint32_t(reg1);
+    } else {
+      const Instruction *insn0 = def0.getInstruction();
+      const Instruction *insn1 = def1.getInstruction();
+      if (insn0 != insn1) return uintptr_t(insn0) < uintptr_t(insn1);
+      const uint32_t dst0 = def0.getDstID();
+      const uint32_t dst1 = def1.getDstID();
+      return dst0 < dst1;
+    }
+  }
+
+  /*! A value use describes an instruction source. This is the place where a
+   *  value is used
+   */
+  class ValueUse
+  {
+  public:
+    /*! Build a value use */
+    explicit ValueUse(const Instruction *insn, uint32_t srcID = 0u) :
+      insn(insn), srcID(srcID) {}
+    /*! Get the instruction of the use */
+    const Instruction *getInstruction(void) const { return insn; }
+    /*! Get the source index for this use */
+    uint32_t getSrcID(void) const { return srcID; }
+    /*! Get the register for this use */
+    Register getRegister(void) const { return insn->getSrc(srcID); }
+  private:
+    const Instruction *insn; //!< Instruction where the value is used
+    uint32_t srcID;          //!< Index of the source in the instruction
+    GBE_CLASS(ValueUse);     // Use gbe allocators
+  };
+
+  /*! Compare two value uses (used in maps) */
+  INLINE bool operator< (const ValueUse &use0, const ValueUse &use1) {
+    const Instruction *insn0 = use0.getInstruction();
+    const Instruction *insn1 = use1.getInstruction();
+    if (insn0 != insn1) return uintptr_t(insn0) < uintptr_t(insn1);
+    const uint32_t src0 = use0.getSrcID();
+    const uint32_t src1 = use1.getSrcID();
+    return src0 < src1;
+  }
+
+  /*! All uses of a definition */
+  typedef set<ValueUse*> UseSet;
+  /*! All possible definitions for a use */
+  typedef set<ValueDef*> DefSet;
+
+  /*! Get the chains (in both directions) for the complete program. This data
+   *  structure is unfortunately way too brutal. Using std::sets all over the
+   *  place just burns a huge amount of memory. There is work to do to decrease
+   *  the memory footprint
+   */
+  class FunctionDAG : public NonCopyable
+  {
+  public:
+    /*! Build the complete DU/UD graphs for the program included in liveness */
+    FunctionDAG(Liveness &liveness);
+    /*! Free all the resources */
+    ~FunctionDAG(void);
+    /*! Get the du-chain for the definition */
+    const UseSet &getUse(const ValueDef &def) const;
+    /*! Get the du-chain for the given instruction and destination */
+    const UseSet &getUse(const Instruction *insn, uint32_t dstID) const;
+    /*! Get the du-chain for the given function input */
+    const UseSet &getUse(const FunctionArgument *arg) const;
+    /*! Get the du-chain for the given pushed location */
+    const UseSet &getUse(const PushLocation *pushed) const;
+    /*! Get the du-chain for the given special register */
+    const UseSet &getUse(const Register &reg) const;
+    /*! Get the ud-chain for the given use */
+    const DefSet &getDef(const ValueUse &use) const;
+    /*! Get the ud-chain for the instruction and source */
+    const DefSet &getDef(const Instruction *insn, uint32_t srcID) const;
+    /*! Get the pointer to the definition *as stored in the DAG* */
+    const ValueDef *getDefAddress(const ValueDef &def) const;
+    /*! Get the pointer to the definition *as stored in the DAG* */
+    const ValueDef *getDefAddress(const PushLocation *pushed) const;
+    /*! Get the pointer to the definition *as stored in the DAG* */
+    const ValueDef *getDefAddress(const Instruction *insn, uint32_t dstID) const;
+    /*! Get the pointer to the definition *as stored in the DAG* */
+    const ValueDef *getDefAddress(const FunctionArgument *input) const;
+    /*! Get the pointer to the definition *as stored in the DAG* */
+    const ValueDef *getDefAddress(const Register &reg) const;
+    /*! Get the pointer to the use *as stored in the DAG* */
+    const ValueUse *getUseAddress(const Instruction *insn, uint32_t srcID) const;
+    /*! Get the set of all uses for the register */
+    const UseSet *getRegUse(const Register &reg) const;
+    /*! Get the set of all definitions for the register */
+    const DefSet *getRegDef(const Register &reg) const;
+    /*! Get the function we have the graph for */
+    INLINE const Function &getFunction(void) const { return fn; }
+    /*! The DefSet for each definition use */
+    typedef map<ValueUse, DefSet*> UDGraph;
+    /*! The UseSet for each definition */
+    typedef map<ValueDef, UseSet*> DUGraph;
+  private:
+    UDGraph udGraph;                   //!< All the UD chains
+    DUGraph duGraph;                   //!< All the DU chains
+    DefSet *udEmpty;                   //!< Empty def set (for ud-chains with no defs)
+    UseSet *duEmpty;                   //!< Empty use set (for du-chains with no uses)
+    ValueDef *undefined;               //!< Undefined value
+    map<ValueUse, ValueUse*> useName;  //!< Get the ValueUse pointer from the value
+    map<ValueDef, ValueDef*> defName;  //!< Get the ValueDef pointer from the value
+    map<Register, UseSet*> regUse;     //!< All uses of registers
+    map<Register, DefSet*> regDef;     //!< All defs of registers
+    DECL_POOL(ValueDef, valueDefPool); //!< Fast ValueDef allocation
+    DECL_POOL(ValueUse, valueUsePool); //!< Fast ValueUse allocation
+    DECL_POOL(DefSet, udChainPool);    //!< Fast DefSet allocation
+    DECL_POOL(UseSet, duChainPool);    //!< Fast UseSet allocation
+    const Function &fn;                //!< Function we are referring to
+    GBE_CLASS(FunctionDAG);            //   Use internal allocators
+  };
+
+  /*! Pretty print of the function DAG */
+  std::ostream &operator<< (std::ostream &out, const FunctionDAG &dag);
+
+} /* namespace ir */
+} /* namespace gbe */
+
+#endif /* __GBE_IR_VALUE_HPP__ */
+
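As a hedged illustration (not part of the upstream sources), the du/ud-chain API
declared in value.hpp above could be driven roughly as below. `fn` and `insn` are
hypothetical placeholders for an already-built gbe::ir::Function and one of its
instructions, and the Liveness(Function&) constructor is assumed from
ir/liveness.hpp; only getUse/getDef and the stream operator come from this header.

  #include <iostream>
  #include "ir/liveness.hpp"
  #include "ir/value.hpp"

  static void dumpChains(gbe::ir::Function &fn, const gbe::ir::Instruction *insn) {
    using namespace gbe::ir;
    Liveness liveness(fn);       // liveness information consumed by the DAG
    FunctionDAG dag(liveness);   // builds every du-chain and ud-chain of fn

    // du-chain: every use of the first destination of insn
    for (const ValueUse *use : dag.getUse(insn, 0))
      std::cout << "used as src " << use->getSrcID()
                << " of " << use->getInstruction() << std::endl;

    // ud-chain: every definition that may reach the first source of insn
    for (const ValueDef *def : dag.getDef(insn, 0)) {
      if (def->getType() == ValueDef::DEF_INSN_DST)
        std::cout << "defined by " << def->getInstruction() << std::endl;
    }

    // Whole-function dump using the pretty printer defined in value.cpp
    std::cout << dag;
  }
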
diff --git a/backend/src/llvm/llvm_barrier_nodup.cpp b/backend/src/llvm/llvm_barrier_nodup.cpp
new file mode 100644
index 0000000..791df00
--- /dev/null
+++ b/backend/src/llvm/llvm_barrier_nodup.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * \file llvm_barrier_nodup.cpp
+ *
+ *  This pass removes or adds the noduplicate function attribute for barrier functions.
+ *  Basically, we want to set NoDuplicate on the __gen_barrier_xxx functions. But if
+ *  a sub-function calls those barrier functions, that sub-function will not be inlined
+ *  by llvm's inlining pass, which is not what we want. As inlining such a function into
+ *  the caller is safe, we only want to prevent duplication of the call. So we introduce
+ *  this pass to remove the NoDuplicate function attribute before the inlining pass and
+ *  restore it afterwards.
+ *
+ */
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Attributes.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "sys/map.hpp"
+
+
+using namespace llvm;
+
+namespace gbe {
+    class BarrierNodup : public ModulePass
+    {
+    public:
+      static char ID;
+      BarrierNodup(bool nodup) :
+        ModulePass(ID), nodup(nodup) {}
+
+      void getAnalysisUsage(AnalysisUsage &AU) const {
+
+      }
+
+      virtual const char *getPassName() const {
+        return "SPIR backend: set barrier no duplicate attr";
+      }
+
+      virtual bool runOnModule(Module &M)
+      {
+        using namespace llvm;
+        bool changed = false;
+        for (auto &F : M) {
+          if (F.getName() == "__gen_ocl_barrier_local_and_global" ||
+              F.getName() == "__gen_ocl_barrier_local"            ||
+              F.getName() == "__gen_ocl_barrier_global") {
+            if (nodup) {
+              if (!F.hasFnAttribute(Attribute::NoDuplicate)) {
+                F.addFnAttr(Attribute::NoDuplicate);
+                changed = true;
+              }
+            } else {
+              if (F.hasFnAttribute(Attribute::NoDuplicate)) {
+                auto attrs = F.getAttributes();
+                F.setAttributes(attrs.removeAttribute(M.getContext(),
+                                AttributeSet::FunctionIndex,
+                                Attribute::NoDuplicate));
+                changed = true;
+              }
+            }
+          }
+        }
+
+        return changed;
+      }
+    private:
+      bool nodup;
+    };
+
+
+    ModulePass *createBarrierNodupPass(bool Nodup) {
+      return new BarrierNodup(Nodup);
+    }
+
+    char BarrierNodup::ID = 0;
+} // end namespace
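A minimal scheduling sketch for the pass above, assuming the legacy LLVM 3.x
PassManager and the createFunctionInliningPass() inliner from llvm/Transforms/IPO.h.
The wrapper function, its name, and the threshold value are illustrative only;
createBarrierNodupPass() comes from this file and is assumed to be declared in
llvm/llvm_gen_backend.hpp.

  #include "llvm/PassManager.h"
  #include "llvm/Transforms/IPO.h"
  #include "llvm/llvm_gen_backend.hpp"   // assumed to declare gbe::createBarrierNodupPass

  static void runInliningWithBarriers(llvm::Module &mod) {
    llvm::PassManager passes;
    // Drop NoDuplicate so functions calling barriers can still be inlined
    passes.add(gbe::createBarrierNodupPass(false));
    // Inline everything (threshold is an illustrative value)
    passes.add(llvm::createFunctionInliningPass(200000));
    // Restore NoDuplicate so later passes never duplicate barrier calls
    passes.add(gbe::createBarrierNodupPass(true));
    passes.run(mod);
  }
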
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
new file mode 100644
index 0000000..6cb3834
--- /dev/null
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -0,0 +1,3628 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file llvm_gen_backend.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/* Transform the LLVM IR code into Gen IR code i.e. our temporary representation
+ * for programs running on Gen.
+ *
+ * Overview
+ * ========
+ *
+ * This code is mostly inspired by the (now defunct and replaced by CppBackend)
+ * CBackend. Basically, there are two ways to transform LLVM code into machine
+ * code (or anything else):
+ * - You write a complete LLVM backend by the book. LLVM provides a lot of
+ *   useful tools to do so. This is obviously the path chosen by all CPU guys
+ *   but also by AMD and nVidia which both use the backend infrastructure to
+ *   output their own intermediate language. The good point is that you can
+ *   reuse a lot of tools (like proper PHI elimination with phi congruence and
+ *   global copy propagation a la Chaitin). Bad points are:
+ *     1/ It is a *long* journey to generate anything.
+ *     2/ More importantly, the code is hugely biased towards CPUs. Typically,
+ *        the way registers are defined does not fit the Gen register file well
+ *        (which is really more like a regular piece of memory). The same issue
+ *        applies to predicated instructions with masks, which are a bit awkward
+ *        to use with SSA. Indeed, since DAGSelection still manipulates SSA
+ *        values, anything predicated requires inserting extra sources
+ * - You write function passes to do the translation yourself. Obviously, you
+ *   reinvent the wheel. However, it is easy to do and easier to maintain
+ *   (somehow)
+ *
+ * So, the code here just traverses LLVM asm and generates our own ISA. The
+ * generated code is OK even if a global copy propagation pass is still overdue.
+ * Right now, it is pretty straightforward and simplistic in that regard
+ *
+ * About Clang and the ABI / target
+ * ================================
+ *
+ * A major question is: how did we actually generate this LLVM code from OpenCL?
+ * Well, the thing is that there is no generic target in LLVM since there are many
+ * dependencies on endianness or ABIs. Fortunately, the ptx (and nvptx for LLVM
+ * 3.2) profile is pretty well adapted to our needs since NV and Gen GPU are
+ * kind of similar, or at least they are similar enough to share the same front
+ * end.
+ *
+ * Problems
+ * ========
+ *
+ * - Several things regarding constants like ConstantExpr are not properly handled.
+ * - The ptx front end generates function calls. Since we do not support them yet,
+ *   the user needs to force the inlining of all functions. If a function call
+ *   is encountered, we just abort
+ */
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#else
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Instructions.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/InlineAsm.h"
+#else
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/InlineAsm.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/ConstantsScanner.h"
+#include "llvm/Analysis/FindUsedTypes.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
+#include "llvm/IR/Mangler.h"
+#else
+#include "llvm/Target/Mangler.h"
+#endif
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#if !defined(LLVM_VERSION_MAJOR) || (LLVM_VERSION_MINOR == 1)
+#include "llvm/Target/TargetData.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/DataLayout.h"
+#else
+#include "llvm/IR/DataLayout.h"
+#endif
+
+#if LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CFG.h"
+#else
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#endif
+
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR <= 2)
+#include "llvm/Support/InstVisitor.h"
+#elif LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/InstVisitor.h"
+#else
+#include "llvm/InstVisitor.h"
+#endif
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/SourceMgr.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "ir/context.hpp"
+#include "ir/unit.hpp"
+#include "ir/liveness.hpp"
+#include "ir/value.hpp"
+#include "sys/set.hpp"
+#include "sys/cvar.hpp"
+#include "backend/program.h"
+#include <sstream>
+
+/* Not defined for LLVM 3.0 */
+#if !defined(LLVM_VERSION_MAJOR)
+#define LLVM_VERSION_MAJOR 3
+#endif /* !defined(LLVM_VERSION_MAJOR) */
+
+#if !defined(LLVM_VERSION_MINOR)
+#define LLVM_VERSION_MINOR 0
+#endif /* !defined(LLVM_VERSION_MINOR) */
+
+#if (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR < 3)
+#error "Only LLVM 3.3 and newer are supported"
+#endif /* (LLVM_VERSION_MAJOR != 3) || (LLVM_VERSION_MINOR < 3) */
+
+using namespace llvm;
+
+namespace gbe
+{
+  /*! Gen IR manipulates only scalar types */
+  static bool isScalarType(const Type *type)
+  {
+    return type->isFloatTy()   ||
+           type->isIntegerTy() ||
+           type->isDoubleTy()  ||
+           type->isPointerTy();
+  }
+
+  /*! LLVM IR Type to Gen IR type translation */
+  static ir::Type getType(ir::Context &ctx, const Type *type)
+  {
+    GBE_ASSERT(isScalarType(type));
+    if (type->isFloatTy() == true)
+      return ir::TYPE_FLOAT;
+    if (type->isDoubleTy() == true)
+      return ir::TYPE_DOUBLE;
+    if (type->isPointerTy() == true) {
+      if (ctx.getPointerSize() == ir::POINTER_32_BITS)
+        return ir::TYPE_U32;
+      else
+        return ir::TYPE_U64;
+    }
+    GBE_ASSERT(type->isIntegerTy() == true);
+    if (type == Type::getInt1Ty(type->getContext()))
+      return ir::TYPE_BOOL;
+    if (type == Type::getInt8Ty(type->getContext()))
+      return ir::TYPE_S8;
+    if (type == Type::getInt16Ty(type->getContext()))
+      return ir::TYPE_S16;
+    if (type == Type::getInt32Ty(type->getContext()))
+      return ir::TYPE_S32;
+    if (type == Type::getInt64Ty(type->getContext()))
+      return ir::TYPE_S64;
+    return ir::TYPE_LARGE_INT;
+  }
+
+  /*! LLVM IR Type to Gen IR unsigned type translation */
+  static ir::Type getUnsignedType(ir::Context &ctx, const Type *type)
+  {
+    GBE_ASSERT(type->isIntegerTy() == true);
+    if (type == Type::getInt1Ty(type->getContext()))
+      return ir::TYPE_BOOL;
+    if (type == Type::getInt8Ty(type->getContext()))
+      return ir::TYPE_U8;
+    if (type == Type::getInt16Ty(type->getContext()))
+      return ir::TYPE_U16;
+    if (type == Type::getInt32Ty(type->getContext()))
+      return ir::TYPE_U32;
+    if (type == Type::getInt64Ty(type->getContext()))
+      return ir::TYPE_U64;
+    ctx.getUnit().setValid(false);
+    return ir::TYPE_U64;
+  }
+
+  /*! Type to register family translation */
+  static ir::RegisterFamily getFamily(ir::Context &ctx, const Type *type)
+  {
+    GBE_ASSERT(isScalarType(type) == true);
+    if (type == Type::getInt1Ty(type->getContext()))
+      return ir::FAMILY_BOOL;
+    if (type == Type::getInt8Ty(type->getContext()))
+      return ir::FAMILY_BYTE;
+    if (type == Type::getInt16Ty(type->getContext()))
+      return ir::FAMILY_WORD;
+    if (type == Type::getInt32Ty(type->getContext()) || type->isFloatTy())
+      return ir::FAMILY_DWORD;
+    if (type == Type::getInt64Ty(type->getContext()) || type->isDoubleTy())
+      return ir::FAMILY_QWORD;
+    if (type->isPointerTy())
+      return ctx.getPointerFamily();
+    ctx.getUnit().setValid(false);
+    return ir::FAMILY_BOOL;
+  }
+
+  /*! Get the number of elements to process, dealing either with a vector or a
+   *  scalar value
+   */
+  static ir::Type getVectorInfo(ir::Context &ctx, Type *llvmType, Value *value, uint32_t &elemNum, bool useUnsigned = false)
+  {
+    ir::Type type;
+    if (llvmType->isVectorTy() == true) {
+      VectorType *vectorType = cast<VectorType>(llvmType);
+      Type *elementType = vectorType->getElementType();
+      elemNum = vectorType->getNumElements();
+      if (useUnsigned)
+        type = getUnsignedType(ctx, elementType);
+      else
+        type = getType(ctx, elementType);
+    } else {
+      elemNum = 1;
+      if (useUnsigned)
+        type = getUnsignedType(ctx, llvmType);
+      else
+        type = getType(ctx, llvmType);
+    }
+    return type;
+  }
+
+  /*! OCL to Gen-IR address type */
+  static INLINE ir::AddressSpace addressSpaceLLVMToGen(unsigned llvmMemSpace) {
+    switch (llvmMemSpace) {
+      case 0: return ir::MEM_PRIVATE;
+      case 1: return ir::MEM_GLOBAL;
+      case 2: return ir::MEM_CONSTANT;
+      case 3: return ir::MEM_LOCAL;
+      case 4: return ir::IMAGE;
+    }
+    GBE_ASSERT(false);
+    return ir::MEM_GLOBAL;
+  }
+
+  static Constant *extractConstantElem(Constant *CPV, uint32_t index) {
+    ConstantVector *CV = dyn_cast<ConstantVector>(CPV);
+    GBE_ASSERT(CV != NULL);
+#if GBE_DEBUG
+    const uint32_t elemNum = CV->getNumOperands();
+    GBE_ASSERTM(index < elemNum, "Out-of-bound constant vector access");
+#endif /* GBE_DEBUG */
+    CPV = cast<Constant>(CV->getOperand(index));
+    return CPV;
+  }
+
+  /*! Handle the LLVM IR Value to Gen IR register translation. This has 2 roles:
+   *  - Split the LLVM vector into several scalar values
+   *  - Handle the transparent copies (bitcast or use of intrinsic functions
+   *    like get_local_id / get_global_id)
+   */
+  class RegisterTranslator
+  {
+  public:
+    /*! Indices will be zero for scalar values */
+    typedef std::pair<Value*, uint32_t> ValueIndex;
+    RegisterTranslator(ir::Context &ctx) : ctx(ctx) {}
+
+    /*! Empty the maps */
+    void clear(void) {
+      valueMap.clear();
+      scalarMap.clear();
+    }
+    /*! Some values will not be allocated. For example, a bit-cast destination
+     *  like %fake = bitcast %real, or a vector insertion, since we do not have
+     *  vectors in Gen-IR
+     */
+    void newValueProxy(Value *real,
+                       Value *fake,
+                       uint32_t realIndex = 0u,
+                       uint32_t fakeIndex = 0u) {
+      const ValueIndex key(fake, fakeIndex);
+      const ValueIndex value(real, realIndex);
+      GBE_ASSERT(valueMap.find(key) == valueMap.end()); // Do not insert twice
+      valueMap[key] = value;
+    }
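+    /* Illustrative note (not from the upstream file): for an IR sequence such as
+     *   %fake = bitcast <4 x float> %real to <4 x i32>        ; names hypothetical
+     * the translator can be fed newValueProxy(real, fake, i, i) for each lane i;
+     * getScalar(fake, i) then follows valueMap down to (real, i) and returns the
+     * register already allocated for %real, so the bitcast needs no extra MOV.
+     */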
+    /*! Mostly used for the preallocated registers (lids, gids) */
+    void newScalarProxy(ir::Register reg, Value *value, uint32_t index = 0u) {
+      const ValueIndex key(value, index);
+      GBE_ASSERT(scalarMap.find(key) == scalarMap.end());
+      scalarMap[key] = reg;
+    }
+    /*! Allocate a new scalar register */
+    ir::Register newScalar(Value *value, Value *key = NULL, uint32_t index = 0u, bool uniform = false)
+    {
+      // we don't allow normal constants, but GlobalValue is a special case:
+      // it needs a register to store its address
+      GBE_ASSERT(! (isa<Constant>(value) && !isa<GlobalValue>(value)));
+      Type *type = value->getType();
+      auto typeID = type->getTypeID();
+      switch (typeID) {
+        case Type::IntegerTyID:
+        case Type::FloatTyID:
+        case Type::DoubleTyID:
+        case Type::PointerTyID:
+          GBE_ASSERT(index == 0);
+          return this->_newScalar(value, key, type, index, uniform);
+          break;
+        case Type::VectorTyID:
+        {
+          auto vectorType = cast<VectorType>(type);
+          auto elementType = vectorType->getElementType();
+          auto elementTypeID = elementType->getTypeID();
+          if (elementTypeID != Type::IntegerTyID &&
+              elementTypeID != Type::FloatTyID &&
+              elementTypeID != Type::DoubleTyID)
+            GBE_ASSERTM(false, "Vectors of elements are not supported");
+          return this->_newScalar(value, key, elementType, index, uniform);
+          break;
+        }
+        default: NOT_SUPPORTED;
+      };
+      return ir::Register();
+    }
+
+    /*! Iterate in the value map to get the final real value and index */
+    void getRealValue(Value* &value, uint32_t& index) {
+      auto end = valueMap.end();
+      for (;;) {
+        auto it = valueMap.find(std::make_pair(value, index));
+        if (it == end)
+          break;
+        else {
+          value = it->second.first;
+          index = it->second.second;
+        }
+      }
+    }
+
+    /*! Get the register from the given value at given index possibly iterating
+     *  in the value map to get the final real register
+     */
+    ir::Register getScalar(Value *value, uint32_t index = 0u) {
+      getRealValue(value, index);
+
+      const auto key = std::make_pair(value, index);
+      GBE_ASSERT(scalarMap.find(key) != scalarMap.end());
+      return scalarMap[key];
+    }
+    /*! Insert a given register at given Value position */
+    void insertRegister(const ir::Register &reg, Value *value, uint32_t index) {
+      const auto key = std::make_pair(value, index);
+      GBE_ASSERT(scalarMap.find(key) == scalarMap.end());
+      scalarMap[key] = reg;
+    }
+    /*! Says if the value exists. Otherwise, it is undefined */
+    bool valueExists(Value *value, uint32_t index) {
+      getRealValue(value, index);
+
+      const auto key = std::make_pair(value, index);
+      return scalarMap.find(key) != scalarMap.end();
+    }
+    /*! If it is an undef constant value, return true. Otherwise, return false. */
+    bool isUndefConst(Value *value, uint32_t index) {
+      getRealValue(value, index);
+
+      Constant *CPV = dyn_cast<Constant>(value);
+      if(CPV && dyn_cast<ConstantVector>(CPV))
+        CPV = extractConstantElem(CPV, index);
+      return (CPV && (isa<UndefValue>(CPV)));
+    }
+  private:
+    /*! This creates a scalar register for a Value (index is the vector index when
+     *  the value is a vector of scalars)
+     */
+    ir::Register _newScalar(Value *value, Value *key, Type *type, uint32_t index, bool uniform) {
+      const ir::RegisterFamily family = getFamily(ctx, type);
+      const ir::Register reg = ctx.reg(family, uniform);
+      key = key == NULL ? value : key;
+      this->insertRegister(reg, key, index);
+      return reg;
+    }
+    /*! Map value to ir::Register */
+    map<ValueIndex, ir::Register> scalarMap;
+    /*! Map values to values when this is only a translation (e.g. bitcast) */
+    map<ValueIndex, ValueIndex> valueMap;
+    /*! Actually allocates the registers */
+    ir::Context &ctx;
+  };
+
+  /*! Translate LLVM IR code to Gen IR code */
+  class GenWriter : public FunctionPass, public InstVisitor<GenWriter>
+  {
+    /*! Unit to compute */
+    ir::Unit &unit;
+    /*! Helper structure to compute the unit */
+    ir::Context ctx;
+    /*! Make the LLVM-to-Gen translation */
+    RegisterTranslator regTranslator;
+    /*! Map target basic block to its ir::LabelIndex */
+    map<const BasicBlock*, ir::LabelIndex> labelMap;
+    /*! Condition inversion can simplify branch code. We store here all the
+     *  compare instructions we need to invert to decrease branch complexity
+     */
+    set<const Value*> conditionSet;
+    map<const Value*, int> globalPointer;
+    /*!
+     *  <phi,phiCopy> node information for later optimization
+     */
+    map<const ir::Register, const ir::Register> phiMap;
+    /*! We visit each function twice. Once to allocate the registers and once to
+     *  emit the Gen IR instructions
+     */
+    enum Pass {
+      PASS_EMIT_REGISTERS = 0,
+      PASS_EMIT_INSTRUCTIONS = 1
+    } pass;
+
+    typedef enum {
+      CONST_INT,
+      CONST_FLOAT,
+      CONST_DOUBLE
+    } ConstTypeId;
+
+    LoopInfo *LI;
+    const Module *TheModule;
+    int btiBase;
+  public:
+    static char ID;
+    explicit GenWriter(ir::Unit &unit)
+      : FunctionPass(ID),
+        unit(unit),
+        ctx(unit),
+        regTranslator(ctx),
+        LI(0),
+        TheModule(0),
+        btiBase(BTI_RESERVED_NUM)
+    {
+      initializeLoopInfoPass(*PassRegistry::getPassRegistry());
+      pass = PASS_EMIT_REGISTERS;
+    }
+
+    virtual const char *getPassName() const { return "Gen Back-End"; }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<LoopInfo>();
+      AU.setPreservesAll();
+    }
+
+    virtual bool doInitialization(Module &M);
+    /*! helper function for parsing global constant data */
+    void getConstantData(const Constant * c, void* mem, uint32_t& offset) const;
+    void collectGlobalConstant(void) const;
+    ir::ImmediateIndex processConstantImmIndex(Constant *CPV, int32_t index = 0u);
+    const ir::Immediate &processConstantImm(Constant *CPV, int32_t index = 0u);
+
+    uint32_t incBtiBase() {
+      GBE_ASSERT(btiBase <= BTI_MAX_ID);
+      return btiBase++;
+    }
+
+    bool runOnFunction(Function &F) {
+     // Do not codegen any 'available_externally' functions at all, they have
+     // definitions outside the translation unit.
+     if (F.hasAvailableExternallyLinkage())
+       return false;
+
+      // As we inline all function calls, we can simply skip non-kernel functions
+      bool bKernel = isKernelFunction(F);
+      if(!bKernel) return false;
+
+      LI = &getAnalysis<LoopInfo>();
+      emitFunction(F);
+      phiMap.clear();
+      globalPointer.clear();
+      // Reset for next function
+      btiBase = BTI_RESERVED_NUM;
+      return false;
+    }
+
+    virtual bool doFinalization(Module &M) { return false; }
+    /*! handle global variable register allocation (local, constant space) */
+    void allocateGlobalVariableRegister(Function &F);
+    /*! gather all the loops in the function and add them to ir::Function */
+    void gatherLoopInfo(ir::Function &fn);
+    /*! Emit the complete function code and declaration */
+    void emitFunction(Function &F);
+    /*! Handle input and output function parameters */
+    void emitFunctionPrototype(Function &F);
+    /*! Emit the code for a basic block */
+    void emitBasicBlock(BasicBlock *BB);
+    /*! Each block end may require to emit MOVs for further PHIs */
+    void emitMovForPHI(BasicBlock *curr, BasicBlock *succ);
+    /*! Allocate one or several registers (if vector) for the value */
+    INLINE void newRegister(Value *value, Value *key = NULL, bool uniform = false);
+    /*! get the register for a llvm::Constant */
+    ir::Register getConstantRegister(Constant *c, uint32_t index = 0);
+    /*! get constant pointer */
+    ir::Register getConstantPointerRegister(ConstantExpr *ce, uint32_t index = 0);
+    /*! Return a valid register from an operand (can use LOADI to make one) */
+    INLINE ir::Register getRegister(Value *value, uint32_t index = 0);
+    /*! Create a new immediate from a constant */
+    ir::ImmediateIndex newImmediate(Constant *CPV, uint32_t index = 0);
+    /*! Insert a new label index when this is a scalar value */
+    INLINE void newLabelIndex(const BasicBlock *bb);
+    /*! Inspect the terminator instruction and try to see if we should invert
+     *  the value to simplify the code
+     */
+    INLINE void simplifyTerminator(BasicBlock *bb);
+    /*! Helper function to emit loads and stores */
+    template <bool isLoad, typename T> void emitLoadOrStore(T &I);
+    /*! Will try to remove MOVs due to PHI resolution */
+    void removeMOVs(const ir::Liveness &liveness, ir::Function &fn);
+    /*! Optimize phi move based on liveness information */
+    void optimizePhiCopy(ir::Liveness &liveness, ir::Function &fn);
+    /*! Will try to remove redundant LOADIs in basic blocks */
+    void removeLOADIs(const ir::Liveness &liveness, ir::Function &fn);
+    /*! To avoid the lost-copy problem, we need two values for PHI. This function
+     * creates a fake value for the copy (basically ptr+1)
+     */
+    INLINE Value *getPHICopy(Value *PHI);
+    // Currently supported instructions
+#define DECL_VISIT_FN(NAME, TYPE) \
+    void regAllocate##NAME(TYPE &I); \
+    void emit##NAME(TYPE &I); \
+    void visit##NAME(TYPE &I) { \
+      if (pass == PASS_EMIT_INSTRUCTIONS) \
+        emit##NAME(I); \
+      else \
+        regAllocate##NAME(I); \
+    }
+    DECL_VISIT_FN(BinaryOperator, Instruction);
+    DECL_VISIT_FN(CastInst, CastInst);
+    DECL_VISIT_FN(ReturnInst, ReturnInst);
+    DECL_VISIT_FN(LoadInst, LoadInst);
+    DECL_VISIT_FN(StoreInst, StoreInst);
+    DECL_VISIT_FN(CallInst, CallInst);
+    DECL_VISIT_FN(ICmpInst, ICmpInst);
+    DECL_VISIT_FN(FCmpInst, FCmpInst);
+    DECL_VISIT_FN(InsertElement, InsertElementInst);
+    DECL_VISIT_FN(ExtractElement, ExtractElementInst);
+    DECL_VISIT_FN(ShuffleVectorInst, ShuffleVectorInst);
+    DECL_VISIT_FN(SelectInst, SelectInst);
+    DECL_VISIT_FN(BranchInst, BranchInst);
+    DECL_VISIT_FN(PHINode, PHINode);
+    DECL_VISIT_FN(AllocaInst, AllocaInst);
+#undef DECL_VISIT_FN
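+    /* Illustrative expansion (not in the upstream file): DECL_VISIT_FN(LoadInst, LoadInst)
+     * above generates
+     *   void regAllocateLoadInst(LoadInst &I);
+     *   void emitLoadInst(LoadInst &I);
+     *   void visitLoadInst(LoadInst &I) {
+     *     if (pass == PASS_EMIT_INSTRUCTIONS) emitLoadInst(I);
+     *     else regAllocateLoadInst(I);
+     *   }
+     * so a single InstVisitor traversal serves both the register allocation pass
+     * and the instruction emission pass (see the Pass enum above).
+     */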
+
+    // Emit unary instructions from gen native function
+    void emitUnaryCallInst(CallInst &I, CallSite &CS, ir::Opcode opcode);
+    // Emit atomic instructions from gen native function
+    void emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode);
+
+    uint8_t appendSampler(CallSite::arg_iterator AI);
+
+    // These instructions are not supported at all
+    void visitVAArgInst(VAArgInst &I) {NOT_SUPPORTED;}
+    void visitSwitchInst(SwitchInst &I) {NOT_SUPPORTED;}
+    void visitInvokeInst(InvokeInst &I) {NOT_SUPPORTED;}
+#if LLVM_VERSION_MINOR == 0
+    void visitUnwindInst(UnwindInst &I) {NOT_SUPPORTED;}
+#endif /* LLVM_VERSION_MINOR == 0 */
+    void visitResumeInst(ResumeInst &I) {NOT_SUPPORTED;}
+    void visitInlineAsm(CallInst &I) {NOT_SUPPORTED;}
+    void visitIndirectBrInst(IndirectBrInst &I) {NOT_SUPPORTED;}
+    void visitUnreachableInst(UnreachableInst &I) {NOT_SUPPORTED;}
+    void visitGetElementPtrInst(GetElementPtrInst &I) {NOT_SUPPORTED;}
+    void visitInsertValueInst(InsertValueInst &I) {NOT_SUPPORTED;}
+    void visitExtractValueInst(ExtractValueInst &I) {NOT_SUPPORTED;}
+    template <bool isLoad, typename T> void visitLoadOrStore(T &I);
+
+    INLINE void gatherBTI(Value *pointer, ir::BTI &bti);
+    // batch vec4/8/16 load/store
+    INLINE void emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
+                  Value *llvmValue, const ir::Register ptr,
+                  const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti);
+    void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
+    private:
+      ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
+      template <typename T, typename P = T>
+      ir::ImmediateIndex processSeqConstant(ConstantDataSequential *seq,
+                                            int index, ConstTypeId tid);
+      ir::ImmediateIndex processConstantVector(ConstantVector *cv, int index);
+  };
+
+  char GenWriter::ID = 0;
+  void getSequentialData(const ConstantDataSequential *cda, void *ptr, uint32_t &offset) {
+    StringRef data = cda->getRawDataValues();
+    memcpy((char*)ptr+offset, data.data(), data.size());
+    offset += data.size();
+    return;
+  }
+
+  void GenWriter::getConstantData(const Constant * c, void* mem, uint32_t& offset) const {
+    Type * type = c->getType();
+    Type::TypeID id = type->getTypeID();
+
+    GBE_ASSERT(c);
+    if(isa<UndefValue>(c)) {
+      uint32_t size = getTypeByteSize(unit, type);
+      offset += size;
+      return;
+    } else if(isa<ConstantAggregateZero>(c)) {
+      uint32_t size = getTypeByteSize(unit, type);
+      memset((char*)mem+offset, 0, size);
+      offset += size;
+      return;
+    }
+
+    switch(id) {
+      case Type::TypeID::StructTyID:
+        {
+          const StructType * strTy = cast<StructType>(c->getType());
+          uint32_t size = 0;
+
+          for(uint32_t op=0; op < strTy->getNumElements(); op++)
+          {
+            Type* elementType = strTy->getElementType(op);
+            uint32_t align = 8 * getAlignmentByte(unit, elementType);
+            uint32_t padding = getPadding(size, align);
+            size += padding;
+            size += getTypeBitSize(unit, elementType);
+
+            offset += padding/8;
+            const Constant* sub = cast<Constant>(c->getOperand(op));
+            GBE_ASSERT(sub);
+            getConstantData(sub, mem, offset);
+          }
+          break;
+        }
+      case Type::TypeID::ArrayTyID:
+        {
+          const ConstantDataSequential *cds = dyn_cast<ConstantDataSequential>(c);
+          if(cds)
+            getSequentialData(cds, mem, offset);
+          else {
+            const ConstantArray *ca = dyn_cast<ConstantArray>(c);
+            const ArrayType *arrTy = ca->getType();
+            Type* elemTy = arrTy->getElementType();
+            uint32_t elemSize = getTypeBitSize(unit, elemTy);
+            uint32_t padding = getPadding(elemSize, 8 * getAlignmentByte(unit, elemTy));
+            padding /= 8;
+            uint32_t ops = c->getNumOperands();
+            for(uint32_t op = 0; op < ops; ++op) {
+              Constant * ca = dyn_cast<Constant>(c->getOperand(op));
+              getConstantData(ca, mem, offset);
+              offset += padding;
+            }
+          }
+          break;
+        }
+      case Type::TypeID::VectorTyID:
+        {
+          const ConstantDataSequential *cds = dyn_cast<ConstantDataSequential>(c);
+          const VectorType *vecTy = cast<VectorType>(type);
+          GBE_ASSERT(cds);
+          getSequentialData(cds, mem, offset);
+          if(vecTy->getNumElements() == 3) // OCL spec requires vec3 to be aligned as vec4
+            offset += getTypeByteSize(unit, vecTy->getElementType());
+          break;
+        }
+      case Type::TypeID::IntegerTyID:
+        {
+          const ConstantInt *ci = dyn_cast<ConstantInt>(c);
+          uint32_t size = ci->getBitWidth() / 8;
+          uint64_t data = ci->isNegative() ? ci->getSExtValue() : ci->getZExtValue();
+          memcpy((char*)mem+offset, &data, size);
+          offset += size;
+          break;
+        }
+      case Type::TypeID::FloatTyID:
+        {
+          const ConstantFP *cf = dyn_cast<ConstantFP>(c);
+          *(float *)((char*)mem + offset) = cf->getValueAPF().convertToFloat();
+          offset += sizeof(float);
+          break;
+        }
+      case Type::TypeID::DoubleTyID:
+        {
+          const ConstantFP *cf = dyn_cast<ConstantFP>(c);
+          *(double *)((char*)mem + offset) = cf->getValueAPF().convertToDouble();
+          offset += sizeof(double);
+          break;
+        }
+      default:
+        NOT_IMPLEMENTED;
+    }
+  }
+
+  void GenWriter::collectGlobalConstant(void) const {
+    const Module::GlobalListType &globalList = TheModule->getGlobalList();
+    for(auto i = globalList.begin(); i != globalList.end(); i ++) {
+      const GlobalVariable &v = *i;
+      if(!v.isConstantUsed()) continue;
+      const char *name = v.getName().data();
+      unsigned addrSpace = v.getType()->getAddressSpace();
+      if(addrSpace == ir::AddressSpace::MEM_CONSTANT) {
+        GBE_ASSERT(v.hasInitializer());
+        const Constant *c = v.getInitializer();
+        Type * type = c->getType();
+
+        uint32_t size = getTypeByteSize(unit, type);
+        void* mem = malloc(size);
+        uint32_t offset = 0;
+        getConstantData(c, mem, offset);
+        uint32_t alignment = getAlignmentByte(unit, type);
+        unit.newConstant((char *)mem, name, size, alignment);
+        free(mem);
+      }
+    }
+  }
+
+  bool GenWriter::doInitialization(Module &M) {
+    FunctionPass::doInitialization(M);
+
+    // Initialize
+    TheModule = &M;
+    collectGlobalConstant();
+    return false;
+  }
+
+  #define GET_EFFECT_DATA(_seq, _index, _tid) \
+    ((_tid == CONST_INT) ? _seq->getElementAsInteger(_index) : \
+    ((_tid == CONST_FLOAT) ? _seq->getElementAsFloat(_index) : \
+    _seq->getElementAsDouble(_index)))
+
+  // typename P is for bool only: C++ specializes vector<bool>, so &vec[0] does
+  // not yield a usable element pointer. We have to use uint8_t for bool vectors.
+  template <typename T, typename P>
+  ir::ImmediateIndex GenWriter::processSeqConstant(ConstantDataSequential *seq,
+                                                   int index, ConstTypeId tid) {
+    if (index >= 0) {
+      const T data = GET_EFFECT_DATA(seq, index, tid);
+      return ctx.newImmediate(data);
+    } else {
+      vector<P> array;
+      for(int i = 0; i < seq->getNumElements(); i++)
+        array.push_back(GET_EFFECT_DATA(seq, i, tid));
+      return ctx.newImmediate((T*)&array[0], array.size());
+    }
+  }
+
+  ir::ImmediateIndex GenWriter::processConstantVector(ConstantVector *cv, int index) {
+    if (index >= 0) {
+      Constant *c = cv->getOperand(index);
+      return processConstantImmIndex(c, -1);
+    } else {
+      vector<ir::ImmediateIndex> immVector;
+      for (uint32_t i = 0; i < cv->getNumOperands(); i++)
+        immVector.push_back(processConstantImmIndex(cv->getOperand(i)));
+      return ctx.newImmediate(immVector);
+    }
+  }
+
+  ir::ImmediateIndex GenWriter::processConstantImmIndexImpl(Constant *CPV, int32_t index)
+  {
+    GBE_ASSERT(dyn_cast<ConstantExpr>(CPV) == NULL);
+
+#if LLVM_VERSION_MINOR > 0
+    ConstantDataSequential *seq = dyn_cast<ConstantDataSequential>(CPV);
+
+    if (seq) {
+      Type *Ty = seq->getElementType();
+      if (Ty == Type::getInt1Ty(CPV->getContext())) {
+        return processSeqConstant<bool, uint8_t>(seq, index, CONST_INT);
+      } else if (Ty == Type::getInt8Ty(CPV->getContext())) {
+        return processSeqConstant<uint8_t>(seq, index, CONST_INT);
+      } else if (Ty == Type::getInt16Ty(CPV->getContext())) {
+        return processSeqConstant<uint16_t>(seq, index, CONST_INT);
+      } else if (Ty == Type::getInt32Ty(CPV->getContext())) {
+        return processSeqConstant<uint32_t>(seq, index, CONST_INT);
+      } else if (Ty == Type::getInt64Ty(CPV->getContext())) {
+        return processSeqConstant<uint64_t>(seq, index, CONST_INT);
+      } else if (Ty == Type::getFloatTy(CPV->getContext())) {
+        return processSeqConstant<float>(seq, index, CONST_FLOAT);
+      } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
+        return processSeqConstant<double>(seq, index, CONST_DOUBLE);
+      }
+    } else
+#endif /* LLVM_VERSION_MINOR > 0 */
+
+    if (dyn_cast<ConstantAggregateZero>(CPV)) {
+      Type* Ty = CPV->getType();
+      if(Ty->isVectorTy())
+        Ty = (cast<VectorType>(Ty))->getElementType();
+      if (Ty == Type::getInt1Ty(CPV->getContext())) {
+        const bool b = 0;
+        return ctx.newImmediate(b);
+      } else if (Ty == Type::getInt8Ty(CPV->getContext())) {
+        const uint8_t u8 = 0;
+        return ctx.newImmediate(u8);
+      } else if (Ty == Type::getInt16Ty(CPV->getContext())) {
+        const uint16_t u16 = 0;
+        return ctx.newImmediate(u16);
+      } else if (Ty == Type::getInt32Ty(CPV->getContext())) {
+        const uint32_t u32 = 0;
+        return ctx.newImmediate(u32);
+      } else if (Ty == Type::getInt64Ty(CPV->getContext())) {
+        const uint64_t u64 = 0;
+        return ctx.newImmediate(u64);
+      } else if (Ty == Type::getFloatTy(CPV->getContext())) {
+        const float f32 = 0;
+        return ctx.newImmediate(f32);
+      } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
+        const double f64 = 0;
+        return ctx.newImmediate(f64);
+      } else {
+        GBE_ASSERTM(false, "Unsupported aggregate zero type.");
+        return ctx.newImmediate(uint32_t(0));
+      }
+    } else {
+      if (dyn_cast<ConstantVector>(CPV))
+        return processConstantVector(dyn_cast<ConstantVector>(CPV), index);
+      GBE_ASSERTM(dyn_cast<ConstantExpr>(CPV) == NULL, "Unsupported constant expression");
+
+      // Integers
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
+        Type* Ty = CI->getType();
+        if (Ty == Type::getInt1Ty(CPV->getContext())) {
+          const bool b = CI->getZExtValue();
+          return ctx.newImmediate(b);
+        } else if (Ty == Type::getInt8Ty(CPV->getContext())) {
+          const uint8_t u8 = CI->getZExtValue();
+          return ctx.newImmediate(u8);
+        } else if (Ty == Type::getInt16Ty(CPV->getContext())) {
+          const uint16_t u16 = CI->getZExtValue();
+          return ctx.newImmediate(u16);
+        } else if (Ty == Type::getInt32Ty(CPV->getContext())) {
+          const uint32_t u32 = CI->getZExtValue();
+          return ctx.newImmediate(u32);
+        } else if (Ty == Type::getInt64Ty(CPV->getContext())) {
+          const uint64_t u64 = CI->getZExtValue();
+          return ctx.newImmediate(u64);
+        } else {
+          if (CI->getValue().getActiveBits() > 64) {
+            ctx.getUnit().setValid(false);
+            return ctx.newImmediate(uint64_t(0));
+          }
+          return ctx.newImmediate(uint64_t(CI->getZExtValue()));
+        }
+      }
+
+      // NULL pointers
+      if(isa<ConstantPointerNull>(CPV)) {
+        return ctx.newImmediate(uint32_t(0));
+      }
+
+      const Type::TypeID typeID = CPV->getType()->getTypeID();
+      if (isa<UndefValue>(CPV)) {
+        Type* Ty = CPV->getType();
+        if (Ty == Type::getInt1Ty(CPV->getContext())) return ctx.newImmediate(false);
+        if (Ty == Type::getInt8Ty(CPV->getContext())) return ctx.newImmediate((uint8_t)0);
+        if (Ty == Type::getInt16Ty(CPV->getContext())) return ctx.newImmediate((uint16_t)0);
+        if (Ty == Type::getInt32Ty(CPV->getContext())) return ctx.newImmediate((uint32_t)0);
+        if (Ty == Type::getInt64Ty(CPV->getContext())) return ctx.newImmediate((uint64_t)0);
+        if (Ty == Type::getFloatTy(CPV->getContext())) return ctx.newImmediate((float)0);
+        if (Ty == Type::getDoubleTy(CPV->getContext())) return ctx.newImmediate((double)0);
+        GBE_ASSERT(0 && "Unsupported undef value type.\n");
+      }
+
+      // Floats and doubles
+      switch (typeID) {
+        case Type::FloatTyID:
+        case Type::DoubleTyID:
+        {
+          ConstantFP *FPC = cast<ConstantFP>(CPV);
+          GBE_ASSERT(isa<UndefValue>(CPV) == false);
+
+          if (FPC->getType() == Type::getFloatTy(CPV->getContext())) {
+            const float f32 = FPC->getValueAPF().convertToFloat();
+            return ctx.newImmediate(f32);
+          } else {
+            const double f64 = FPC->getValueAPF().convertToDouble();
+            return ctx.newImmediate(f64);
+          }
+        }
+        break;
+        default:
+          GBE_ASSERTM(false, "Unsupported constant type");
+          break;
+      }
+    }
+
+    GBE_ASSERTM(false, "Unsupported constant type");
+    return ctx.newImmediate(uint64_t(0));
+  }
+
+  ir::ImmediateIndex GenWriter::processConstantImmIndex(Constant *CPV, int32_t index) {
+    if (dyn_cast<ConstantExpr>(CPV) == NULL)
+      return processConstantImmIndexImpl(CPV, index);
+
+    if (dyn_cast<ConstantExpr>(CPV)) {
+      ConstantExpr *ce = dyn_cast<ConstantExpr>(CPV);
+      ir::Type type = getType(ctx, ce->getType());
+      switch (ce->getOpcode()) {
+        default:
+          //ce->dump();
+          GBE_ASSERT(0 && "unsupported ce opcode.\n");
+        case Instruction::Trunc:
+        {
+          const ir::ImmediateIndex immIndex = processConstantImmIndex(ce->getOperand(0), -1);
+          return ctx.processImm(ir::IMM_TRUNC, immIndex, type);
+        }
+        case Instruction::BitCast:
+        {
+          const ir::ImmediateIndex immIndex = processConstantImmIndex(ce->getOperand(0), -1);
+          if (type == ir::TYPE_LARGE_INT)
+            return immIndex;
+          return ctx.processImm(ir::IMM_BITCAST, immIndex, type);
+        }
+        case Instruction::Add:
+        case Instruction::Sub:
+        case Instruction::Mul:
+        case Instruction::SDiv:
+        case Instruction::SRem:
+        case Instruction::Shl:
+        case Instruction::AShr:
+        case Instruction::LShr:
+        case Instruction::And:
+        case Instruction::Or:
+        case Instruction::Xor: {
+          const ir::ImmediateIndex lhs  = processConstantImmIndex(ce->getOperand(0), -1);
+          const ir::ImmediateIndex rhs  = processConstantImmIndex(ce->getOperand(1), -1);
+          switch (ce->getOpcode()) {
+          default:
+            //ce->dump();
+            GBE_ASSERTM(0, "Unsupported constant expression.\n");
+          case Instruction::Add:
+            return ctx.processImm(ir::IMM_ADD, lhs, rhs, type);
+          case Instruction::Sub:
+            return ctx.processImm(ir::IMM_SUB, lhs, rhs, type);
+          case Instruction::Mul:
+            return ctx.processImm(ir::IMM_MUL, lhs, rhs, type);
+          case Instruction::SDiv:
+            return ctx.processImm(ir::IMM_DIV, lhs, rhs, type);
+          case Instruction::SRem:
+            return ctx.processImm(ir::IMM_REM, lhs, rhs, type);
+          case Instruction::Shl:
+            return ctx.processImm(ir::IMM_SHL, lhs, rhs, type);
+          case Instruction::AShr:
+            return ctx.processImm(ir::IMM_ASHR, lhs, rhs, type);
+          case Instruction::LShr:
+            return ctx.processImm(ir::IMM_LSHR, lhs, rhs, type);
+          case Instruction::And:
+            return ctx.processImm(ir::IMM_AND, lhs, rhs, type);
+          case Instruction::Or:
+            return ctx.processImm(ir::IMM_OR, lhs, rhs, type);
+          case Instruction::Xor:
+            return ctx.processImm(ir::IMM_XOR, lhs, rhs, type);
+          }
+        }
+      }
+    }
+    GBE_ASSERT(0 && "unsupported constant.\n");
+    return ctx.newImmediate((uint32_t)0);
+  }
+
+  const ir::Immediate &GenWriter::processConstantImm(Constant *CPV, int32_t index) {
+    ir::ImmediateIndex immIndex = processConstantImmIndex(CPV, index);
+    return ctx.getFunction().getImmediate(immIndex);
+  }
+
+  ir::ImmediateIndex GenWriter::newImmediate(Constant *CPV, uint32_t index) {
+    return processConstantImmIndex(CPV, index);
+  }
+
+  void GenWriter::newRegister(Value *value, Value *key, bool uniform) {
+    auto type = value->getType();
+    auto typeID = type->getTypeID();
+    switch (typeID) {
+      case Type::IntegerTyID:
+      case Type::FloatTyID:
+      case Type::DoubleTyID:
+      case Type::PointerTyID:
+        regTranslator.newScalar(value, key, 0, uniform);
+        break;
+      case Type::VectorTyID:
+      {
+        auto vectorType = cast<VectorType>(type);
+        const uint32_t elemNum = vectorType->getNumElements();
+        for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+          regTranslator.newScalar(value, key, elemID, uniform);
+        break;
+      }
+      default: NOT_SUPPORTED;
+    };
+  }
+
+  ir::Register GenWriter::getConstantPointerRegister(ConstantExpr *expr, uint32_t elemID) {
+    Value* val = expr->getOperand(0);
+
+    if (expr->isCast()) {
+      ir::Register pointer_reg;
+      if(isa<ConstantExpr>(val)) {
+        // try to get the real pointer register, for cases like:
+        //   store i64 ptrtoint (i8 addrspace(3)* getelementptr inbounds ...
+        // in which both ptrtoint and getelementptr are ConstantExpr.
+        pointer_reg = getConstantPointerRegister(dyn_cast<ConstantExpr>(val), elemID);
+      } else {
+        pointer_reg = regTranslator.getScalar(val, elemID);
+      }
+      // if ptrtoint requests a type other than 32-bit, convert as requested
+      ir::Type dstType = getType(ctx, expr->getType());
+      ir::Type srcType = getType(ctx, val->getType());
+      if(srcType != dstType && dstType != ir::TYPE_S32) {
+        ir::Register tmp = ctx.reg(getFamily(dstType));
+        ctx.CVT(dstType, srcType, tmp, pointer_reg);
+        return tmp;
+      }
+      return pointer_reg;
+    }
+    else if (expr->getOpcode() == Instruction::GetElementPtr) {
+      uint32_t TypeIndex;
+      uint32_t constantOffset = 0;
+
+      Value *pointer = val;
+      CompositeType* CompTy = cast<CompositeType>(pointer->getType());
+      for(uint32_t op=1; op<expr->getNumOperands(); ++op) {
+        uint32_t offset = 0;
+        ConstantInt* ConstOP = dyn_cast<ConstantInt>(expr->getOperand(op));
+        GBE_ASSERT(ConstOP);
+        TypeIndex = ConstOP->getZExtValue();
+        if (op == 1) {
+          if (TypeIndex != 0) {
+            Type *elementType = (cast<PointerType>(pointer->getType()))->getElementType();
+            uint32_t elementSize = getTypeByteSize(unit, elementType);
+            uint32_t align = getAlignmentByte(unit, elementType);
+            elementSize += getPadding(elementSize, align);
+            offset += elementSize * TypeIndex;
+          }
+        } else {
+          for(uint32_t ty_i=0; ty_i<TypeIndex; ty_i++)
+          {
+            Type* elementType = CompTy->getTypeAtIndex(ty_i);
+            uint32_t align = getAlignmentByte(unit, elementType);
+            offset += getPadding(offset, align);
+            offset += getTypeByteSize(unit, elementType);
+          }
+          const uint32_t align = getAlignmentByte(unit, CompTy->getTypeAtIndex(TypeIndex));
+          offset += getPadding(offset, align);
+        }
+
+        constantOffset += offset;
+        CompTy = dyn_cast<CompositeType>(CompTy->getTypeAtIndex(TypeIndex));
+      }
+
+      ir::Register pointer_reg;
+      if(isa<ConstantExpr>(pointer))
+        pointer_reg = getConstantPointerRegister(dyn_cast<ConstantExpr>(pointer), elemID);
+      else
+        pointer_reg = regTranslator.getScalar(pointer, elemID);
+
+      ir::Register offset_reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
+      ctx.LOADI(ir::Type::TYPE_S32, offset_reg, ctx.newIntegerImmediate(constantOffset, ir::Type::TYPE_S32));
+      ir::Register reg = ctx.reg(ir::RegisterFamily::FAMILY_DWORD);
+      ctx.ADD(ir::Type::TYPE_S32, reg, pointer_reg, offset_reg);
+      return reg;
+    }
+    else
+      assert(0);
+  }
+
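+  /*! Summary of the cases handled below: global values keep their
+   *  pre-allocated register, undefined values are materialized as zero,
+   *  constants derived from a pointer go through getConstantPointerRegister,
+   *  and anything else becomes an immediate loaded with LOADI.
+   */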
+  ir::Register GenWriter::getConstantRegister(Constant *c, uint32_t elemID) {
+    GBE_ASSERT(c != NULL);
+    if(isa<GlobalValue>(c)) {
+      return regTranslator.getScalar(c, elemID);
+    }
+    if(isa<UndefValue>(c)) {
+      Type* llvmType = c->getType();
+      ir::Type dstType = getType(ctx, llvmType);
+      ir::Register reg = ctx.reg(getFamily(dstType));
+
+      ir::ImmediateIndex immIndex;
+      if(llvmType->isIntegerTy())
+        immIndex = ctx.newIntegerImmediate(0, dstType);
+      else if(llvmType->isFloatTy()) {
+        immIndex = ctx.newFloatImmediate((float)0.0);
+      } else {
+        immIndex = ctx.newDoubleImmediate((double)0.0);
+      }
+      ctx.LOADI(dstType, reg, immIndex);
+      return reg;
+    }
+
+    if(isa<ConstantExpr>(c)) {
+      // Check whether this is a constant derived from a pointer.
+      Constant *itC = c;
+      while(isa<ConstantExpr>(itC))
+        itC = dyn_cast<ConstantExpr>(itC)->getOperand(0);
+      if (itC->getType()->isPointerTy())
+        return getConstantPointerRegister(dyn_cast<ConstantExpr>(c), elemID);
+    }
+
+    const ir::ImmediateIndex immIndex = this->newImmediate(c, elemID);
+    const ir::Immediate imm = ctx.getImmediate(immIndex);
+    const ir::Register reg = ctx.reg(getFamily(imm.getType()));
+    ctx.LOADI(imm.getType(), reg, immIndex);
+    return reg;
+  }
+
+  ir::Register GenWriter::getRegister(Value *value, uint32_t elemID) {
+    // the real value may be a constant, so get the real value before the constant check
+    regTranslator.getRealValue(value, elemID);
+    if(isa<Constant>(value)) {
+      Constant *c = dyn_cast<Constant>(value);
+      return getConstantRegister(c, elemID);
+    } else
+      return regTranslator.getScalar(value, elemID);
+  }
+
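+  /*! The PHI "copy" is keyed by the PHI pointer plus one: this yields a
+   *  second, distinct Value key for the same node, which regAllocatePHINode
+   *  maps to its own register to avoid the lost-copy problem.
+   */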
+  INLINE Value *GenWriter::getPHICopy(Value *PHI) {
+    const uintptr_t ptr = (uintptr_t) PHI;
+    return (Value*) (ptr+1);
+  }
+
+  void GenWriter::newLabelIndex(const BasicBlock *bb) {
+    if (labelMap.find(bb) == labelMap.end()) {
+      const ir::LabelIndex label = ctx.label();
+      labelMap[bb] = label;
+    }
+  }
+
+  void GenWriter::simplifyTerminator(BasicBlock *bb) {
+    Value *value = --bb->end();
+    BranchInst *I = NULL;
+    if ((I = dyn_cast<BranchInst>(value)) != NULL) {
+      if (I->isConditional() == false)
+        return;
+      // If the "taken" successor is the next block, we try to invert the
+      // branch.
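+      // For example, with "br i1 %cmp, label %next, label %other" where %next
+      // is the fall-through block, we record %cmp in conditionSet so the
+      // comparison is emitted inverted and a single BRA to %other suffices.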
+      BasicBlock *succ = I->getSuccessor(0);
+      if (std::next(Function::iterator(bb)) != Function::iterator(succ))
+        return;
+
+      // More than one use is too complicated: we skip it
+      Value *condition = I->getCondition();
+      if (condition->hasOneUse() == false)
+        return;
+
+      // Right now, we only invert comparison instructions
+      ICmpInst *CI = dyn_cast<ICmpInst>(condition);
+      if (CI != NULL) {
+        GBE_ASSERT(conditionSet.find(CI) == conditionSet.end());
+        conditionSet.insert(CI);
+        return;
+      }
+    }
+  }
+
+  void GenWriter::emitBasicBlock(BasicBlock *BB) {
+    GBE_ASSERT(labelMap.find(BB) != labelMap.end());
+    ctx.LABEL(labelMap[BB]);
+    for (auto II = BB->begin(), E = BB->end(); II != E; ++II) visit(*II);
+  }
+
+  void GenWriter::emitMovForPHI(BasicBlock *curr, BasicBlock *succ) {
+    for (BasicBlock::iterator I = succ->begin(); isa<PHINode>(I); ++I) {
+      PHINode *PN = cast<PHINode>(I);
+      Value *IV = PN->getIncomingValueForBlock(curr);
+      Type *llvmType = PN->getType();
+      const ir::Type type = getType(ctx, llvmType);
+      Value *PHICopy = this->getPHICopy(PN);
+      const ir::Register dst = this->getRegister(PHICopy);
+      if (!isa<UndefValue>(IV)) {
+
+        // Emit the MOV required by the PHI function. We keep it simple and do
+        // not try to optimize it here; a later data flow analysis pass on the
+        // Gen IR will remove redundant MOVs
+        Constant *CP = dyn_cast<Constant>(IV);
+        if (CP) {
+          GBE_ASSERT(isa<GlobalValue>(CP) == false);
+          ConstantVector *CPV = dyn_cast<ConstantVector>(CP);
+          if (CPV && dyn_cast<ConstantVector>(CPV) &&
+              isa<UndefValue>(extractConstantElem(CPV, 0)))
+            continue;
+          ctx.MOV(type, dst, getRegister(CP));
+        } else if (regTranslator.valueExists(IV,0) || dyn_cast<Constant>(IV)) {
+          const ir::Register src = this->getRegister(IV);
+          ctx.MOV(type, dst, src);
+        }
+        assert(!ctx.getBlock()->undefPhiRegs.contains(dst));
+        ctx.getBlock()->definedPhiRegs.insert(dst);
+      } else {
+        // If this is an undefined value, we don't need to emit a phi copy here,
+        // but we do need to record it. Later, during the backward liveness
+        // analysis, the phi value/register must not be passed to a BB in which
+        // the phi value is undefined. Otherwise, the phi value's liveness would
+        // be extended incorrectly, possibly up to basic block zero, which is really bad.
+        ctx.getBlock()->undefPhiRegs.insert(dst);
+      }
+    }
+  }
+
+  void GenWriter::emitFunctionPrototype(Function &F)
+  {
+    GBE_ASSERTM(F.hasStructRetAttr() == false,
+                "Returned value for kernel functions is forbidden");
+
+    // Loop over the kernel metadata to set the required work group size.
+    NamedMDNode *clKernelMetaDatas = TheModule->getNamedMetadata("opencl.kernels");
+    size_t reqd_wg_sz[3] = {0, 0, 0};
+    size_t hint_wg_sz[3] = {0, 0, 0};
+    ir::FunctionArgument::InfoFromLLVM llvmInfo;
+    MDNode *node = NULL;
+    MDNode *addrSpaceNode = NULL;
+    MDNode *typeNameNode = NULL;
+    MDNode *accessQualNode = NULL;
+    MDNode *typeQualNode = NULL;
+    MDNode *argNameNode = NULL;
+
+    std::string functionAttributes;
+
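+    /* functionAttributes collects the kernel attributes as a printable string,
+       e.g. "reqd_work_group_size(16,1,1) " for a kernel declared with
+       __attribute__((reqd_work_group_size(16,1,1))). */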
+    /* First find the metadata belonging to this function. */
+    for(uint i = 0; i < clKernelMetaDatas->getNumOperands(); i++) {
+      node = clKernelMetaDatas->getOperand(i);
+      if (node->getOperand(0) == &F) break;
+      node = NULL;
+    }
+
+    /* Because of "-cl-kernel-arg-info", there should always be metadata. */
+    if (!F.arg_empty())
+      assert(node);
+
+
+    for(uint j = 0; j < node->getNumOperands() - 1; j++) {
+      MDNode *attrNode = dyn_cast_or_null<MDNode>(node->getOperand(1 + j));
+      if (attrNode == NULL) break;
+      MDString *attrName = dyn_cast_or_null<MDString>(attrNode->getOperand(0));
+      if (!attrName) continue;
+
+      if (attrName->getString() == "reqd_work_group_size") {
+        GBE_ASSERT(attrNode->getNumOperands() == 4);
+        ConstantInt *x = dyn_cast<ConstantInt>(attrNode->getOperand(1));
+        ConstantInt *y = dyn_cast<ConstantInt>(attrNode->getOperand(2));
+        ConstantInt *z = dyn_cast<ConstantInt>(attrNode->getOperand(3));
+        GBE_ASSERT(x && y && z);
+        reqd_wg_sz[0] = x->getZExtValue();
+        reqd_wg_sz[1] = y->getZExtValue();
+        reqd_wg_sz[2] = z->getZExtValue();
+        functionAttributes += attrName->getString();
+        std::stringstream param;
+        char buffer[100];
+        param <<"(";
+        param << reqd_wg_sz[0];
+        param << ",";
+        param << reqd_wg_sz[1];
+        param << ",";
+        param << reqd_wg_sz[2];
+        param <<")";
+        param >> buffer;
+        functionAttributes += buffer;
+        functionAttributes += " ";
+        break;
+      } else if (attrName->getString() == "kernel_arg_addr_space") {
+        addrSpaceNode = attrNode;
+      } else if (attrName->getString() == "kernel_arg_access_qual") {
+        accessQualNode = attrNode;
+      } else if (attrName->getString() == "kernel_arg_type") {
+        typeNameNode = attrNode;
+      } else if (attrName->getString() == "kernel_arg_type_qual") {
+        typeQualNode = attrNode;
+      } else if (attrName->getString() == "kernel_arg_name") {
+        argNameNode = attrNode;
+      } else if (attrName->getString() == "vec_type_hint") {
+        GBE_ASSERT(attrNode->getNumOperands() == 3);
+        functionAttributes += attrName->getString();
+        functionAttributes += " ";
+      } else if (attrName->getString() == "work_group_size_hint") {
+        GBE_ASSERT(attrNode->getNumOperands() == 4);
+        ConstantInt *x = dyn_cast<ConstantInt>(attrNode->getOperand(1));
+        ConstantInt *y = dyn_cast<ConstantInt>(attrNode->getOperand(2));
+        ConstantInt *z = dyn_cast<ConstantInt>(attrNode->getOperand(3));
+        GBE_ASSERT(x && y && z);
+        hint_wg_sz[0] = x->getZExtValue();
+        hint_wg_sz[1] = y->getZExtValue();
+        hint_wg_sz[2] = z->getZExtValue();
+        functionAttributes += attrName->getString();
+        std::stringstream param;
+        char buffer[100];
+        param <<"(";
+        param << hint_wg_sz[0];
+        param << ",";
+        param << hint_wg_sz[1];
+        param << ",";
+        param << hint_wg_sz[2];
+        param <<")";
+        param >> buffer;
+        functionAttributes += buffer;
+        functionAttributes += " ";
+      }
+    }
+    ctx.appendSurface(1, ir::ocl::stackbuffer);
+
+    ctx.getFunction().setCompileWorkGroupSize(reqd_wg_sz[0], reqd_wg_sz[1], reqd_wg_sz[2]);
+
+    ctx.getFunction().setFunctionAttributes(functionAttributes);
+    // Loop over the arguments and output registers for them
+    if (!F.arg_empty()) {
+      uint32_t argID = 0;
+      Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
+
+      // Insert a new register for each function argument
+#if LLVM_VERSION_MINOR <= 1
+      const AttrListPtr &PAL = F.getAttributes();
+#endif /* LLVM_VERSION_MINOR <= 1 */
+      for (; I != E; ++I, ++argID) {
+        const std::string &argName = I->getName().str();
+        Type *type = I->getType();
+
+        llvmInfo.addrSpace = (cast<ConstantInt>(addrSpaceNode->getOperand(1 + argID)))->getZExtValue();
+        llvmInfo.typeName = (cast<MDString>(typeNameNode->getOperand(1 + argID)))->getString();
+        if (llvmInfo.typeName.find("image") != std::string::npos &&
+            llvmInfo.typeName.find("*") != std::string::npos) {
+          uint32_t start = llvmInfo.typeName.find("image");
+          uint32_t end = llvmInfo.typeName.find("*");
+          llvmInfo.typeName = llvmInfo.typeName.substr(start, end - start);
+        }
+        llvmInfo.accessQual = (cast<MDString>(accessQualNode->getOperand(1 + argID)))->getString();
+        llvmInfo.typeQual = (cast<MDString>(typeQualNode->getOperand(1 + argID)))->getString();
+        llvmInfo.argName = (cast<MDString>(argNameNode->getOperand(1 + argID)))->getString();
+
+        // function arguments are uniform values.
+        this->newRegister(I, NULL, true);
+        // Add support for vector arguments.
+        if(type->isVectorTy()) {
+          VectorType *vectorType = cast<VectorType>(type);
+          ir::Register reg = getRegister(I, 0);
+          Type *elemType = vectorType->getElementType();
+          const uint32_t elemSize = getTypeByteSize(unit, elemType);
+          const uint32_t elemNum = vectorType->getNumElements();
+          // a vector's element type is always a scalar type
+          ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, elemNum*elemSize, getAlignmentByte(unit, type), 0);
+
+          ir::Function& fn = ctx.getFunction();
+          for(uint32_t i=1; i < elemNum; i++) {
+            ir::PushLocation argLocation(fn, argID, elemSize*i);
+            reg = getRegister(I, i);
+            ctx.appendPushedConstant(reg, argLocation);  //add to push map for reg alloc
+          }
+          continue;
+        }
+
+        GBE_ASSERTM(isScalarType(type) == true,
+                    "vector type in the function argument is not supported yet");
+        const ir::Register reg = getRegister(I);
+        if (type->isPointerTy() == false)
+          ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
+        else {
+          PointerType *pointerType = dyn_cast<PointerType>(type);
+          Type *pointed = pointerType->getElementType();
+          // By value structure
+#if LLVM_VERSION_MINOR <= 1
+          if (PAL.paramHasAttr(argID+1, Attribute::ByVal)) {
+#else
+          if (I->hasByValAttr()) {
+#endif /* LLVM_VERSION_MINOR <= 1 */
+            const size_t structSize = getTypeByteSize(unit, pointed);
+            ctx.input(argName, ir::FunctionArgument::STRUCTURE, reg, llvmInfo, structSize, getAlignmentByte(unit, type), 0);
+          }
+          // Regular user provided pointer (global, local or constant)
+          else {
+            const uint32_t addr = pointerType->getAddressSpace();
+            const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(addr);
+            const uint32_t ptrSize = getTypeByteSize(unit, type);
+            const uint32_t align = getAlignmentByte(unit, pointed);
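+            // As the cases below show: each global pointer argument gets its
+            // own binding table index (btiBase), local pointers use the 0xfe
+            // slot, constant pointers use surface 2, and image arguments
+            // register a BTI in the image set.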
+              switch (addrSpace) {
+              case ir::MEM_GLOBAL:
+                globalPointer.insert(std::make_pair(I, btiBase));
+                ctx.appendSurface(btiBase, reg);
+                ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align, btiBase);
+                incBtiBase();
+              break;
+              case ir::MEM_LOCAL:
+                ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg,  llvmInfo, ptrSize, align, 0xfe);
+                ctx.getFunction().setUseSLM(true);
+              break;
+              case ir::MEM_CONSTANT:
+                ctx.input(argName, ir::FunctionArgument::CONSTANT_POINTER, reg,  llvmInfo, ptrSize, align, 0x2);
+              break;
+              case ir::IMAGE:
+                ctx.input(argName, ir::FunctionArgument::IMAGE, reg, llvmInfo, ptrSize, align, 0x0);
+                ctx.getFunction().getImageSet()->append(reg, &ctx, incBtiBase());
+              break;
+              default: GBE_ASSERT(addrSpace != ir::MEM_PRIVATE);
+            }
+          }
+        }
+      }
+    }
+
+    // When returning a structure, the first input register is the pointer to
+    // the structure
+#if GBE_DEBUG
+    const Type *type = F.getReturnType();
+    GBE_ASSERTM(type->isVoidTy() == true,
+                "Returned value for kernel functions is forbidden");
+
+    // Variable number of arguments is not supported
+    FunctionType *FT = cast<FunctionType>(F.getFunctionType());
+    GBE_ASSERT(FT->isVarArg() == false);
+#endif /* GBE_DEBUG */
+  }
+
+  static inline bool isFPIntBitCast(const Instruction &I) {
+    if (!isa<BitCastInst>(I))
+      return false;
+    Type *SrcTy = I.getOperand(0)->getType();
+    Type *DstTy = I.getType();
+    return (SrcTy->isFloatingPointTy() && DstTy->isIntegerTy()) ||
+           (DstTy->isFloatingPointTy() && SrcTy->isIntegerTy());
+  }
+
+  /*! To track last read and write of the registers */
+  struct RegInfoForMov {
+    ir::Instruction *lastWriteInsn;
+    ir::Instruction *lastReadInsn;
+    uint32_t lastWrite;
+    uint32_t lastRead;
+  };
+
+  /*! Replace register "from" by register "to" in the destination(s) */
+  static void replaceDst(ir::Instruction *insn, ir::Register from, ir::Register to) {
+    const uint32_t dstNum = insn->getDstNum();
+    for (uint32_t dstID = 0; dstID < dstNum; ++dstID)
+      if (insn->getDst(dstID) == from)
+        insn->setDst(dstID, to);
+  }
+
+  /*! Replace register "from" by register "to" in the source(s) */
+  static void replaceSrc(ir::Instruction *insn, ir::Register from, ir::Register to) {
+    const uint32_t srcNum = insn->getSrcNum();
+    for (uint32_t srcID = 0; srcID < srcNum; ++srcID)
+      if (insn->getSrc(srcID) == from)
+        insn->setSrc(srcID, to);
+  }
+
+  /*! lastUse maintains data about last uses (reads/writes) for each
+   * ir::Register
+   */
+  static void buildRegInfo(ir::BasicBlock &bb, vector<RegInfoForMov> &lastUse)
+  {
+    // Clear the register usages
+    for (auto &x : lastUse) {
+      x.lastWrite = x.lastRead = 0;
+      x.lastWriteInsn = x.lastReadInsn = NULL;
+    }
+
+    // Find use intervals for all registers (distinguish sources and
+    // destinations)
+    uint32_t insnID = 2;
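+    // Instructions are numbered in steps of two: sources are stamped with
+    // insnID and destinations with insnID+1, so within a single instruction
+    // the write is ordered after the reads.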
+    bb.foreach([&](ir::Instruction &insn) {
+      const uint32_t dstNum = insn.getDstNum();
+      const uint32_t srcNum = insn.getSrcNum();
+      for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+        const ir::Register reg = insn.getSrc(srcID);
+        lastUse[reg].lastRead = insnID;
+        lastUse[reg].lastReadInsn = &insn;
+      }
+      for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+        const ir::Register reg = insn.getDst(dstID);
+        lastUse[reg].lastWrite = insnID+1;
+        lastUse[reg].lastWriteInsn = &insn;
+      }
+      insnID+=2;
+    });
+  }
+
+  void GenWriter::optimizePhiCopy(ir::Liveness &liveness, ir::Function &fn)
+  {
+    // The overall idea behind is we check whether there is any interference
+    // between phi and phiCopy live range. If there is no point that
+    // phi & phiCopy are both alive, then we can optimize off the move
+    // from phiCopy to phi, and use phiCopy directly instead of phi.
+    using namespace ir;
+    ir::FunctionDAG *dag = new ir::FunctionDAG(liveness);
+
+    for (auto &it : phiMap) {
+      const Register phi = it.first;
+      const Register phiCopy = it.second;
+
+      const ir::DefSet *phiCopyDef = dag->getRegDef(phiCopy);
+      const ir::UseSet *phiUse = dag->getRegUse(phi);
+      const DefSet *phiDef = dag->getRegDef(phi);
+      bool isOpt = true;
+      for (auto &x : *phiCopyDef) {
+        const ir::Instruction * phiCopyDefInsn = x->getInstruction();
+        const ir::BasicBlock *bb = phiCopyDefInsn->getParent();
+        const Liveness::LiveOut &out = liveness.getLiveOut(bb);
+        // phi & phiCopy are both alive at the endpoint of bb,
+        // thus can not be optimized.
+        if (out.contains(phi)) {
+          isOpt = false;
+          break;
+        }
+        // If phi is used in the same BB that defines the phiCopy, we need to
+        // check the liveness of phi & phiCopy carefully and make sure their
+        // live ranges do not interfere.
+        bool phiUsedInSameBB = false;
+        for (auto &y : *phiUse) {
+          const ir::Instruction *phiUseInsn = y->getInstruction();
+          const ir::BasicBlock *bb2 = phiUseInsn->getParent();
+          if (bb2 == bb) {
+            phiUsedInSameBB = true;
+          }
+        }
+        // Check that phi is not used between the phiCopy def point and the end
+        // of bb. This is the classic 'phi swap issue', just like below:
+        //   MOV phiCopy_1, x;
+        //   MOV phiCopy_2, phi_1;
+        // Replacing phi_1 by phiCopy_1 here would make the second MOV read the
+        // value just written by the first one.
+        if (phiUsedInSameBB ) {
+          for (auto it = --bb->end(); it != bb->end() ; --it) {
+            const Instruction &p = *it;
+
+            if (&p == phiCopyDefInsn) break;
+            // we only care about MOVs here
+            if (p.getSrcNum() == 1 && p.getSrc(0) == phi) {
+              isOpt = false;
+              break;
+            }
+          }
+        }
+      }
+
+      // [MOV phi, phiCopy;] can be removed. So we remove it
+      // and replace phi uses with phiCopy
+      if (isOpt) {
+        for (auto &x : *phiDef) {
+          const_cast<Instruction *>(x->getInstruction())->remove();
+        }
+        for (auto &x : *phiUse) {
+          const Instruction *phiUseInsn = x->getInstruction();
+          replaceSrc(const_cast<Instruction *>(phiUseInsn), phi, phiCopy);
+        }
+      }
+    }
+    delete dag;
+  }
+
+  void GenWriter::removeMOVs(const ir::Liveness &liveness, ir::Function &fn)
+  {
+    // We store the last write and last read for each register
+    const uint32_t regNum = fn.regNum();
+    vector<RegInfoForMov> lastUse;
+    lastUse.resize(regNum);
+
+    // Remove the MOVs per block (local analysis only). Note that we do not try
+    // to remove MOVs for variables that outlive the block, so we use liveness
+    // information to figure out which variables are alive
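+    // Illustration (tmp is not live out of the block and dst is not read
+    // after tmp is written):
+    //   ADD tmp, a, b; MUL c, tmp, d; MOV dst, tmp;
+    // is rewritten into
+    //   ADD dst, a, b; MUL c, dst, d;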
+    fn.foreachBlock([&](ir::BasicBlock &bb)
+    {
+      // We need to know when each register will be read or written
+      buildRegInfo(bb, lastUse);
+
+      // Liveinfo helps us to know if the source outlives the block
+      const ir::Liveness::BlockInfo &info = liveness.getBlockInfo(&bb);
+
+      // Start from the last instruction, skipping the terminating branch if present
+      auto it = --bb.end();
+      if (it->isMemberOf<ir::BranchInstruction>() == true) --it;
+      for (; it != bb.end();) {
+        ir::Instruction *insn = &*it; it--;
+        const ir::Opcode op = insn->getOpcode();
+        if (op == ir::OP_MOV) {
+          const ir::Register dst = insn->getDst(0);
+          const ir::Register src = insn->getSrc(0);
+          // Outlives the block. We do not do anything
+          if (info.inLiveOut(src))
+            continue;
+          const RegInfoForMov &dstInfo = lastUse[dst];
+          const RegInfoForMov &srcInfo = lastUse[src];
+          // The source is not computed in this block
+          if (srcInfo.lastWrite == 0)
+            continue;
+          // dst is read after src is written. We cannot overwrite dst
+          if (dstInfo.lastRead > srcInfo.lastWrite)
+            continue;
+          // We are good. We first patch the destination then all the sources
+          replaceDst(srcInfo.lastWriteInsn, src, dst);
+          // Then we patch all subsequent uses of the source
+          ir::Instruction *next = static_cast<ir::Instruction*>(srcInfo.lastWriteInsn->next);
+          while (next != insn) {
+            replaceSrc(next, src, dst);
+            next = static_cast<ir::Instruction*>(next->next);
+          }
+          insn->remove();
+        } else if (op == ir::OP_LOADI)
+          continue;
+        else
+          break;
+      }
+    });
+  }
+
+  void GenWriter::removeLOADIs(const ir::Liveness &liveness, ir::Function &fn)
+  {
+    // We store the last write and last read for each register
+    const uint32_t regNum = fn.regNum();
+    vector<RegInfoForMov> lastUse;
+    lastUse.resize(regNum);
+
+    // Traverse all blocks and remove redundant immediates. Do *not* remove
+    // immediates that outlive the block
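+    // For instance, two "LOADI r, 0x10" in the same block are merged: the
+    // second one is removed (provided its destination does not outlive the
+    // block) and later uses of that destination are rewritten to the first one.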
+    fn.foreachBlock([&](ir::BasicBlock &bb)
+    {
+      // Each immediate that is already loaded in the block
+      map<ir::Immediate, ir::Register> loadedImm;
+
+      // Register-to-register translation for already-loaded immediates
+      map<ir::Register, ir::Register> immTranslate;
+
+      // Liveinfo helps us to know if the loaded immediate outlives the block
+      const ir::Liveness::BlockInfo &info = liveness.getBlockInfo(&bb);
+
+      // We need to know when each register will be read or written
+      buildRegInfo(bb, lastUse);
+
+      // Top-to-bottom traversal -> remove useless LOADIs
+      uint32_t insnID = 2;
+      bb.foreach([&](ir::Instruction &insn)
+      {
+        // We either try to remove the LOADI or we will try to use it as a
+        // replacement for the next same LOADIs
+        if (insn.isMemberOf<ir::LoadImmInstruction>()) {
+          ir::LoadImmInstruction &loadImm = cast<ir::LoadImmInstruction>(insn);
+          const ir::Immediate imm = loadImm.getImmediate();
+          const ir::Register dst = loadImm.getDst(0);
+
+          // Not seen yet: put it in the map if the register is not
+          // overwritten. If it is, we just ignore it for simplicity. Note that
+          // this should not happen with the way we "unSSA" the code
+          auto it = loadedImm.find(imm);
+          auto end = loadedImm.end();
+          if (it == end && lastUse[dst].lastWrite == insnID+1)
+            loadedImm.insert(std::make_pair(imm, dst));
+          // We already pushed the same immediate and we do not outlive the
+          // block. We are good to replace this immediate by the previous one
+          else if (it != end && info.inLiveOut(dst) == false) {
+            immTranslate.insert(std::make_pair(dst, it->second));
+            insn.remove();
+          }
+        }
+        // Traverse all the destinations and sources and perform the
+        // substitutions (if any)
+        else {
+          const uint32_t srcNum = insn.getSrcNum();
+          const uint32_t dstNum = insn.getDstNum();
+          for (uint32_t srcID = 0; srcID < srcNum; ++srcID) {
+            const ir::Register src = insn.getSrc(srcID);
+            auto it = immTranslate.find(src);
+            if (it != immTranslate.end())
+              insn.setSrc(srcID, it->second);
+          }
+          for (uint32_t dstID = 0; dstID < dstNum; ++dstID) {
+            const ir::Register dst = insn.getDst(dstID);
+            auto it = immTranslate.find(dst);
+            if (it != immTranslate.end())
+              insn.setDst(dstID, it->second);
+          }
+        }
+        insnID += 2;
+      });
+    });
+  }
+
+  BVAR(OCL_OPTIMIZE_PHI_MOVES, true);
+  BVAR(OCL_OPTIMIZE_LOADI, true);
+
+  static const Instruction *getInstructionUseLocal(const Value *v) {
+    // A local variable can only be used in one kernel function. So, if we find
+    // one instruction that uses the local variable, simply return it.
+    const Instruction *insn = NULL;
+    for(Value::const_use_iterator iter = v->use_begin(); iter != v->use_end(); ++iter) {
+    // After LLVM 3.5, use_iterator points to 'Use' instead of 'User', which is more straightforward.
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+      const User *theUser = *iter;
+#else
+      const User *theUser = iter->getUser();
+#endif
+      if(isa<Instruction>(theUser)) return cast<const Instruction>(theUser);
+      insn = getInstructionUseLocal(theUser);
+      if(insn != NULL) break;
+    }
+    return insn;
+  }
+
+  void GenWriter::allocateGlobalVariableRegister(Function &F)
+  {
+    // Allocate an address register for each global variable
+    const Module::GlobalListType &globalList = TheModule->getGlobalList();
+    size_t j = 0;
+    for(auto i = globalList.begin(); i != globalList.end(); i ++) {
+      const GlobalVariable &v = *i;
+      if(!v.isConstantUsed()) continue;
+
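+      // __local variables are laid out in SLM: each one is placed after the
+      // current SLM size (padded to its alignment) and its register is loaded
+      // with that byte offset. __constant variables instead load the offset of
+      // their entry in the unit's constant set.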
+      ir::AddressSpace addrSpace = addressSpaceLLVMToGen(v.getType()->getAddressSpace());
+      if(addrSpace == ir::MEM_LOCAL) {
+        const Value * val = cast<Value>(&v);
+        const Instruction *insn = getInstructionUseLocal(val);
+        GBE_ASSERT(insn && "Can't find a valid reference instruction for local variable.");
+
+        const BasicBlock * bb = insn->getParent();
+        const Function * func = bb->getParent();
+        if(func != &F) continue;
+
+        ir::Function &f = ctx.getFunction();
+        f.setUseSLM(true);
+        const Constant *c = v.getInitializer();
+        Type *ty = c->getType();
+        uint32_t oldSlm = f.getSLMSize();
+        uint32_t align = 8 * getAlignmentByte(unit, ty);
+        uint32_t padding = getPadding(oldSlm*8, align);
+
+        f.setSLMSize(oldSlm + padding/8 + getTypeByteSize(unit, ty));
+
+        this->newRegister(const_cast<GlobalVariable*>(&v));
+        ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
+        ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(oldSlm + padding/8, ir::TYPE_S32));
+      } else if(addrSpace == ir::MEM_CONSTANT) {
+        GBE_ASSERT(v.hasInitializer());
+        this->newRegister(const_cast<GlobalVariable*>(&v));
+        ir::Register reg = regTranslator.getScalar(const_cast<GlobalVariable*>(&v), 0);
+        ir::Constant &con = unit.getConstantSet().getConstant(j ++);
+        GBE_ASSERT(con.getName() == v.getName());
+        ctx.LOADI(ir::TYPE_S32, reg, ctx.newIntegerImmediate(con.getOffset(), ir::TYPE_S32));
+      } else {
+        if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) {
+          ctx.appendSurface(btiBase, ir::ocl::printfbptr);
+          ctx.getFunction().getPrintfSet()->setBufBTI(btiBase);
+          globalPointer.insert(std::make_pair(&v, incBtiBase()));
+          regTranslator.newScalarProxy(ir::ocl::printfbptr, const_cast<GlobalVariable*>(&v));
+        } else if(v.getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
+          ctx.appendSurface(btiBase, ir::ocl::printfiptr);
+          ctx.getFunction().getPrintfSet()->setIndexBufBTI(btiBase);
+          globalPointer.insert(std::make_pair(&v, incBtiBase()));
+          regTranslator.newScalarProxy(ir::ocl::printfiptr, const_cast<GlobalVariable*>(&v));
+        } else if(v.getName().str().substr(0, 4) == ".str") {
+          /* When there are multiple printf statements in multiple kernel functions within the
+             same translation unit, and they have the same string parameter, such as
+             kernel_func1 () {
+               printf("Line is %d\n", line_num1);
+             }
+             kernel_func2 () {
+               printf("Line is %d\n", line_num2);
+             }
+             Clang will generate just one global string named .strXXX to represent "Line is %d\n".
+             So when translating kernel_func1, we cannot unref that global var and we will
+             get here. Just ignore it to avoid the assert. */
+        } else {
+          GBE_ASSERT(0);
+        }
+      }
+    }
+
+  }
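+
+  /*! Flatten the loop nest into a worklist of (loop, parent index) pairs:
+   *  top-level loops get parent -1 and sub-loops record the index of their
+   *  parent loop within the same vector.
+   */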
+  static INLINE void findAllLoops(LoopInfo * LI, std::vector<std::pair<Loop*, int>> &lp)
+  {
+      for (Loop::reverse_iterator I = LI->rbegin(), E = LI->rend(); I != E; ++I) {
+        lp.push_back(std::make_pair(*I, -1));
+      }
+      if (lp.size() == 0) return;
+
+      uint32_t i = 0;
+      do {
+        const std::vector<Loop*> subLoops = lp[i].first->getSubLoops();
+        for(auto sub : subLoops)
+          lp.push_back(std::make_pair(sub, i));
+        i++;
+      } while(i < lp.size());
+  }
+
+  void GenWriter::gatherLoopInfo(ir::Function &fn) {
+    vector<ir::LabelIndex> loopBBs;
+    vector<std::pair<ir::LabelIndex, ir::LabelIndex>> loopExits;
+    std::vector<std::pair<Loop*, int>> lp;
+
+    findAllLoops(LI, lp);
+#if GBE_DEBUG
+    // check two loops' interference
+    for(unsigned int i = 0; i < lp.size(); i++) {
+        SmallVector<Loop::Edge, 8> exitBBs;
+        lp[i].first->getExitEdges(exitBBs);
+
+      const std::vector<BasicBlock*> &inBBs = lp[i].first->getBlocks();
+      std::vector<ir::LabelIndex> bbs1;
+      for(auto x : inBBs) {
+        bbs1.push_back(labelMap[x]);
+      }
+      std::sort(bbs1.begin(), bbs1.end());
+      for(unsigned int j = i+1; j < lp.size(); j++) {
+        if(! lp[i].first->contains(lp[j].first)) {
+          const std::vector<BasicBlock*> &inBBs2 = lp[j].first->getBlocks();
+          std::vector<ir::LabelIndex> bbs2;
+          std::vector<ir::LabelIndex> bbs3;
+
+          for(auto x : inBBs2) {
+            bbs2.push_back(labelMap[x]);
+          }
+
+          std::sort(bbs2.begin(), bbs2.end());
+          std::set_intersection(bbs1.begin(), bbs1.end(), bbs2.begin(), bbs2.end(), std::back_inserter(bbs3));
+          GBE_ASSERT(bbs3.size() < 1);
+        }
+      }
+    }
+#endif
+
+    for (auto loop : lp) {
+      loopBBs.clear();
+      loopExits.clear();
+
+      const std::vector<BasicBlock*> &inBBs = loop.first->getBlocks();
+      for (auto b : inBBs) {
+        GBE_ASSERT(labelMap.find(b) != labelMap.end());
+        loopBBs.push_back(labelMap[b]);
+      }
+
+      SmallVector<Loop::Edge, 8> exitBBs;
+      loop.first->getExitEdges(exitBBs);
+      for(auto b : exitBBs){
+        GBE_ASSERT(labelMap.find(b.first) != labelMap.end());
+        GBE_ASSERT(labelMap.find(b.second) != labelMap.end());
+        loopExits.push_back(std::make_pair(labelMap[b.first], labelMap[b.second]));
+      }
+      fn.addLoop(loopBBs, loopExits);
+    }
+  }
+
+  void GenWriter::emitFunction(Function &F)
+  {
+    switch (F.getCallingConv()) {
+#if LLVM_VERSION_MINOR <= 2
+      case CallingConv::PTX_Device: // we do not emit device function
+        return;
+      case CallingConv::PTX_Kernel:
+#else
+      case CallingConv::C:
+#endif
+        break;
+      default: GBE_ASSERTM(false, "Unsupported calling convention");
+    }
+
+    ctx.startFunction(F.getName());
+    ir::Function &fn = ctx.getFunction();
+    this->regTranslator.clear();
+    this->labelMap.clear();
+    this->emitFunctionPrototype(F);
+
+    this->allocateGlobalVariableRegister(F);
+    // Visit all the instructions and emit the IR registers or the value to
+    // value mapping when a new register is not needed
+    pass = PASS_EMIT_REGISTERS;
+    for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I)
+      visit(*I);
+
+    // First create all the labels (one per block) ...
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+      this->newLabelIndex(BB);
+
+    // Then, for all branch instructions that have conditions, see if we can
+    // simplify the code by inverting condition code
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+      this->simplifyTerminator(BB);
+
+    // gather loop info, which is useful for liveness analysis
+    gatherLoopInfo(fn);
+
+    // ... then, emit the instructions for all basic blocks
+    pass = PASS_EMIT_INSTRUCTIONS;
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+      emitBasicBlock(BB);
+    ctx.endFunction();
+
+    // Liveness can be shared when we optimized the immediates and the MOVs
+    ir::Liveness liveness(fn);
+
+    if (OCL_OPTIMIZE_LOADI) this->removeLOADIs(liveness, fn);
+    if (OCL_OPTIMIZE_PHI_MOVES) this->optimizePhiCopy(liveness, fn);
+    if (OCL_OPTIMIZE_PHI_MOVES) this->removeMOVs(liveness, fn);
+  }
+
+  void GenWriter::regAllocateReturnInst(ReturnInst &I) {}
+
+  void GenWriter::emitReturnInst(ReturnInst &I) {
+    const ir::Function &fn = ctx.getFunction();
+    GBE_ASSERTM(fn.outputNum() <= 1, "no more than one value can be returned");
+    if (fn.outputNum() == 1 && I.getNumOperands() > 0) {
+      const ir::Register dst = fn.getOutput(0);
+      const ir::Register src = this->getRegister(I.getOperand(0));
+      const ir::RegisterFamily family = fn.getRegisterFamily(dst);
+      ctx.MOV(ir::getType(family), dst, src);
+    }
+    ctx.RET();
+  }
+
+  void GenWriter::regAllocateBinaryOperator(Instruction &I) {
+    this->newRegister(&I);
+  }
+
+  void GenWriter::emitBinaryOperator(Instruction &I) {
+#if GBE_DEBUG
+    GBE_ASSERT(I.getType()->isPointerTy() == false);
+    // We accept logical operations on booleans
+    switch (I.getOpcode()) {
+      case Instruction::And:
+      case Instruction::Or:
+      case Instruction::Xor:
+        break;
+      default:
+        GBE_ASSERT(I.getType() != Type::getInt1Ty(I.getContext()));
+    }
+#endif /* GBE_DEBUG */
+
+    // Get the element type for a vector
+    const ir::Type type = getType(ctx, I.getType());
+
+    // Emit the instructions in a row
+    const ir::Register dst = this->getRegister(&I);
+    const ir::Register src0 = this->getRegister(I.getOperand(0));
+    const ir::Register src1 = this->getRegister(I.getOperand(1));
+
+    switch (I.getOpcode()) {
+      case Instruction::Add:
+      case Instruction::FAdd: ctx.ADD(type, dst, src0, src1); break;
+      case Instruction::Sub:
+      case Instruction::FSub: ctx.SUB(type, dst, src0, src1); break;
+      case Instruction::Mul:
+      case Instruction::FMul: ctx.MUL(type, dst, src0, src1); break;
+      case Instruction::URem: ctx.REM(getUnsignedType(ctx, I.getType()), dst, src0, src1); break;
+      case Instruction::SRem:
+      case Instruction::FRem: ctx.REM(type, dst, src0, src1); break;
+      case Instruction::UDiv: ctx.DIV(getUnsignedType(ctx, I.getType()), dst, src0, src1); break;
+      case Instruction::SDiv:
+      case Instruction::FDiv: ctx.DIV(type, dst, src0, src1); break;
+      case Instruction::And:  ctx.AND(type, dst, src0, src1); break;
+      case Instruction::Or:   ctx.OR(type, dst, src0, src1); break;
+      case Instruction::Xor:  ctx.XOR(type, dst, src0, src1); break;
+      case Instruction::Shl:  ctx.SHL(type, dst, src0, src1); break;
+      case Instruction::LShr: ctx.SHR(getUnsignedType(ctx, I.getType()), dst, src0, src1); break;
+      case Instruction::AShr: ctx.ASR(type, dst, src0, src1); break;
+      default: NOT_SUPPORTED;
+    }
+  }
+
+  void GenWriter::regAllocateICmpInst(ICmpInst &I) {
+    this->newRegister(&I);
+  }
+
+  static ir::Type makeTypeSigned(const ir::Type &type) {
+    if (type == ir::TYPE_U8) return ir::TYPE_S8;
+    else if (type == ir::TYPE_U16) return ir::TYPE_S16;
+    else if (type == ir::TYPE_U32) return ir::TYPE_S32;
+    else if (type == ir::TYPE_U64) return ir::TYPE_S64;
+    return type;
+  }
+
+  static ir::Type makeTypeUnsigned(const ir::Type &type) {
+    if (type == ir::TYPE_S8) return ir::TYPE_U8;
+    else if (type == ir::TYPE_S16) return ir::TYPE_U16;
+    else if (type == ir::TYPE_S32) return ir::TYPE_U32;
+    else if (type == ir::TYPE_S64) return ir::TYPE_U64;
+    return type;
+  }
+
+  void GenWriter::emitICmpInst(ICmpInst &I) {
+    GBE_ASSERT(I.getOperand(0)->getType() != Type::getInt1Ty(I.getContext()));
+
+    // Get the element type and the number of elements
+    Type *operandType = I.getOperand(0)->getType();
+    const ir::Type type = getType(ctx, operandType);
+    const ir::Type signedType = makeTypeSigned(type);
+    const ir::Type unsignedType = makeTypeUnsigned(type);
+
+    // Emit the instructions in a row
+    const ir::Register dst = this->getRegister(&I);
+    const ir::Register src0 = this->getRegister(I.getOperand(0));
+    const ir::Register src1 = this->getRegister(I.getOperand(1));
+
+    // We must invert the condition to simplify the branch code
+    if (conditionSet.find(&I) != conditionSet.end()) {
+      switch (I.getPredicate()) {
+        case ICmpInst::ICMP_EQ:  ctx.NE(type, dst, src0, src1); break;
+        case ICmpInst::ICMP_NE:  ctx.EQ(type, dst, src0, src1); break;
+        case ICmpInst::ICMP_ULE: ctx.GT((unsignedType), dst, src0, src1); break;
+        case ICmpInst::ICMP_SLE: ctx.GT(signedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_UGE: ctx.LT(unsignedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_SGE: ctx.LT(signedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_ULT: ctx.GE(unsignedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_SLT: ctx.GE(signedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_UGT: ctx.LE(unsignedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_SGT: ctx.LE(signedType, dst, src0, src1); break;
+        default: NOT_SUPPORTED;
+      }
+    }
+    // Nothing special to do
+    else {
+      switch (I.getPredicate()) {
+        case ICmpInst::ICMP_EQ:  ctx.EQ(type, dst, src0, src1); break;
+        case ICmpInst::ICMP_NE:  ctx.NE(type, dst, src0, src1); break;
+        case ICmpInst::ICMP_ULE: ctx.LE((unsignedType), dst, src0, src1); break;
+        case ICmpInst::ICMP_SLE: ctx.LE(signedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_UGE: ctx.GE(unsignedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_SGE: ctx.GE(signedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_ULT: ctx.LT(unsignedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_SLT: ctx.LT(signedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_UGT: ctx.GT(unsignedType, dst, src0, src1); break;
+        case ICmpInst::ICMP_SGT: ctx.GT(signedType, dst, src0, src1); break;
+        default: NOT_SUPPORTED;
+      }
+    }
+  }
+
+  void GenWriter::regAllocateFCmpInst(FCmpInst &I) {
+    this->newRegister(&I);
+  }
+
+  void GenWriter::emitFCmpInst(FCmpInst &I) {
+
+    // Get the element type and the number of elements
+    Type *operandType = I.getOperand(0)->getType();
+    const ir::Type type = getType(ctx, operandType);
+    const ir::Type insnType = getType(ctx, I.getType());
+
+    // Emit the instructions in a row
+    const ir::Register dst = this->getRegister(&I);
+    const ir::Register src0 = this->getRegister(I.getOperand(0));
+    const ir::Register src1 = this->getRegister(I.getOperand(1));
+    const ir::Register tmp = ctx.reg(getFamily(ctx, I.getType()));
+    Value *cv = ConstantInt::get(I.getType(), 1);
+
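+    // Unordered predicates below are emitted as the negation of the
+    // complementary ordered compare: the ordered result is computed into tmp
+    // and then XORed with 1.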
+    switch (I.getPredicate()) {
+      case ICmpInst::FCMP_OEQ: ctx.EQ(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_ONE: ctx.NE(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_OLE: ctx.LE(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_OGE: ctx.GE(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_OLT: ctx.LT(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_OGT: ctx.GT(type, dst, src0, src1); break;
+      case ICmpInst::FCMP_ORD:
+        // If either src0 or src1 is a constant, that constant value must be
+        // ordered, otherwise LLVM would have optimized the instruction to true.
+        // So discard the constant value and only compare the other src.
+        if(isa<ConstantFP>(I.getOperand(0)))
+          ctx.EQ(type, dst, src1, src1);
+        else if(isa<ConstantFP>(I.getOperand(1)))
+          ctx.EQ(type, dst, src0, src0);
+        else
+          ctx.ORD(type, dst, src0, src1);
+        break;
+      case ICmpInst::FCMP_UNO:
+        if(isa<ConstantFP>(I.getOperand(0)))
+          ctx.NE(type, dst, src1, src1);
+        else if(isa<ConstantFP>(I.getOperand(1)))
+          ctx.NE(type, dst, src0, src0);
+        else {
+          ctx.ORD(type, tmp, src0, src1);
+          ctx.XOR(insnType, dst, tmp, getRegister(cv));  //TODO: Use NOT directly
+        }
+        break;
+      case ICmpInst::FCMP_UEQ:
+        ctx.NE(type, tmp, src0, src1);
+        ctx.XOR(insnType, dst, tmp, getRegister(cv));
+        break;
+      case ICmpInst::FCMP_UGT:
+        ctx.LE(type, tmp, src0, src1);
+        ctx.XOR(insnType, dst, tmp, getRegister(cv));
+        break;
+      case ICmpInst::FCMP_UGE:
+        ctx.LT(type, tmp, src0, src1);
+        ctx.XOR(insnType, dst, tmp, getRegister(cv));
+        break;
+      case ICmpInst::FCMP_ULT:
+        ctx.GE(type, tmp, src0, src1);
+        ctx.XOR(insnType, dst, tmp, getRegister(cv));
+        break;
+      case ICmpInst::FCMP_ULE:
+        ctx.GT(type, tmp, src0, src1);
+        ctx.XOR(insnType, dst, tmp, getRegister(cv));
+        break;
+      case ICmpInst::FCMP_UNE:
+        ctx.EQ(type, tmp, src0, src1);
+        ctx.XOR(insnType, dst, tmp, getRegister(cv));
+        break;
+      case ICmpInst::FCMP_TRUE:
+        ctx.MOV(insnType, dst, getRegister(cv));
+        break;
+      default: NOT_SUPPORTED;
+    }
+  }
+
+  void GenWriter::regAllocateCastInst(CastInst &I) {
+    Value *dstValue = &I;
+    Value *srcValue = I.getOperand(0);
+    const auto op = I.getOpcode();
+
+    switch (op)
+    {
+      // When casting between pointers and integers, be careful with constant operands
+      case Instruction::PtrToInt:
+      case Instruction::IntToPtr:
+      {
+        Constant *CPV = dyn_cast<Constant>(srcValue);
+        if (CPV == NULL) {
+#if GBE_DEBUG
+          Type *dstType = dstValue->getType();
+          Type *srcType = srcValue->getType();
+          GBE_ASSERT(getTypeByteSize(unit, dstType) == getTypeByteSize(unit, srcType));
+#endif /* GBE_DEBUG */
+          regTranslator.newValueProxy(srcValue, dstValue);
+        } else
+          this->newRegister(dstValue);
+      }
+      break;
+      // Bitcast just forwards registers
+      case Instruction::BitCast:
+      {
+        Type *srcType = srcValue->getType();
+        Type *dstType = dstValue->getType();
+
+        if(srcType->isVectorTy() || dstType->isVectorTy())
+          this->newRegister(dstValue);
+        else
+          regTranslator.newValueProxy(srcValue, dstValue);
+      }
+      break;
+      // Various conversion operations -> just allocate registers for them
+      case Instruction::FPToUI:
+      case Instruction::FPToSI:
+      case Instruction::SIToFP:
+      case Instruction::UIToFP:
+      case Instruction::SExt:
+      case Instruction::ZExt:
+      case Instruction::FPExt:
+      case Instruction::FPTrunc:
+      case Instruction::Trunc:
+        this->newRegister(&I);
+      break;
+      default: NOT_SUPPORTED;
+    }
+  }
+
+  void GenWriter::emitCastInst(CastInst &I) {
+    switch (I.getOpcode())
+    {
+      case Instruction::PtrToInt:
+      case Instruction::IntToPtr:
+      {
+        Value *dstValue = &I;
+        Value *srcValue = I.getOperand(0);
+        Constant *CPV = dyn_cast<Constant>(srcValue);
+        if (CPV != NULL) {
+          const ir::ImmediateIndex index = ctx.newImmediate(CPV);
+          const ir::Immediate imm = ctx.getImmediate(index);
+          const ir::Register reg = this->getRegister(dstValue);
+          ctx.LOADI(imm.getType(), reg, index);
+        }
+      }
+      break;
+      case Instruction::BitCast:
+      {
+        Value *srcValue = I.getOperand(0);
+        Value *dstValue = &I;
+        uint32_t srcElemNum = 0, dstElemNum = 0 ;
+        ir::Type srcType = getVectorInfo(ctx, srcValue->getType(), srcValue, srcElemNum);
+        ir::Type dstType = getVectorInfo(ctx, dstValue->getType(), dstValue, dstElemNum);
+        // As long and double are not compatible in register storage
+        // and we do not support double yet, simply put an assert here
+        GBE_ASSERT(!(srcType == ir::TYPE_S64 && dstType == ir::TYPE_DOUBLE));
+        GBE_ASSERT(!(dstType == ir::TYPE_S64 && srcType == ir::TYPE_DOUBLE));
+
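+        // e.g. bitcasting <4 x i8> to i32 gathers the four byte registers into
+        // a source tuple and emits one BITCAST writing the single dword register.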
+        if(srcElemNum > 1 || dstElemNum > 1) {
+          // Build the tuple data in the vector
+          vector<ir::Register> srcTupleData;
+          vector<ir::Register> dstTupleData;
+          uint32_t elemID = 0;
+          for (elemID = 0; elemID < srcElemNum; ++elemID) {
+            ir::Register reg;
+            reg = this->getRegister(srcValue, elemID);
+            srcTupleData.push_back(reg);
+          }
+          for (elemID = 0; elemID < dstElemNum; ++elemID) {
+            ir::Register reg;
+            reg = this->getRegister(dstValue, elemID);
+            dstTupleData.push_back(reg);
+          }
+
+          const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], srcElemNum);
+          const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], dstElemNum);
+
+          ctx.BITCAST(dstType, srcType, dstTuple, srcTuple, dstElemNum, srcElemNum);
+        }
+      }
+      break; // nothing to emit here
+      case Instruction::FPToUI:
+      case Instruction::FPToSI:
+      case Instruction::SIToFP:
+      case Instruction::UIToFP:
+      case Instruction::SExt:
+      case Instruction::ZExt:
+      case Instruction::FPExt:
+      case Instruction::FPTrunc:
+      case Instruction::Trunc:
+      {
+        // Get the element type for a vector
+        Type *llvmDstType = I.getType();
+        Type *llvmSrcType = I.getOperand(0)->getType();
+        ir::Type dstType;
+        if (I.getOpcode() == Instruction::FPToUI)
+          dstType = getUnsignedType(ctx, llvmDstType);
+        else
+          dstType = getType(ctx, llvmDstType);
+        ir::Type srcType;
+        if (I.getOpcode() == Instruction::ZExt || I.getOpcode() == Instruction::UIToFP) {
+          srcType = getUnsignedType(ctx, llvmSrcType);
+        } else {
+          srcType = getType(ctx, llvmSrcType);
+        }
+
+        // We use a select (0,1) rather than a convert when the source is a boolean
+        if (srcType == ir::TYPE_BOOL) {
+          const ir::RegisterFamily family = getFamily(dstType);
+          const ir::ImmediateIndex zero = ctx.newIntegerImmediate(0, dstType);
+          ir::ImmediateIndex one;
+          if (I.getOpcode() == Instruction::SExt
+              && (dstType == ir::TYPE_S8 || dstType == ir::TYPE_S16 || dstType == ir::TYPE_S32 || dstType == ir::TYPE_S64))
+            one = ctx.newIntegerImmediate(-1, dstType);
+          else
+            one = ctx.newIntegerImmediate(1, dstType);
+          const ir::Register zeroReg = ctx.reg(family);
+          const ir::Register oneReg = ctx.reg(family);
+          ctx.LOADI(dstType, zeroReg, zero);
+          ctx.LOADI(dstType, oneReg, one);
+          const ir::Register dst = this->getRegister(&I);
+          const ir::Register src = this->getRegister(I.getOperand(0));
+          ctx.SEL(dstType, dst, src, oneReg, zeroReg);
+        }
+        // Use a convert for the other cases
+        else {
+          const ir::Register dst = this->getRegister(&I);
+          const ir::Register src = this->getRegister(I.getOperand(0));
+          ctx.CVT(dstType, srcType, dst, src);
+        }
+      }
+      break;
+      default: NOT_SUPPORTED;
+    }
+  }
+
+  /*! Because there are still fake insert/extract instructions for
+   *  load/store, keep these functions empty here */
+  void GenWriter::regAllocateInsertElement(InsertElementInst &I) {}
+  void GenWriter::emitInsertElement(InsertElementInst &I) {
+    const VectorType *type = dyn_cast<VectorType>(I.getType());
+    GBE_ASSERT(type);
+    const int elemNum = type->getNumElements();
+
+    Value *vec = I.getOperand(0);
+    Value *value = I.getOperand(1);
+    const Value *index = I.getOperand(2);
+    const ConstantInt *c = dyn_cast<ConstantInt>(index);
+    int i = c->getValue().getSExtValue();
+
+    for(int j=0; j<elemNum; j++) {
+      if(i == j)
+        regTranslator.newValueProxy(value, &I, 0, i);
+      else
+        regTranslator.newValueProxy(vec, &I, j, j);
+    }
+  }
+
+  void GenWriter::regAllocateExtractElement(ExtractElementInst &I) {
+    Value *vec = I.getVectorOperand();
+    const Value *index = I.getIndexOperand();
+    const ConstantInt *c = dyn_cast<ConstantInt>(index);
+    GBE_ASSERT(c);
+    int i = c->getValue().getSExtValue();
+    regTranslator.newValueProxy(vec, &I, i, 0);
+  }
+
+  void GenWriter::emitExtractElement(ExtractElementInst &I) {
+  }
+
+  void GenWriter::regAllocateShuffleVectorInst(ShuffleVectorInst &I) {}
+  void GenWriter::emitShuffleVectorInst(ShuffleVectorInst &I) {}
+
+  void GenWriter::regAllocateSelectInst(SelectInst &I) {
+    this->newRegister(&I);
+  }
+
+  void GenWriter::emitSelectInst(SelectInst &I) {
+    // Get the element type for a vector
+    const ir::Type type = getType(ctx, I.getType());
+
+    // Emit the instructions in a row
+    const ir::Register dst = this->getRegister(&I);
+    const ir::Register cond = this->getRegister(I.getOperand(0));
+    const ir::Register src0 = this->getRegister(I.getOperand(1));
+    const ir::Register src1 = this->getRegister(I.getOperand(2));
+    ctx.SEL(type, dst, cond, src0, src1);
+  }
+
+  void GenWriter::regAllocatePHINode(PHINode &I) {
+    // Copy 1 for the PHI
+    this->newRegister(&I);
+    // Copy 2 to avoid lost copy issue
+    Value *copy = this->getPHICopy(&I);
+    this->newRegister(&I, copy);
+  }
+
+  void GenWriter::emitPHINode(PHINode &I) {
+    Value *copy = this->getPHICopy(&I);
+    const ir::Type type = getType(ctx, I.getType());
+
+    const ir::Register dst = this->getRegister(&I);
+    const ir::Register src = this->getRegister(copy);
+    ctx.MOV(type, dst, src);
+    phiMap.insert(std::make_pair(dst, src));
+  }
+
+  void GenWriter::regAllocateBranchInst(BranchInst &I) {}
+
+  void GenWriter::emitBranchInst(BranchInst &I) {
+    // Emit MOVs if required
+    BasicBlock *bb = I.getParent();
+    this->emitMovForPHI(bb, I.getSuccessor(0));
+    if (I.isConditional())
+      this->emitMovForPHI(bb, I.getSuccessor(1));
+
+    // Unconditional branch. Only emit a jump when the target is not the block
+    // that immediately follows (the fall-through successor)
+    if (I.isConditional() == false) {
+      BasicBlock *target = I.getSuccessor(0);
+      if (std::next(Function::iterator(bb)) != Function::iterator(target)) {
+        GBE_ASSERT(labelMap.find(target) != labelMap.end());
+        const ir::LabelIndex labelIndex = labelMap[target];
+        ctx.BRA(labelIndex);
+      }
+    }
+    // The LLVM branch has two targets
+    else {
+      BasicBlock *taken = NULL, *nonTaken = NULL;
+      Value *condition = I.getCondition();
+
+      // We may have inverted the branch condition to simplify the branching code
+      const bool inverted = conditionSet.find(condition) != conditionSet.end();
+      taken = inverted ? I.getSuccessor(1) : I.getSuccessor(0);
+      nonTaken = inverted ? I.getSuccessor(0) : I.getSuccessor(1);
+
+      // Get both taken label and predicate register
+      GBE_ASSERT(labelMap.find(taken) != labelMap.end());
+      const ir::LabelIndex index = labelMap[taken];
+      const ir::Register reg = this->getRegister(condition);
+      ctx.BRA(index, reg);
+
+      // If the non-taken target is the next block, there is nothing to do
+      BasicBlock *bb = I.getParent();
+      if (std::next(Function::iterator(bb)) == Function::iterator(nonTaken))
+        return;
+
+      // This is slightly more complicated here. We need to issue one more
+      // branch for the non-taken condition.
+      GBE_ASSERT(labelMap.find(nonTaken) != labelMap.end());
+      const ir::LabelIndex untakenIndex = ctx.label();
+      ctx.LABEL(untakenIndex);
+      ctx.BRA(labelMap[nonTaken]);
+    }
+  }
+
+  void GenWriter::regAllocateCallInst(CallInst &I) {
+    Value *dst = &I;
+    Value *Callee = I.getCalledValue();
+    GBE_ASSERT(ctx.getFunction().getProfile() == ir::PROFILE_OCL);
+    GBE_ASSERT(isa<InlineAsm>(I.getCalledValue()) == false);
+    GBE_ASSERT(I.hasStructRetAttr() == false);
+
+    // We only support a small number of intrinsics right now
+    if (Function *F = I.getCalledFunction()) {
+      const Intrinsic::ID intrinsicID = (Intrinsic::ID) F->getIntrinsicID();
+      if (intrinsicID != 0) {
+        switch (F->getIntrinsicID()) {
+          case Intrinsic::stacksave:
+            this->newRegister(&I);
+          break;
+          case Intrinsic::stackrestore:
+          break;
+#if LLVM_VERSION_MINOR >= 2
+          case Intrinsic::lifetime_start:
+          case Intrinsic::lifetime_end:
+          break;
+          case Intrinsic::fmuladd:
+            this->newRegister(&I);
+          break;
+#endif /* LLVM_VERSION_MINOR >= 2 */
+          case Intrinsic::debugtrap:
+          case Intrinsic::dbg_value:
+          case Intrinsic::dbg_declare:
+          break;
+          default:
+          GBE_ASSERTM(false, "Unsupported intrinsics");
+        }
+        return;
+      }
+    }
+
+    // Get the name of the called function and handle it
+    const std::string fnName = Callee->getName();
+    auto it = instrinsicMap.map.find(fnName);
+    GBE_ASSERT(it != instrinsicMap.map.end());
+    switch (it->second) {
+      case GEN_OCL_GET_GROUP_ID0:
+        regTranslator.newScalarProxy(ir::ocl::groupid0, dst); break;
+      case GEN_OCL_GET_GROUP_ID1:
+        regTranslator.newScalarProxy(ir::ocl::groupid1, dst); break;
+      case GEN_OCL_GET_GROUP_ID2:
+        regTranslator.newScalarProxy(ir::ocl::groupid2, dst); break;
+      case GEN_OCL_GET_LOCAL_ID0:
+        regTranslator.newScalarProxy(ir::ocl::lid0, dst); break;
+      case GEN_OCL_GET_LOCAL_ID1:
+        regTranslator.newScalarProxy(ir::ocl::lid1, dst); break;
+      case GEN_OCL_GET_LOCAL_ID2:
+        regTranslator.newScalarProxy(ir::ocl::lid2, dst); break;
+      case GEN_OCL_GET_NUM_GROUPS0:
+        regTranslator.newScalarProxy(ir::ocl::numgroup0, dst); break;
+      case GEN_OCL_GET_NUM_GROUPS1:
+        regTranslator.newScalarProxy(ir::ocl::numgroup1, dst); break;
+      case GEN_OCL_GET_NUM_GROUPS2:
+        regTranslator.newScalarProxy(ir::ocl::numgroup2, dst); break;
+      case GEN_OCL_GET_LOCAL_SIZE0:
+        regTranslator.newScalarProxy(ir::ocl::lsize0, dst); break;
+      case GEN_OCL_GET_LOCAL_SIZE1:
+        regTranslator.newScalarProxy(ir::ocl::lsize1, dst); break;
+      case GEN_OCL_GET_LOCAL_SIZE2:
+        regTranslator.newScalarProxy(ir::ocl::lsize2, dst); break;
+      case GEN_OCL_GET_GLOBAL_SIZE0:
+        regTranslator.newScalarProxy(ir::ocl::gsize0, dst); break;
+      case GEN_OCL_GET_GLOBAL_SIZE1:
+        regTranslator.newScalarProxy(ir::ocl::gsize1, dst); break;
+      case GEN_OCL_GET_GLOBAL_SIZE2:
+        regTranslator.newScalarProxy(ir::ocl::gsize2, dst); break;
+      case GEN_OCL_GET_GLOBAL_OFFSET0:
+        regTranslator.newScalarProxy(ir::ocl::goffset0, dst); break;
+      case GEN_OCL_GET_GLOBAL_OFFSET1:
+        regTranslator.newScalarProxy(ir::ocl::goffset1, dst); break;
+      case GEN_OCL_GET_GLOBAL_OFFSET2:
+        regTranslator.newScalarProxy(ir::ocl::goffset2, dst); break;
+      case GEN_OCL_GET_WORK_DIM:
+        regTranslator.newScalarProxy(ir::ocl::workdim, dst); break;
+      case GEN_OCL_PRINTF_BUF_ADDR:
+        regTranslator.newScalarProxy(ir::ocl::printfbptr, dst); break;
+      case GEN_OCL_PRINTF_INDEX_BUF_ADDR:
+        regTranslator.newScalarProxy(ir::ocl::printfiptr, dst); break;
+      case GEN_OCL_FBH:
+      case GEN_OCL_FBL:
+      case GEN_OCL_COS:
+      case GEN_OCL_SIN:
+      case GEN_OCL_SQR:
+      case GEN_OCL_RSQ:
+      case GEN_OCL_LOG:
+      case GEN_OCL_EXP:
+      case GEN_OCL_POW:
+      case GEN_OCL_RCP:
+      case GEN_OCL_ABS:
+      case GEN_OCL_FABS:
+      case GEN_OCL_RNDZ:
+      case GEN_OCL_RNDE:
+      case GEN_OCL_RNDU:
+      case GEN_OCL_RNDD:
+      case GEN_OCL_GET_IMAGE_WIDTH:
+      case GEN_OCL_GET_IMAGE_HEIGHT:
+      case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
+      case GEN_OCL_GET_IMAGE_CHANNEL_ORDER:
+      case GEN_OCL_GET_IMAGE_DEPTH:
+      case GEN_OCL_ATOMIC_ADD0:
+      case GEN_OCL_ATOMIC_ADD1:
+      case GEN_OCL_ATOMIC_SUB0:
+      case GEN_OCL_ATOMIC_SUB1:
+      case GEN_OCL_ATOMIC_AND0:
+      case GEN_OCL_ATOMIC_AND1:
+      case GEN_OCL_ATOMIC_OR0:
+      case GEN_OCL_ATOMIC_OR1:
+      case GEN_OCL_ATOMIC_XOR0:
+      case GEN_OCL_ATOMIC_XOR1:
+      case GEN_OCL_ATOMIC_XCHG0:
+      case GEN_OCL_ATOMIC_XCHG1:
+      case GEN_OCL_ATOMIC_UMAX0:
+      case GEN_OCL_ATOMIC_UMAX1:
+      case GEN_OCL_ATOMIC_UMIN0:
+      case GEN_OCL_ATOMIC_UMIN1:
+      case GEN_OCL_ATOMIC_IMAX0:
+      case GEN_OCL_ATOMIC_IMAX1:
+      case GEN_OCL_ATOMIC_IMIN0:
+      case GEN_OCL_ATOMIC_IMIN1:
+      case GEN_OCL_ATOMIC_INC0:
+      case GEN_OCL_ATOMIC_INC1:
+      case GEN_OCL_ATOMIC_DEC0:
+      case GEN_OCL_ATOMIC_DEC1:
+      case GEN_OCL_ATOMIC_CMPXCHG0:
+      case GEN_OCL_ATOMIC_CMPXCHG1:
+        // No structure can be returned
+        this->newRegister(&I);
+        break;
+      case GEN_OCL_FORCE_SIMD8:
+      case GEN_OCL_FORCE_SIMD16:
+      case GEN_OCL_LBARRIER:
+      case GEN_OCL_GBARRIER:
+      case GEN_OCL_LGBARRIER:
+        ctx.getFunction().setUseSLM(true);
+        break;
+      case GEN_OCL_WRITE_IMAGE_I_1D:
+      case GEN_OCL_WRITE_IMAGE_UI_1D:
+      case GEN_OCL_WRITE_IMAGE_F_1D:
+      case GEN_OCL_WRITE_IMAGE_I_2D:
+      case GEN_OCL_WRITE_IMAGE_UI_2D:
+      case GEN_OCL_WRITE_IMAGE_F_2D:
+      case GEN_OCL_WRITE_IMAGE_I_3D:
+      case GEN_OCL_WRITE_IMAGE_UI_3D:
+      case GEN_OCL_WRITE_IMAGE_F_3D:
+        break;
+      case GEN_OCL_READ_IMAGE_I_1D:
+      case GEN_OCL_READ_IMAGE_UI_1D:
+      case GEN_OCL_READ_IMAGE_F_1D:
+      case GEN_OCL_READ_IMAGE_I_2D:
+      case GEN_OCL_READ_IMAGE_UI_2D:
+      case GEN_OCL_READ_IMAGE_F_2D:
+      case GEN_OCL_READ_IMAGE_I_3D:
+      case GEN_OCL_READ_IMAGE_UI_3D:
+      case GEN_OCL_READ_IMAGE_F_3D:
+
+      case GEN_OCL_READ_IMAGE_I_1D_I:
+      case GEN_OCL_READ_IMAGE_UI_1D_I:
+      case GEN_OCL_READ_IMAGE_F_1D_I:
+      case GEN_OCL_READ_IMAGE_I_2D_I:
+      case GEN_OCL_READ_IMAGE_UI_2D_I:
+      case GEN_OCL_READ_IMAGE_F_2D_I:
+      case GEN_OCL_READ_IMAGE_I_3D_I:
+      case GEN_OCL_READ_IMAGE_UI_3D_I:
+      case GEN_OCL_READ_IMAGE_F_3D_I:
+      {
+        // dst is a 4-element vector. We allocate all 4 registers here.
+        uint32_t elemNum;
+        (void)getVectorInfo(ctx, I.getType(), &I, elemNum);
+        GBE_ASSERT(elemNum == 4);
+        this->newRegister(&I);
+        break;
+      }
+      case GEN_OCL_MUL_HI_INT:
+      case GEN_OCL_MUL_HI_UINT:
+      case GEN_OCL_MUL_HI_I64:
+      case GEN_OCL_MUL_HI_UI64:
+      case GEN_OCL_UPSAMPLE_SHORT:
+      case GEN_OCL_UPSAMPLE_INT:
+      case GEN_OCL_UPSAMPLE_LONG:
+      case GEN_OCL_MAD:
+      case GEN_OCL_FMAX:
+      case GEN_OCL_FMIN:
+      case GEN_OCL_SADD_SAT_CHAR:
+      case GEN_OCL_SADD_SAT_SHORT:
+      case GEN_OCL_SADD_SAT_INT:
+      case GEN_OCL_SADD_SAT_LONG:
+      case GEN_OCL_UADD_SAT_CHAR:
+      case GEN_OCL_UADD_SAT_SHORT:
+      case GEN_OCL_UADD_SAT_INT:
+      case GEN_OCL_UADD_SAT_LONG:
+      case GEN_OCL_SSUB_SAT_CHAR:
+      case GEN_OCL_SSUB_SAT_SHORT:
+      case GEN_OCL_SSUB_SAT_INT:
+      case GEN_OCL_SSUB_SAT_LONG:
+      case GEN_OCL_USUB_SAT_CHAR:
+      case GEN_OCL_USUB_SAT_SHORT:
+      case GEN_OCL_USUB_SAT_INT:
+      case GEN_OCL_USUB_SAT_LONG:
+      case GEN_OCL_HADD:
+      case GEN_OCL_RHADD:
+      case GEN_OCL_I64HADD:
+      case GEN_OCL_I64RHADD:
+      case GEN_OCL_I64_MAD_SAT:
+      case GEN_OCL_I64_MAD_SATU:
+      case GEN_OCL_SAT_CONV_U8_TO_I8:
+      case GEN_OCL_SAT_CONV_I16_TO_I8:
+      case GEN_OCL_SAT_CONV_U16_TO_I8:
+      case GEN_OCL_SAT_CONV_I32_TO_I8:
+      case GEN_OCL_SAT_CONV_U32_TO_I8:
+      case GEN_OCL_SAT_CONV_F32_TO_I8:
+      case GEN_OCL_SAT_CONV_I8_TO_U8:
+      case GEN_OCL_SAT_CONV_I16_TO_U8:
+      case GEN_OCL_SAT_CONV_U16_TO_U8:
+      case GEN_OCL_SAT_CONV_I32_TO_U8:
+      case GEN_OCL_SAT_CONV_U32_TO_U8:
+      case GEN_OCL_SAT_CONV_F32_TO_U8:
+      case GEN_OCL_SAT_CONV_U16_TO_I16:
+      case GEN_OCL_SAT_CONV_I32_TO_I16:
+      case GEN_OCL_SAT_CONV_U32_TO_I16:
+      case GEN_OCL_SAT_CONV_F32_TO_I16:
+      case GEN_OCL_SAT_CONV_I16_TO_U16:
+      case GEN_OCL_SAT_CONV_I32_TO_U16:
+      case GEN_OCL_SAT_CONV_U32_TO_U16:
+      case GEN_OCL_SAT_CONV_F32_TO_U16:
+      case GEN_OCL_SAT_CONV_U32_TO_I32:
+      case GEN_OCL_SAT_CONV_F32_TO_I32:
+      case GEN_OCL_SAT_CONV_I32_TO_U32:
+      case GEN_OCL_SAT_CONV_F32_TO_U32:
+      case GEN_OCL_CONV_F16_TO_F32:
+      case GEN_OCL_CONV_F32_TO_F16:
+      case GEN_OCL_SIMD_ANY:
+      case GEN_OCL_SIMD_ALL:
+        this->newRegister(&I);
+        break;
+      case GEN_OCL_PRINTF:
+        break;
+      default:
+        GBE_ASSERTM(false, "Function calls are not supported yet");
+    };
+  }
+
+  void GenWriter::emitUnaryCallInst(CallInst &I, CallSite &CS, ir::Opcode opcode) {
+    CallSite::arg_iterator AI = CS.arg_begin();
+#if GBE_DEBUG
+    CallSite::arg_iterator AE = CS.arg_end();
+#endif /* GBE_DEBUG */
+    GBE_ASSERT(AI != AE);
+    const ir::Register src = this->getRegister(*AI);
+    const ir::Register dst = this->getRegister(&I);
+    ctx.ALU1(opcode, ir::TYPE_FLOAT, dst, src);
+  }
+
+  void GenWriter::emitAtomicInst(CallInst &I, CallSite &CS, ir::AtomicOps opcode) {
+    CallSite::arg_iterator AI = CS.arg_begin();
+    CallSite::arg_iterator AE = CS.arg_end();
+    GBE_ASSERT(AI != AE);
+
+    unsigned int llvmSpace = (*AI)->getType()->getPointerAddressSpace();
+    const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
+    const ir::Register dst = this->getRegister(&I);
+
+    ir::BTI bti;
+    gatherBTI(*AI, bti);
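+    // Gather every call argument into the source tuple: src[0] is the address,
+    // the remaining entries (if any) are the atomic operands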
+    vector<ir::Register> src;
+    uint32_t srcNum = 0;
+    while(AI != AE) {
+      src.push_back(this->getRegister(*(AI++)));
+      srcNum++;
+    }
+    const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], srcNum);
+    ctx.ATOMIC(opcode, dst, addrSpace, bti, srcTuple);
+  }
+
+  /* append a new sampler. should be called before any reference to
+   * a sampler_t value. */
+  uint8_t GenWriter::appendSampler(CallSite::arg_iterator AI) {
+    Constant *CPV = dyn_cast<Constant>(*AI);
+    uint8_t index;
+    if (CPV != NULL)
+    {
+      // This is not a kernel-argument sampler; we need to append it to the
+      // sampler set and allocate a sampler slot for it.
+      const ir::Immediate &x = processConstantImm(CPV);
+      GBE_ASSERTM(x.getType() == ir::TYPE_U16 || x.getType() == ir::TYPE_S16, "Invalid sampler type");
+
+      index = ctx.getFunction().getSamplerSet()->append(x.getIntegerValue(), &ctx);
+    } else {
+      const ir::Register samplerReg = this->getRegister(*AI);
+      index = ctx.getFunction().getSamplerSet()->append(samplerReg, &ctx);
+    }
+    return index;
+  }
+
+  void GenWriter::emitCallInst(CallInst &I) {
+    if (Function *F = I.getCalledFunction()) {
+      if (F->getIntrinsicID() != 0) {
+        const ir::Function &fn = ctx.getFunction();
+        switch (F->getIntrinsicID()) {
+          case Intrinsic::stacksave:
+          {
+            const ir::Register dst = this->getRegister(&I);
+            const ir::Register src = ir::ocl::stackptr;
+            const ir::RegisterFamily family = fn.getRegisterFamily(dst);
+            ctx.MOV(ir::getType(family), dst, src);
+          }
+          break;
+          case Intrinsic::stackrestore:
+          {
+            const ir::Register dst = ir::ocl::stackptr;
+            const ir::Register src = this->getRegister(I.getOperand(0));
+            const ir::RegisterFamily family = fn.getRegisterFamily(dst);
+            ctx.MOV(ir::getType(family), dst, src);
+          }
+          break;
+#if LLVM_VERSION_MINOR >= 2
+          case Intrinsic::fmuladd:
+          {
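+            // @llvm.fmuladd is lowered to a separate MUL and ADD through a
+            // temporary register; no fused multiply-add is emitted here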
+            const ir::Register tmp  = ctx.reg(ir::FAMILY_DWORD);
+            const ir::Register dst  = this->getRegister(&I);
+            const ir::Register src0 = this->getRegister(I.getOperand(0));
+            const ir::Register src1 = this->getRegister(I.getOperand(1));
+            const ir::Register src2 = this->getRegister(I.getOperand(2));
+            ctx.MUL(ir::TYPE_FLOAT, tmp, src0, src1);
+            ctx.ADD(ir::TYPE_FLOAT, dst, tmp, src2);
+            break;
+          }
+          break;
+          case Intrinsic::lifetime_start:
+          case Intrinsic::lifetime_end:
+          break;
+#endif /* LLVM_VERSION_MINOR >= 2 */
+          case Intrinsic::debugtrap:
+          case Intrinsic::dbg_value:
+          case Intrinsic::dbg_declare:
+          break;
+          default: NOT_IMPLEMENTED;
+        }
+      } else {
+        int image_dim;
+        // Get the name of the called function and handle it
+        Value *Callee = I.getCalledValue();
+        const std::string fnName = Callee->getName();
+        auto it = instrinsicMap.map.find(fnName);
+        GBE_ASSERT(it != instrinsicMap.map.end());
+
+        // Get the function arguments
+        CallSite CS(&I);
+        CallSite::arg_iterator AI = CS.arg_begin();
+#if GBE_DEBUG
+        CallSite::arg_iterator AE = CS.arg_end();
+#endif /* GBE_DEBUG */
+
+        switch (it->second) {
+          case GEN_OCL_POW:
+          {
+            const ir::Register src0 = this->getRegister(*AI); ++AI;
+            const ir::Register src1 = this->getRegister(*AI);
+            const ir::Register dst = this->getRegister(&I);
+            ctx.POW(ir::TYPE_FLOAT, dst, src0, src1);
+            break;
+          }
+          case GEN_OCL_FBH: this->emitUnaryCallInst(I,CS,ir::OP_FBH); break;
+          case GEN_OCL_FBL: this->emitUnaryCallInst(I,CS,ir::OP_FBL); break;
+          case GEN_OCL_ABS:
+          {
+            const ir::Register src = this->getRegister(*AI);
+            const ir::Register dst = this->getRegister(&I);
+            ctx.ALU1(ir::OP_ABS, ir::TYPE_S32, dst, src);
+            break;
+          }
+          case GEN_OCL_SIMD_ALL:
+          {
+            const ir::Register src = this->getRegister(*AI);
+            const ir::Register dst = this->getRegister(&I);
+            ctx.ALU1(ir::OP_SIMD_ALL, ir::TYPE_S16, dst, src);
+            break;
+          }
+          case GEN_OCL_SIMD_ANY:
+          {
+            const ir::Register src = this->getRegister(*AI);
+            const ir::Register dst = this->getRegister(&I);
+            ctx.ALU1(ir::OP_SIMD_ANY, ir::TYPE_S16, dst, src);
+            break;
+          }
+          case GEN_OCL_COS: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
+          case GEN_OCL_SIN: this->emitUnaryCallInst(I,CS,ir::OP_SIN); break;
+          case GEN_OCL_LOG: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break;
+          case GEN_OCL_EXP: this->emitUnaryCallInst(I,CS,ir::OP_EXP); break;
+          case GEN_OCL_SQR: this->emitUnaryCallInst(I,CS,ir::OP_SQR); break;
+          case GEN_OCL_RSQ: this->emitUnaryCallInst(I,CS,ir::OP_RSQ); break;
+          case GEN_OCL_RCP: this->emitUnaryCallInst(I,CS,ir::OP_RCP); break;
+          case GEN_OCL_FABS: this->emitUnaryCallInst(I,CS,ir::OP_ABS); break;
+          case GEN_OCL_RNDZ: this->emitUnaryCallInst(I,CS,ir::OP_RNDZ); break;
+          case GEN_OCL_RNDE: this->emitUnaryCallInst(I,CS,ir::OP_RNDE); break;
+          case GEN_OCL_RNDU: this->emitUnaryCallInst(I,CS,ir::OP_RNDU); break;
+          case GEN_OCL_RNDD: this->emitUnaryCallInst(I,CS,ir::OP_RNDD); break;
+          case GEN_OCL_FORCE_SIMD8: ctx.setSimdWidth(8); break;
+          case GEN_OCL_FORCE_SIMD16: ctx.setSimdWidth(16); break;
+          case GEN_OCL_LBARRIER: ctx.SYNC(ir::syncLocalBarrier); break;
+          case GEN_OCL_GBARRIER: ctx.SYNC(ir::syncGlobalBarrier); break;
+          case GEN_OCL_LGBARRIER: ctx.SYNC(ir::syncLocalBarrier | ir::syncGlobalBarrier); break;
+          case GEN_OCL_ATOMIC_ADD0:
+          case GEN_OCL_ATOMIC_ADD1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_ADD); break;
+          case GEN_OCL_ATOMIC_SUB0:
+          case GEN_OCL_ATOMIC_SUB1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_SUB); break;
+          case GEN_OCL_ATOMIC_AND0:
+          case GEN_OCL_ATOMIC_AND1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_AND); break;
+          case GEN_OCL_ATOMIC_OR0:
+          case GEN_OCL_ATOMIC_OR1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_OR); break;
+          case GEN_OCL_ATOMIC_XOR0:
+          case GEN_OCL_ATOMIC_XOR1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_XOR); break;
+          case GEN_OCL_ATOMIC_XCHG0:
+          case GEN_OCL_ATOMIC_XCHG1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_XCHG); break;
+          case GEN_OCL_ATOMIC_INC0:
+          case GEN_OCL_ATOMIC_INC1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_INC); break;
+          case GEN_OCL_ATOMIC_DEC0:
+          case GEN_OCL_ATOMIC_DEC1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_DEC); break;
+          case GEN_OCL_ATOMIC_UMIN0:
+          case GEN_OCL_ATOMIC_UMIN1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_UMIN); break;
+          case GEN_OCL_ATOMIC_UMAX0:
+          case GEN_OCL_ATOMIC_UMAX1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_UMAX); break;
+          case GEN_OCL_ATOMIC_IMIN0:
+          case GEN_OCL_ATOMIC_IMIN1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_IMIN); break;
+          case GEN_OCL_ATOMIC_IMAX0:
+          case GEN_OCL_ATOMIC_IMAX1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_IMAX); break;
+          case GEN_OCL_ATOMIC_CMPXCHG0:
+          case GEN_OCL_ATOMIC_CMPXCHG1: this->emitAtomicInst(I,CS,ir::ATOMIC_OP_CMPXCHG); break;
+          case GEN_OCL_GET_IMAGE_WIDTH:
+          case GEN_OCL_GET_IMAGE_HEIGHT:
+          case GEN_OCL_GET_IMAGE_DEPTH:
+          case GEN_OCL_GET_IMAGE_CHANNEL_DATA_TYPE:
+          case GEN_OCL_GET_IMAGE_CHANNEL_ORDER:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register surfaceReg = this->getRegister(*AI); ++AI;
+            const ir::Register reg = this->getRegister(&I, 0);
+            int infoType = it->second - GEN_OCL_GET_IMAGE_WIDTH;
+            const uint8_t surfaceID = ctx.getFunction().getImageSet()->getIdx(surfaceReg);
+            ir::ImageInfoKey key(surfaceID, infoType);
+            const ir::Register infoReg = ctx.getFunction().getImageSet()->appendInfo(key, &ctx);
+            ctx.GET_IMAGE_INFO(infoType, reg, surfaceID, infoReg);
+            break;
+          }
+
+          case GEN_OCL_READ_IMAGE_I_1D:
+          case GEN_OCL_READ_IMAGE_UI_1D:
+          case GEN_OCL_READ_IMAGE_F_1D:
+          case GEN_OCL_READ_IMAGE_I_1D_I:
+          case GEN_OCL_READ_IMAGE_UI_1D_I:
+          case GEN_OCL_READ_IMAGE_F_1D_I:
+            image_dim = 1;
+            goto handle_read_image;
+          case GEN_OCL_READ_IMAGE_I_2D:
+          case GEN_OCL_READ_IMAGE_UI_2D:
+          case GEN_OCL_READ_IMAGE_F_2D:
+          case GEN_OCL_READ_IMAGE_I_2D_I:
+          case GEN_OCL_READ_IMAGE_UI_2D_I:
+          case GEN_OCL_READ_IMAGE_F_2D_I:
+            image_dim = 2;
+            goto handle_read_image;
+          case GEN_OCL_READ_IMAGE_I_3D:
+          case GEN_OCL_READ_IMAGE_UI_3D:
+          case GEN_OCL_READ_IMAGE_F_3D:
+          case GEN_OCL_READ_IMAGE_I_3D_I:
+          case GEN_OCL_READ_IMAGE_UI_3D_I:
+          case GEN_OCL_READ_IMAGE_F_3D_I:
+            image_dim = 3;
+handle_read_image:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register surfaceReg = this->getRegister(*AI); ++AI;
+            const uint8_t surfaceID = ctx.getFunction().getImageSet()->getIdx(surfaceReg);
+            GBE_ASSERT(AI != AE);
+            const uint8_t sampler = this->appendSampler(AI);
+            ++AI;
+
+            ir::Register ucoord;
+            ir::Register vcoord;
+            ir::Register wcoord;
+
+            GBE_ASSERT(AI != AE); ucoord = this->getRegister(*AI); ++AI;
+            if (image_dim > 1) {
+              GBE_ASSERT(AI != AE);
+              vcoord = this->getRegister(*AI);
+              ++AI;
+            } else {
+              vcoord = ir::ocl::invalid;
+            }
+
+            if (image_dim > 2) {
+              GBE_ASSERT(AI != AE);
+              wcoord = this->getRegister(*AI);
+              ++AI;
+            } else {
+              wcoord = ir::ocl::invalid;
+            }
+
+            vector<ir::Register> dstTupleData, srcTupleData;
+            const uint32_t elemNum = 4;
+            for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
+              const ir::Register reg = this->getRegister(&I, elemID);
+              dstTupleData.push_back(reg);
+            }
+            srcTupleData.push_back(ucoord);
+            srcTupleData.push_back(vcoord);
+            srcTupleData.push_back(wcoord);
+            uint8_t samplerOffset = 0;
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+            GBE_ASSERT(AI != AE); Constant *CPV = dyn_cast<Constant>(*AI);
+            assert(CPV);
+            const ir::Immediate &x = processConstantImm(CPV);
+            GBE_ASSERTM(x.getType() == ir::TYPE_U32 || x.getType() == ir::TYPE_S32, "Invalid sampler type");
+            samplerOffset = x.getIntegerValue();
+#endif
+            const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], elemNum);
+            const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 3);
+
+            ir::Type dstType = ir::TYPE_U32;
+
+            switch(it->second) {
+              case GEN_OCL_READ_IMAGE_I_1D:
+              case GEN_OCL_READ_IMAGE_UI_1D:
+              case GEN_OCL_READ_IMAGE_I_2D:
+              case GEN_OCL_READ_IMAGE_UI_2D:
+              case GEN_OCL_READ_IMAGE_I_3D:
+              case GEN_OCL_READ_IMAGE_UI_3D:
+              case GEN_OCL_READ_IMAGE_I_1D_I:
+              case GEN_OCL_READ_IMAGE_UI_1D_I:
+              case GEN_OCL_READ_IMAGE_I_2D_I:
+              case GEN_OCL_READ_IMAGE_UI_2D_I:
+              case GEN_OCL_READ_IMAGE_I_3D_I:
+              case GEN_OCL_READ_IMAGE_UI_3D_I:
+                dstType = ir::TYPE_U32;
+                break;
+              case GEN_OCL_READ_IMAGE_F_1D:
+              case GEN_OCL_READ_IMAGE_F_2D:
+              case GEN_OCL_READ_IMAGE_F_3D:
+              case GEN_OCL_READ_IMAGE_F_1D_I:
+              case GEN_OCL_READ_IMAGE_F_2D_I:
+              case GEN_OCL_READ_IMAGE_F_3D_I:
+                dstType = ir::TYPE_FLOAT;
+                break;
+              default:
+                GBE_ASSERT(0); // should never get here.
+            }
+
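+            // The integer-coordinate (_I) read_image variants are declared
+            // after GEN_OCL_READ_IMAGE_F_3D, so this comparison is enough to
+            // tell float coordinates from integer ones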
+            bool isFloatCoord = it->second <= GEN_OCL_READ_IMAGE_F_3D;
+
+            ctx.SAMPLE(surfaceID, dstTuple, srcTuple, dstType == ir::TYPE_FLOAT,
+                       isFloatCoord, sampler, samplerOffset);
+            break;
+          }
+
+          case GEN_OCL_WRITE_IMAGE_I_1D:
+          case GEN_OCL_WRITE_IMAGE_UI_1D:
+          case GEN_OCL_WRITE_IMAGE_F_1D:
+            image_dim = 1;
+            goto handle_write_image;
+          case GEN_OCL_WRITE_IMAGE_I_2D:
+          case GEN_OCL_WRITE_IMAGE_UI_2D:
+          case GEN_OCL_WRITE_IMAGE_F_2D:
+            image_dim = 2;
+            goto handle_write_image;
+          case GEN_OCL_WRITE_IMAGE_I_3D:
+          case GEN_OCL_WRITE_IMAGE_UI_3D:
+          case GEN_OCL_WRITE_IMAGE_F_3D:
+            image_dim = 3;
+handle_write_image:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register surfaceReg = this->getRegister(*AI); ++AI;
+            const uint8_t surfaceID = ctx.getFunction().getImageSet()->getIdx(surfaceReg);
+            ir::Register ucoord, vcoord, wcoord;
+
+            GBE_ASSERT(AI != AE); ucoord = this->getRegister(*AI); ++AI;
+
+            if (image_dim > 1) {
+              GBE_ASSERT(AI != AE);
+              vcoord = this->getRegister(*AI);
+              ++AI;
+            } else
+              vcoord = ir::ocl::invalid;
+
+            if (image_dim > 2) {
+              GBE_ASSERT(AI != AE);
+              wcoord = this->getRegister(*AI);
+              ++AI;
+            } else {
+              wcoord = ir::ocl::invalid;
+            }
+
+            GBE_ASSERT(AI != AE);
+            vector<ir::Register> srcTupleData;
+
+            srcTupleData.push_back(ucoord);
+            srcTupleData.push_back(vcoord);
+            srcTupleData.push_back(wcoord);
+
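+            // srcTupleData now holds the three coordinates; the loop below
+            // appends the four color components, giving the 7-entry tuple
+            // passed to TYPED_WRITE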
+            const uint32_t elemNum = 4;
+            for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
+              const ir::Register reg = this->getRegister(*AI, elemID);
+              srcTupleData.push_back(reg);
+            }
+            const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], 7);
+
+            ir::Type srcType = ir::TYPE_U32;
+
+            switch(it->second) {
+              case GEN_OCL_WRITE_IMAGE_I_1D:
+              case GEN_OCL_WRITE_IMAGE_UI_1D:
+              case GEN_OCL_WRITE_IMAGE_I_2D:
+              case GEN_OCL_WRITE_IMAGE_UI_2D:
+              case GEN_OCL_WRITE_IMAGE_I_3D:
+              case GEN_OCL_WRITE_IMAGE_UI_3D:
+                srcType = ir::TYPE_U32;
+                break;
+              case GEN_OCL_WRITE_IMAGE_F_1D:
+              case GEN_OCL_WRITE_IMAGE_F_2D:
+              case GEN_OCL_WRITE_IMAGE_F_3D:
+                srcType = ir::TYPE_FLOAT;
+                break;
+              default:
+                GBE_ASSERT(0); // should never get here.
+            }
+
+            ctx.TYPED_WRITE(surfaceID, srcTuple, srcType, ir::TYPE_U32);
+            break;
+          }
+          case GEN_OCL_MUL_HI_INT:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.MUL_HI(getType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
+          case GEN_OCL_MUL_HI_UINT:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.MUL_HI(getUnsignedType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
+          case GEN_OCL_MUL_HI_I64:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.I64_MUL_HI(getType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
+          case GEN_OCL_MUL_HI_UI64:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.I64_MUL_HI(getUnsignedType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
+          case GEN_OCL_UPSAMPLE_SHORT:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.UPSAMPLE_SHORT(getType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
+          case GEN_OCL_UPSAMPLE_INT:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.UPSAMPLE_INT(getType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
+          case GEN_OCL_UPSAMPLE_LONG:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.UPSAMPLE_LONG(getType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
+          case GEN_OCL_SADD_SAT_CHAR:
+          case GEN_OCL_SADD_SAT_SHORT:
+          case GEN_OCL_SADD_SAT_INT:
+          case GEN_OCL_SADD_SAT_LONG:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.ADDSAT(getType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
+          case GEN_OCL_UADD_SAT_CHAR:
+          case GEN_OCL_UADD_SAT_SHORT:
+          case GEN_OCL_UADD_SAT_INT:
+          case GEN_OCL_UADD_SAT_LONG:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.ADDSAT(getUnsignedType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
+          case GEN_OCL_SSUB_SAT_CHAR:
+          case GEN_OCL_SSUB_SAT_SHORT:
+          case GEN_OCL_SSUB_SAT_INT:
+          case GEN_OCL_SSUB_SAT_LONG:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.SUBSAT(getType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
+          case GEN_OCL_USUB_SAT_CHAR:
+          case GEN_OCL_USUB_SAT_SHORT:
+          case GEN_OCL_USUB_SAT_INT:
+          case GEN_OCL_USUB_SAT_LONG:
+          {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.SUBSAT(getUnsignedType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
+          case GEN_OCL_I64_MAD_SAT:
+           {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src2 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.I64MADSAT(getType(ctx, I.getType()), dst, src0, src1, src2);
+            break;
+           }
+          case GEN_OCL_I64_MAD_SATU:
+           {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src2 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.I64MADSAT(getUnsignedType(ctx, I.getType()), dst, src0, src1, src2);
+            break;
+           }
+          case GEN_OCL_MAD: {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src2 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.MAD(getType(ctx, I.getType()), dst, src0, src1, src2);
+            break;
+          }
+          case GEN_OCL_FMAX:
+          case GEN_OCL_FMIN:{
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            const ir::Register cmp = ctx.reg(ir::FAMILY_BOOL);
+            // Because the CMP sources are the same as the SEL sources, the CMP and
+            // SEL instructions will be merged into one SEL_CMP instruction during
+            // Gen instruction selection. Emit two instructions here for simplicity.
+            if(it->second == GEN_OCL_FMAX)
+              ctx.GE(getType(ctx, I.getType()), cmp, src0, src1);
+            else
+              ctx.LT(getType(ctx, I.getType()), cmp, src0, src1);
+            ctx.SEL(getType(ctx, I.getType()), dst, cmp, src0, src1);
+            break;
+          }
+          case GEN_OCL_HADD: {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.HADD(getUnsignedType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
+          case GEN_OCL_I64HADD:
+           {
+            GBE_ASSERT(AI != AE);
+            const ir::Register src0 = this->getRegister(*(AI++));
+            GBE_ASSERT(AI != AE);
+            const ir::Register src1 = this->getRegister(*(AI++));
+            const ir::Register dst = this->getRegister(&I);
+            ctx.I64HADD(ir::TYPE_U64, dst, src0, src1);
+            break;
+           }
+          case GEN_OCL_RHADD: {
+            GBE_ASSERT(AI != AE); const ir::Register src0 = this->getRegister(*AI); ++AI;
+            GBE_ASSERT(AI != AE); const ir::Register src1 = this->getRegister(*AI); ++AI;
+            const ir::Register dst = this->getRegister(&I);
+            ctx.RHADD(getUnsignedType(ctx, I.getType()), dst, src0, src1);
+            break;
+          }
+          case GEN_OCL_I64RHADD:
+           {
+            GBE_ASSERT(AI != AE);
+            const ir::Register src0 = this->getRegister(*(AI++));
+            GBE_ASSERT(AI != AE);
+            const ir::Register src1 = this->getRegister(*(AI++));
+            const ir::Register dst = this->getRegister(&I);
+            ctx.I64RHADD(ir::TYPE_U64, dst, src0, src1);
+            break;
+           }
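+// DEF emits a single saturated conversion (SAT_CVT) of the call's first
+// argument into its result register for the given destination/source type pair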
+#define DEF(DST_TYPE, SRC_TYPE) \
+  { ctx.SAT_CVT(DST_TYPE, SRC_TYPE, getRegister(&I), getRegister(I.getOperand(0))); break; }
+          case GEN_OCL_SAT_CONV_U8_TO_I8:
+            DEF(ir::TYPE_S8, ir::TYPE_U8);
+          case GEN_OCL_SAT_CONV_I16_TO_I8:
+            DEF(ir::TYPE_S8, ir::TYPE_S16);
+          case GEN_OCL_SAT_CONV_U16_TO_I8:
+            DEF(ir::TYPE_S8, ir::TYPE_U16);
+          case GEN_OCL_SAT_CONV_I32_TO_I8:
+            DEF(ir::TYPE_S8, ir::TYPE_S32);
+          case GEN_OCL_SAT_CONV_U32_TO_I8:
+            DEF(ir::TYPE_S8, ir::TYPE_U32);
+          case GEN_OCL_SAT_CONV_F32_TO_I8:
+            DEF(ir::TYPE_S8, ir::TYPE_FLOAT);
+          case GEN_OCL_SAT_CONV_I8_TO_U8:
+            DEF(ir::TYPE_U8, ir::TYPE_S8);
+          case GEN_OCL_SAT_CONV_I16_TO_U8:
+            DEF(ir::TYPE_U8, ir::TYPE_S16);
+          case GEN_OCL_SAT_CONV_U16_TO_U8:
+            DEF(ir::TYPE_U8, ir::TYPE_U16);
+          case GEN_OCL_SAT_CONV_I32_TO_U8:
+            DEF(ir::TYPE_U8, ir::TYPE_S32);
+          case GEN_OCL_SAT_CONV_U32_TO_U8:
+            DEF(ir::TYPE_U8, ir::TYPE_U32);
+          case GEN_OCL_SAT_CONV_F32_TO_U8:
+            DEF(ir::TYPE_U8, ir::TYPE_FLOAT);
+          case GEN_OCL_SAT_CONV_U16_TO_I16:
+            DEF(ir::TYPE_S16, ir::TYPE_U16);
+          case GEN_OCL_SAT_CONV_I32_TO_I16:
+            DEF(ir::TYPE_S16, ir::TYPE_S32);
+          case GEN_OCL_SAT_CONV_U32_TO_I16:
+            DEF(ir::TYPE_S16, ir::TYPE_U32);
+          case GEN_OCL_SAT_CONV_F32_TO_I16:
+            DEF(ir::TYPE_S16, ir::TYPE_FLOAT);
+          case GEN_OCL_SAT_CONV_I16_TO_U16:
+            DEF(ir::TYPE_U16, ir::TYPE_S16);
+          case GEN_OCL_SAT_CONV_I32_TO_U16:
+            DEF(ir::TYPE_U16, ir::TYPE_S32);
+          case GEN_OCL_SAT_CONV_U32_TO_U16:
+            DEF(ir::TYPE_U16, ir::TYPE_U32);
+          case GEN_OCL_SAT_CONV_F32_TO_U16:
+            DEF(ir::TYPE_U16, ir::TYPE_FLOAT);
+          case GEN_OCL_SAT_CONV_U32_TO_I32:
+            DEF(ir::TYPE_S32, ir::TYPE_U32);
+          case GEN_OCL_SAT_CONV_F32_TO_I32:
+            DEF(ir::TYPE_S32, ir::TYPE_FLOAT);
+          case GEN_OCL_SAT_CONV_I32_TO_U32:
+            DEF(ir::TYPE_U32, ir::TYPE_S32);
+          case GEN_OCL_SAT_CONV_F32_TO_U32:
+            DEF(ir::TYPE_U32, ir::TYPE_FLOAT);
+          case GEN_OCL_CONV_F16_TO_F32:
+            ctx.F16TO32(ir::TYPE_FLOAT, ir::TYPE_U16, getRegister(&I), getRegister(I.getOperand(0)));
+            break;
+          case GEN_OCL_CONV_F32_TO_F16:
+            ctx.F32TO16(ir::TYPE_U16, ir::TYPE_FLOAT, getRegister(&I), getRegister(I.getOperand(0)));
+            break;
+#undef DEF
+
+          case GEN_OCL_PRINTF:
+          {
+            ir::PrintfSet::PrintfFmt* fmt = (ir::PrintfSet::PrintfFmt*)getPrintfInfo(&I);
+            assert(fmt);
+            ctx.getFunction().getPrintfSet()->append(fmt, unit);
+            break;
+          }
+          case GEN_OCL_PRINTF_BUF_ADDR:
+          case GEN_OCL_PRINTF_INDEX_BUF_ADDR:
+          default: break;
+        }
+      }
+    }
+  }
+
+  void GenWriter::regAllocateAllocaInst(AllocaInst &I) {
+    this->newRegister(&I);
+  }
+  void GenWriter::emitAllocaInst(AllocaInst &I) {
+    Value *src = I.getOperand(0);
+    Type *elemType = I.getType()->getElementType();
+    ir::ImmediateIndex immIndex;
+    uint32_t elementSize = getTypeByteSize(unit, elemType);
+
+    // Be aware, we manipulate pointers
+    if (ctx.getPointerSize() == ir::POINTER_32_BITS)
+      immIndex = ctx.newImmediate(uint32_t(elementSize));
+    else
+      immIndex = ctx.newImmediate(uint64_t(elementSize));
+
+    // Check whether the size we need to allocate is known at compile time
+    if (I.isArrayAllocation() == true) {
+      Constant *CPV = dyn_cast<Constant>(src);
+      GBE_ASSERT(CPV);
+      const ir::Immediate &imm = processConstantImm(CPV);
+      const uint64_t elemNum = imm.getIntegerValue();
+      elementSize *= elemNum;
+      if (ctx.getPointerSize() == ir::POINTER_32_BITS)
+        immIndex = ctx.newImmediate(uint32_t(ALIGN(elementSize, 4)));
+      else
+        immIndex = ctx.newImmediate(uint64_t(ALIGN(elementSize, 4)));
+    }
+
+    // Now emit the stream of instructions to get the allocated pointer
+    const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+    const ir::Register dst = this->getRegister(&I);
+    const ir::Register stack = ir::ocl::stackptr;
+    const ir::Register reg = ctx.reg(pointerFamily);
+    const ir::Immediate imm = ctx.getImmediate(immIndex);
+    uint32_t align = getAlignmentByte(unit, elemType);
+    // The code below assumes align is a power of 2
+    GBE_ASSERT(align && (align & (align-1)) == 0);
+
+    // align the stack pointer according to data alignment
+    if(align > 1) {
+      uint32_t prevStackPtr = ctx.getFunction().getStackSize();
+      uint32_t step = ((prevStackPtr + (align - 1)) & ~(align - 1)) - prevStackPtr;
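+      // For example, with a current stack size of 6 bytes and align == 8,
+      // step == 2, so the stack pointer is bumped by 2 bytes before the
+      // allocation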
+      if (step != 0) {
+        ir::ImmediateIndex stepImm = ctx.newIntegerImmediate(step, ir::TYPE_U32);
+        ir::Register stepReg = ctx.reg(ctx.getPointerFamily());
+        ctx.LOADI(ir::TYPE_S32, stepReg, stepImm);
+        ctx.ADD(ir::TYPE_U32, stack, stack, stepReg);
+        ctx.getFunction().pushStackSize(step);
+      }
+    }
+    // Set the destination register properly
+    ctx.MOV(imm.getType(), dst, stack);
+
+    ctx.LOADI(imm.getType(), reg, immIndex);
+    ctx.ADD(imm.getType(), stack, stack, reg);
+    ctx.getFunction().pushStackSize(elementSize);
+  }
+
+  static INLINE Value *getLoadOrStoreValue(LoadInst &I) {
+    return &I;
+  }
+  static INLINE Value *getLoadOrStoreValue(StoreInst &I) {
+    return I.getValueOperand();
+  }
+  void GenWriter::regAllocateLoadInst(LoadInst &I) {
+    this->newRegister(&I);
+  }
+  void GenWriter::regAllocateStoreInst(StoreInst &I) {}
+
+  void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
+                                      Value *llvmValues, const ir::Register ptr,
+                                      const ir::AddressSpace addrSpace,
+                                      Type * elemType, bool isLoad, ir::BTI bti) {
+    const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+    uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
+    uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
+    const uint32_t perMsgNum = elemNum / msgNum;
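+    // For example, a float8 access: totalSize = 8 * 4 = 32 bytes, so msgNum
+    // is 2 and perMsgNum is 4, i.e. two untyped messages of four dwords each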
+
+    for (uint32_t msg = 0; msg < msgNum; ++msg) {
+      // Build the tuple data in the vector
+      vector<ir::Register> tupleData; // put registers here
+      for (uint32_t elemID = 0; elemID < perMsgNum; ++elemID) {
+        ir::Register reg;
+        if(regTranslator.isUndefConst(llvmValues, elemID)) {
+          Value *v = Constant::getNullValue(elemType);
+          reg = this->getRegister(v);
+        } else
+          reg = this->getRegister(llvmValues, perMsgNum*msg+elemID);
+
+        tupleData.push_back(reg);
+      }
+      const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], perMsgNum);
+
+      // We may need to offset the pointer
+      ir::Register addr;
+      if (msg == 0)
+        addr = ptr;
+      else {
+        const ir::Register offset = ctx.reg(pointerFamily);
+        ir::ImmediateIndex immIndex;
+        ir::Type immType;
+        // Each message can read/write 16 bytes
+        const int32_t stride = 16;
+        if (pointerFamily == ir::FAMILY_DWORD) {
+          immIndex = ctx.newImmediate(int32_t(msg*stride));
+          immType = ir::TYPE_S32;
+        } else {
+          immIndex = ctx.newImmediate(int64_t(msg*stride));
+          immType = ir::TYPE_S64;
+        }
+
+        addr = ctx.reg(pointerFamily);
+        ctx.LOADI(immType, offset, immIndex);
+        ctx.ADD(immType, addr, ptr, offset);
+      }
+
+      // Emit the instruction
+      if (isLoad)
+        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, true, bti);
+      else
+        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, true, bti);
+    }
+  }
+
+  // The idea is to search along the use-def chain and find all possible
+  // sources of the pointer. Then, during later code generation, we can emit
+  // load/store instructions for the BTIs gathered here.
+  void GenWriter::gatherBTI(Value *pointer, ir::BTI &bti) {
+    typedef map<const Value*, int>::iterator GlobalPtrIter;
+    Value *p;
+    size_t idx = 0;
+    int nBTI = 0;
+    std::vector<Value*> candidates;
+    candidates.push_back(pointer);
+    std::set<Value*> processed;
+
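+    // Worklist walk over the use-def chain: e.g. for p = select(c, a, b) where
+    // a is a __global kernel argument and b points to __local memory, both a
+    // and b are visited and the BTI set ends up holding the argument's surface
+    // index plus the fixed __local index 0xfe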
+    while (idx < candidates.size()) {
+      bool isPrivate = false;
+      bool needNewBTI = true;
+      p = candidates[idx];
+
+      while (dyn_cast<User>(p) && !dyn_cast<GlobalVariable>(p)) {
+
+        if (processed.find(p) == processed.end()) {
+          processed.insert(p);
+        } else {
+          // This use-def chain falls into a loop; it does not introduce
+          // a new buffer source.
+          needNewBTI = false;
+          break;
+        }
+
+        if (dyn_cast<SelectInst>(p)) {
+          SelectInst *sel = cast<SelectInst>(p);
+          p = sel->getTrueValue();
+          candidates.push_back(sel->getFalseValue());
+          continue;
+        }
+
+        if (dyn_cast<PHINode>(p)) {
+          PHINode* phi = cast<PHINode>(p);
+          int n = phi->getNumIncomingValues();
+          for (int j = 1; j < n; j++)
+            candidates.push_back(phi->getIncomingValue(j));
+          p = phi->getIncomingValue(0);
+          continue;
+        }
+
+        if (dyn_cast<AllocaInst>(p)) {
+          isPrivate = true;
+          break;
+        }
+        p = cast<User>(p)->getOperand(0);
+      }
+
+      if (needNewBTI == false) {
+        // go to next possible pointer source
+        idx++; continue;
+      }
+
+      uint8_t new_bti = 0;
+      if (isPrivate) {
+        new_bti = BTI_PRIVATE;
+      } else {
+        if(isa<Argument>(p) && dyn_cast<Argument>(p)->hasByValAttr()) {
+          // Structure value support is not complete yet; structures are
+          // treated as push constants, so the load/store here is not
+          // meaningful.
+          bti.bti[0] = BTI_PRIVATE;
+          bti.count = 1;
+          break;
+        }
+        Type *ty = p->getType();
+        if(ty->getPointerAddressSpace() == 3) {
+          // __local memory
+          new_bti = 0xfe;
+        } else {
+          // __global memory
+          GlobalPtrIter iter = globalPointer.find(p);
+          GBE_ASSERT(iter != globalPointer.end());
+          new_bti = iter->second;
+        }
+      }
+      // avoid duplicate
+      bool bFound = false;
+      for (int j = 0; j < nBTI; j++) {
+        if (bti.bti[j] == new_bti) {
+          bFound = true; break;
+        }
+      }
+      if (bFound == false) {
+        bti.bti[nBTI++] = new_bti;
+        bti.count = nBTI;
+      }
+      idx++;
+    }
+    GBE_ASSERT(bti.count <= MAX_MIXED_POINTER);
+  }
+
+  extern int OCL_SIMD_WIDTH;
+  template <bool isLoad, typename T>
+  INLINE void GenWriter::emitLoadOrStore(T &I)
+  {
+    unsigned int llvmSpace = I.getPointerAddressSpace();
+    Value *llvmPtr = I.getPointerOperand();
+    Value *llvmValues = getLoadOrStoreValue(I);
+    Type *llvmType = llvmValues->getType();
+    const bool dwAligned = (I.getAlignment() % 4) == 0;
+    const ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmSpace);
+    const ir::Register ptr = this->getRegister(llvmPtr);
+    ir::BTI binding;
+    if(addrSpace == ir::MEM_GLOBAL || addrSpace == ir::MEM_PRIVATE) {
+      gatherBTI(llvmPtr, binding);
+    }
+    // Scalar is easy. We need not build register tuples
+    if (isScalarType(llvmType) == true) {
+      const ir::Type type = getType(ctx, llvmType);
+      const ir::Register values = this->getRegister(llvmValues);
+      if (isLoad)
+        ctx.LOAD(type, ptr, addrSpace, dwAligned, binding, values);
+      else
+        ctx.STORE(type, ptr, addrSpace, dwAligned, binding, values);
+    }
+    // A vector type requires building a tuple
+    else {
+      VectorType *vectorType = cast<VectorType>(llvmType);
+      Type *elemType = vectorType->getElementType();
+
+      // We follow the OpenCL spec and support only 2, 3, 4, 8 and 16 elements
+      uint32_t elemNum = vectorType->getNumElements();
+      GBE_ASSERTM(elemNum == 2 || elemNum == 3 || elemNum == 4 || elemNum == 8 || elemNum == 16,
+                  "Only vectors of 2,3,4,8 or 16 elements are supported");
+      // Per OpenCL 1.2 spec, section 6.1.5:
+      //   For 3-component vector data types, the size of the data type is 4 * sizeof(component).
+      // LLVM casts 3-element data to a 4-element type for load/store
+      // instructions, so a 4-element vector may only have 3 valid elements.
+      // We correct the element count here.
+      if (elemNum == 4 && regTranslator.isUndefConst(llvmValues, 3))
+          elemNum = 3;
+
+      // The code differs significantly between types (based on the size of
+      // each vector element)
+      const ir::Type type = getType(ctx, elemType);
+      const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
+      const ir::RegisterFamily dataFamily = getFamily(type);
+
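+      // Three strategies: a single untyped message for small dword-family
+      // vectors, batched (split) messages for larger or evenly packed
+      // word/byte vectors, and one scalar access per element otherwise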
+      if(dataFamily == ir::FAMILY_DWORD && addrSpace != ir::MEM_CONSTANT) {
+        // One message is enough here. Nothing special to do
+        if (elemNum <= 4) {
+          // Build the tuple data in the vector
+          vector<ir::Register> tupleData; // put registers here
+          for (uint32_t elemID = 0; elemID < elemNum; ++elemID) {
+            ir::Register reg;
+            if(regTranslator.isUndefConst(llvmValues, elemID)) {
+              Value *v = Constant::getNullValue(elemType);
+              reg = this->getRegister(v);
+            } else
+              reg = this->getRegister(llvmValues, elemID);
+
+            tupleData.push_back(reg);
+          }
+          const ir::Tuple tuple = ctx.arrayTuple(&tupleData[0], elemNum);
+
+          // Emit the instruction
+          if (isLoad)
+            ctx.LOAD(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
+          else
+            ctx.STORE(type, tuple, ptr, addrSpace, elemNum, dwAligned, binding);
+        }
+        // Not supported by the hardware, so we split the message and use
+        // strided loads and stores
+        else {
+          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
+        }
+      }
+      else if((dataFamily==ir::FAMILY_WORD && elemNum%2==0) || (dataFamily == ir::FAMILY_BYTE && elemNum%4 == 0)) {
+          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
+      } else {
+        for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
+          if(regTranslator.isUndefConst(llvmValues, elemID))
+            continue;
+
+          const ir::Register reg = this->getRegister(llvmValues, elemID);
+          ir::Register addr;
+          if (elemID == 0)
+            addr = ptr;
+          else {
+            const ir::Register offset = ctx.reg(pointerFamily);
+            ir::ImmediateIndex immIndex;
+            int elemSize = getTypeByteSize(unit, elemType);
+            immIndex = ctx.newImmediate(int32_t(elemID * elemSize));
+            addr = ctx.reg(pointerFamily);
+            ctx.LOADI(ir::TYPE_S32, offset, immIndex);
+            ctx.ADD(ir::TYPE_S32, addr, ptr, offset);
+          }
+          if (isLoad)
+            ctx.LOAD(type, addr, addrSpace, dwAligned, binding, reg);
+          else
+            ctx.STORE(type, addr, addrSpace, dwAligned, binding, reg);
+        }
+      }
+    }
+  }
+
+  void GenWriter::emitLoadInst(LoadInst &I) {
+    this->emitLoadOrStore<true>(I);
+  }
+
+  void GenWriter::emitStoreInst(StoreInst &I) {
+    this->emitLoadOrStore<false>(I);
+  }
+
+  llvm::FunctionPass *createGenPass(ir::Unit &unit) {
+    return new GenWriter(unit);
+  }
+} /* namespace gbe */
+
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
new file mode 100644
index 0000000..cc5cdad
--- /dev/null
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file llvm_gen_backend.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ * Pass generation functions
+ */
+#ifndef __GBE_LLVM_GEN_BACKEND_HPP__
+#define __GBE_LLVM_GEN_BACKEND_HPP__
+
+#include "llvm/Pass.h"
+#include "sys/platform.hpp"
+#include "sys/map.hpp"
+#include "sys/hash_map.hpp"
+#include <algorithm>
+
+// LLVM Type
+namespace llvm { class Type; }
+
+namespace gbe
+{
+  // Final target of the Gen backend
+  namespace ir { class Unit; }
+
+  /*! All intrinsic Gen functions */
+  enum OCLInstrinsic {
+#define DECL_LLVM_GEN_FUNCTION(ID, NAME) GEN_OCL_##ID,
+#include "llvm_gen_ocl_function.hxx"
+#undef DECL_LLVM_GEN_FUNCTION
+  };
+
+  /*! Build the hash map for OCL functions on Gen */
+  struct OCLIntrinsicMap {
+    /*! Build the intrinsic hash map */
+    OCLIntrinsicMap(void) {
+#define DECL_LLVM_GEN_FUNCTION(ID, NAME) \
+  map.insert(std::make_pair(#NAME, GEN_OCL_##ID));
+#include "llvm_gen_ocl_function.hxx"
+#undef DECL_LLVM_GEN_FUNCTION
+    }
+    /*! Map intrinsic function names to their IDs */
+    hash_map<std::string, OCLInstrinsic> map;
+  };
+
+  /*! The OCL Gen intrinsic function map (built before main at static-initialization time) */
+  static const OCLIntrinsicMap instrinsicMap;
+
+  /*! Pad the offset */
+  uint32_t getPadding(uint32_t offset, uint32_t align);
+
+  /*! Get the type alignment in bytes */
+  uint32_t getAlignmentByte(const ir::Unit &unit, llvm::Type* Ty);
+
+  /*! Get the type size in bits */
+  uint32_t getTypeBitSize(const ir::Unit &unit, llvm::Type* Ty);
+
+  /*! Get the type size in bytes */
+  uint32_t getTypeByteSize(const ir::Unit &unit, llvm::Type* Ty);
+
+  /*! whether this is a kernel function */
+  bool isKernelFunction(const llvm::Function &f);
+
+  /*! Create a Gen-IR unit */
+  llvm::FunctionPass *createGenPass(ir::Unit &unit);
+
+  /*! Remove the GEP instructions */
+  llvm::BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit);
+
+  /*! Merge load/store if possible */
+  llvm::BasicBlockPass *createLoadStoreOptimizationPass();
+
+  /*! Scalarize all vector op instructions */
+  llvm::FunctionPass* createScalarizePass();
+  /*! Remove/add NoDuplicate function attribute for barrier functions. */
+  llvm::ModulePass* createBarrierNodupPass(bool);
+
+  /*! Convert the Intrinsic call to gen function */
+  llvm::BasicBlockPass *createIntrinsicLoweringPass();
+
+  /*! Parse the printf function calls. */
+  llvm::FunctionPass* createPrintfParserPass();
+
+  void* getPrintfInfo(llvm::CallInst* inst);
+} /* namespace gbe */
+
+#endif /* __GBE_LLVM_GEN_BACKEND_HPP__ */
+
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
new file mode 100644
index 0000000..f3ce096
--- /dev/null
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -0,0 +1,196 @@
+DECL_LLVM_GEN_FUNCTION(GET_GROUP_ID0, __gen_ocl_get_group_id0)
+DECL_LLVM_GEN_FUNCTION(GET_GROUP_ID1, __gen_ocl_get_group_id1)
+DECL_LLVM_GEN_FUNCTION(GET_GROUP_ID2, __gen_ocl_get_group_id2)
+DECL_LLVM_GEN_FUNCTION(GET_LOCAL_ID0, __gen_ocl_get_local_id0)
+DECL_LLVM_GEN_FUNCTION(GET_LOCAL_ID1, __gen_ocl_get_local_id1)
+DECL_LLVM_GEN_FUNCTION(GET_LOCAL_ID2, __gen_ocl_get_local_id2)
+DECL_LLVM_GEN_FUNCTION(GET_NUM_GROUPS0, __gen_ocl_get_num_groups0)
+DECL_LLVM_GEN_FUNCTION(GET_NUM_GROUPS1, __gen_ocl_get_num_groups1)
+DECL_LLVM_GEN_FUNCTION(GET_NUM_GROUPS2, __gen_ocl_get_num_groups2)
+DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE0, __gen_ocl_get_local_size0)
+DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE1, __gen_ocl_get_local_size1)
+DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE2, __gen_ocl_get_local_size2)
+DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE0, __gen_ocl_get_global_size0)
+DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE1, __gen_ocl_get_global_size1)
+DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE2, __gen_ocl_get_global_size2)
+DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET0, __gen_ocl_get_global_offset0)
+DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET1, __gen_ocl_get_global_offset1)
+DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET2, __gen_ocl_get_global_offset2)
+DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim)
+
+// Math function
+DECL_LLVM_GEN_FUNCTION(FABS, __gen_ocl_fabs)
+DECL_LLVM_GEN_FUNCTION(COS, __gen_ocl_cos)
+DECL_LLVM_GEN_FUNCTION(SIN, __gen_ocl_sin)
+DECL_LLVM_GEN_FUNCTION(SQR, __gen_ocl_sqrt)
+DECL_LLVM_GEN_FUNCTION(RSQ, __gen_ocl_rsqrt)
+DECL_LLVM_GEN_FUNCTION(LOG, __gen_ocl_log)
+DECL_LLVM_GEN_FUNCTION(EXP, __gen_ocl_exp)
+DECL_LLVM_GEN_FUNCTION(POW, __gen_ocl_pow)
+DECL_LLVM_GEN_FUNCTION(RCP, __gen_ocl_rcp)
+DECL_LLVM_GEN_FUNCTION(RNDZ, __gen_ocl_rndz)
+DECL_LLVM_GEN_FUNCTION(RNDE, __gen_ocl_rnde)
+DECL_LLVM_GEN_FUNCTION(RNDU, __gen_ocl_rndu)
+DECL_LLVM_GEN_FUNCTION(RNDD, __gen_ocl_rndd)
+DECL_LLVM_GEN_FUNCTION(MAD, __gen_ocl_mad)
+DECL_LLVM_GEN_FUNCTION(FMAX, __gen_ocl_fmax)
+DECL_LLVM_GEN_FUNCTION(FMIN, __gen_ocl_fmin)
+
+// Barrier function
+DECL_LLVM_GEN_FUNCTION(LBARRIER, __gen_ocl_barrier_local)
+DECL_LLVM_GEN_FUNCTION(GBARRIER, __gen_ocl_barrier_global)
+DECL_LLVM_GEN_FUNCTION(LGBARRIER, __gen_ocl_barrier_local_and_global)
+
+// To force SIMD8/16 compilation
+DECL_LLVM_GEN_FUNCTION(FORCE_SIMD8,  __gen_ocl_force_simd8)
+DECL_LLVM_GEN_FUNCTION(FORCE_SIMD16, __gen_ocl_force_simd16)
+
+// read_image functions.
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_1D, _Z21__gen_ocl_read_imageijtfj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_1D, _Z22__gen_ocl_read_imageuijtfj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_1D, _Z21__gen_ocl_read_imagefjtfj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_2D, _Z21__gen_ocl_read_imageijtffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_2D, _Z22__gen_ocl_read_imageuijtffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_2D, _Z21__gen_ocl_read_imagefjtffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_3D, _Z21__gen_ocl_read_imageijtfffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_3D, _Z22__gen_ocl_read_imageuijtfffj)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_3D, _Z21__gen_ocl_read_imagefjtfffj)
+// Workaround for read_image with the LD message. The coordinates are of integer type.
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_1D_I, _Z21__gen_ocl_read_imageijtij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_1D_I, _Z22__gen_ocl_read_imageuijtij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_1D_I, _Z21__gen_ocl_read_imagefjtij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_2D_I, _Z21__gen_ocl_read_imageijtiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_2D_I, _Z22__gen_ocl_read_imageuijtiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_2D_I, _Z21__gen_ocl_read_imagefjtiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_3D_I, _Z21__gen_ocl_read_imageijtiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_3D_I, _Z22__gen_ocl_read_imageuijtiiij)
+DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_3D_I, _Z21__gen_ocl_read_imagefjtiiij)
+
+// write_image functions.
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_I_1D, _Z22__gen_ocl_write_imageijiDv4_i)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_UI_1D, _Z23__gen_ocl_write_imageuijiDv4_j)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_F_1D, _Z22__gen_ocl_write_imagefjiDv4_f)
+
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_I_2D, _Z22__gen_ocl_write_imageijiiDv4_i)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_UI_2D, _Z23__gen_ocl_write_imageuijiiDv4_j)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_F_2D, _Z22__gen_ocl_write_imagefjiiDv4_f)
+
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_I_3D, _Z22__gen_ocl_write_imageijiiiDv4_i)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_UI_3D, _Z23__gen_ocl_write_imageuijiiiDv4_j)
+DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_F_3D, _Z22__gen_ocl_write_imagefjiiiDv4_f)
+
+// Image info query functions
+DECL_LLVM_GEN_FUNCTION(GET_IMAGE_WIDTH, __gen_ocl_get_image_width)
+DECL_LLVM_GEN_FUNCTION(GET_IMAGE_HEIGHT, __gen_ocl_get_image_height)
+DECL_LLVM_GEN_FUNCTION(GET_IMAGE_DEPTH,  __gen_ocl_get_image_depth)
+DECL_LLVM_GEN_FUNCTION(GET_IMAGE_CHANNEL_DATA_TYPE,  __gen_ocl_get_image_channel_data_type)
+DECL_LLVM_GEN_FUNCTION(GET_IMAGE_CHANNEL_ORDER,  __gen_ocl_get_image_channel_order)
+
+// atomic related functions.
+DECL_LLVM_GEN_FUNCTION(ATOMIC_ADD0, _Z20__gen_ocl_atomic_addPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_ADD1, _Z20__gen_ocl_atomic_addPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_SUB0, _Z20__gen_ocl_atomic_subPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_SUB1, _Z20__gen_ocl_atomic_subPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_AND0, _Z20__gen_ocl_atomic_andPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_AND1, _Z20__gen_ocl_atomic_andPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_OR0,  _Z19__gen_ocl_atomic_orPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_OR1,  _Z19__gen_ocl_atomic_orPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_XOR0, _Z20__gen_ocl_atomic_xorPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_XOR1, _Z20__gen_ocl_atomic_xorPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_UMIN0, _Z21__gen_ocl_atomic_uminPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_UMIN1, _Z21__gen_ocl_atomic_uminPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_UMAX0, _Z21__gen_ocl_atomic_umaxPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_UMAX1, _Z21__gen_ocl_atomic_umaxPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_IMIN0, _Z21__gen_ocl_atomic_iminPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_IMIN1, _Z21__gen_ocl_atomic_iminPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_IMAX0, _Z21__gen_ocl_atomic_imaxPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_IMAX1, _Z21__gen_ocl_atomic_imaxPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_XCHG0, _Z21__gen_ocl_atomic_xchgPU3AS1jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_XCHG1, _Z21__gen_ocl_atomic_xchgPU3AS3jj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_INC0, _Z20__gen_ocl_atomic_incPU3AS1j)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_INC1, _Z20__gen_ocl_atomic_incPU3AS3j)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_DEC0, _Z20__gen_ocl_atomic_decPU3AS1j)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_DEC1, _Z20__gen_ocl_atomic_decPU3AS3j)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_CMPXCHG0, _Z24__gen_ocl_atomic_cmpxchgPU3AS1jjj)
+DECL_LLVM_GEN_FUNCTION(ATOMIC_CMPXCHG1, _Z24__gen_ocl_atomic_cmpxchgPU3AS3jjj)
+
+// saturation related functions.
+DECL_LLVM_GEN_FUNCTION(SADD_SAT_CHAR, _Z12ocl_sadd_satcc)
+DECL_LLVM_GEN_FUNCTION(SADD_SAT_SHORT, _Z12ocl_sadd_satss)
+DECL_LLVM_GEN_FUNCTION(SADD_SAT_INT, _Z12ocl_sadd_satii)
+DECL_LLVM_GEN_FUNCTION(SADD_SAT_LONG, _Z12ocl_sadd_satll)
+DECL_LLVM_GEN_FUNCTION(UADD_SAT_CHAR, _Z12ocl_uadd_sathh)
+DECL_LLVM_GEN_FUNCTION(UADD_SAT_SHORT, _Z12ocl_uadd_sattt)
+DECL_LLVM_GEN_FUNCTION(UADD_SAT_INT, _Z12ocl_uadd_satjj)
+DECL_LLVM_GEN_FUNCTION(UADD_SAT_LONG, _Z12ocl_uadd_satmm)
+
+DECL_LLVM_GEN_FUNCTION(SSUB_SAT_CHAR, _Z12ocl_ssub_satcc)
+DECL_LLVM_GEN_FUNCTION(SSUB_SAT_SHORT, _Z12ocl_ssub_satss)
+DECL_LLVM_GEN_FUNCTION(SSUB_SAT_INT, _Z12ocl_ssub_satii)
+DECL_LLVM_GEN_FUNCTION(SSUB_SAT_LONG, _Z12ocl_ssub_satll)
+DECL_LLVM_GEN_FUNCTION(USUB_SAT_CHAR, _Z12ocl_usub_sathh)
+DECL_LLVM_GEN_FUNCTION(USUB_SAT_SHORT, _Z12ocl_usub_sattt)
+DECL_LLVM_GEN_FUNCTION(USUB_SAT_INT, _Z12ocl_usub_satjj)
+DECL_LLVM_GEN_FUNCTION(USUB_SAT_LONG, _Z12ocl_usub_satmm)
+
+DECL_LLVM_GEN_FUNCTION(I64_MAD_SAT, _Z17__gen_ocl_mad_satlll)
+DECL_LLVM_GEN_FUNCTION(I64_MAD_SATU, _Z17__gen_ocl_mad_satmmm)
+
+// integer built-in functions
+DECL_LLVM_GEN_FUNCTION(MUL_HI_INT, _Z16__gen_ocl_mul_hiii)
+DECL_LLVM_GEN_FUNCTION(MUL_HI_UINT, _Z16__gen_ocl_mul_hijj)
+DECL_LLVM_GEN_FUNCTION(MUL_HI_I64, _Z16__gen_ocl_mul_hill)
+DECL_LLVM_GEN_FUNCTION(MUL_HI_UI64, _Z16__gen_ocl_mul_himm)
+DECL_LLVM_GEN_FUNCTION(FBH, __gen_ocl_fbh)
+DECL_LLVM_GEN_FUNCTION(FBL, __gen_ocl_fbl)
+DECL_LLVM_GEN_FUNCTION(ABS, __gen_ocl_abs)
+DECL_LLVM_GEN_FUNCTION(HADD, _Z14__gen_ocl_haddjj)
+DECL_LLVM_GEN_FUNCTION(RHADD, _Z15__gen_ocl_rhaddjj)
+DECL_LLVM_GEN_FUNCTION(I64HADD, _Z14__gen_ocl_haddmm)
+DECL_LLVM_GEN_FUNCTION(I64RHADD, _Z15__gen_ocl_rhaddmm)
+DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless)
+DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii)
+DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell)
+
+// Saturating conversions
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U8_TO_I8,  _Z16convert_char_sath)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I16_TO_I8, _Z16convert_char_sats)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U16_TO_I8, _Z16convert_char_satt)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I32_TO_I8, _Z16convert_char_sati)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U32_TO_I8, _Z16convert_char_satj)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_I8, _Z16convert_char_satf)
+
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I8_TO_U8,  _Z17convert_uchar_satc)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I16_TO_U8, _Z17convert_uchar_sats)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U16_TO_U8, _Z17convert_uchar_satt)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I32_TO_U8, _Z17convert_uchar_sati)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U32_TO_U8, _Z17convert_uchar_satj)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_U8, _Z17convert_uchar_satf)
+
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U16_TO_I16, _Z17convert_short_satt)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I32_TO_I16, _Z17convert_short_sati)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U32_TO_I16, _Z17convert_short_satj)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_I16, _Z17convert_short_satf)
+
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I16_TO_U16, _Z18convert_ushort_sats)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I32_TO_U16, _Z18convert_ushort_sati)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U32_TO_U16, _Z18convert_ushort_satj)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_U16, _Z18convert_ushort_satf)
+
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_U32_TO_I32, _Z15convert_int_satj)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_I32, _Z15convert_int_satf)
+
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_I32_TO_U32, _Z16convert_uint_sati)
+DECL_LLVM_GEN_FUNCTION(SAT_CONV_F32_TO_U32, _Z16convert_uint_satf)
+
+DECL_LLVM_GEN_FUNCTION(CONV_F16_TO_F32, __gen_ocl_f16to32)
+DECL_LLVM_GEN_FUNCTION(CONV_F32_TO_F16, __gen_ocl_f32to16)
+
+// SIMD-level functions for internal use
+DECL_LLVM_GEN_FUNCTION(SIMD_ANY, __gen_ocl_simd_any)
+DECL_LLVM_GEN_FUNCTION(SIMD_ALL, __gen_ocl_simd_all)
+
+// printf functions
+DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf)
+DECL_LLVM_GEN_FUNCTION(PRINTF_BUF_ADDR, __gen_ocl_printf_get_buf_addr)
+DECL_LLVM_GEN_FUNCTION(PRINTF_INDEX_BUF_ADDR, __gen_ocl_printf_get_index_buf_addr)
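The mangled entries in this table follow the Itanium C++ ABI as emitted by the OpenCL front end. As a sketch only (not part of the upstream sources), a declaration along the following lines would mangle to the ATOMIC_ADD0 entry above, assuming Clang's address_space attribute stands in for the __global qualifier:

    // Sketch: decoding _Z20__gen_ocl_atomic_addPU3AS1jj
    //   _Z 20__gen_ocl_atomic_add  P  U3AS1  j  j
    //        20-char function name |  |      |  second parameter: unsigned int
    //                              |  |      pointee type: unsigned int
    //                              |  vendor qualifier: address space 1 (__global)
    //                              pointer
    unsigned int __gen_ocl_atomic_add(
        __attribute__((address_space(1))) unsigned int *ptr,  // __global pointer
        unsigned int value);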
diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp
new file mode 100644
index 0000000..7d04318
--- /dev/null
+++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * \file llvm_intrinsic_lowering.cpp
+ * \author Yang Rong <rong.r.yang at intel.com>
+ */
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "sys/map.hpp"
+
+
+using namespace llvm;
+
+namespace gbe {
+    class InstrinsicLowering : public BasicBlockPass
+    {
+    public:
+      static char ID;
+      InstrinsicLowering() :
+        BasicBlockPass(ID) {}
+
+      void getAnalysisUsage(AnalysisUsage &AU) const {
+
+      }
+
+      virtual const char *getPassName() const {
+        return "SPIR backend: lowering instrinsics";
+      }
+      static char convertSpaceToName(Value *val) {
+        const uint32_t space = val->getType()->getPointerAddressSpace();
+        switch(space) {
+          case 0:
+            return 'p';
+          case 1:
+            return 'g';
+          case 3:
+            return 'l';
+          default:
+            assert("Non support address space");
+            return '\0';
+        }
+      }
+      static CallInst *replaceCallWith(const char *NewFn, CallInst *CI,
+                                     Value **ArgBegin, Value **ArgEnd,
+                                     Type *RetTy)
+      {
+        // If we haven't already looked up this function, check to see if the
+        // program already contains a function with this name.
+        Module *M = CI->getParent()->getParent()->getParent();
+        // Get or insert the definition now.
+        std::vector<Type *> ParamTys;
+        for (Value** I = ArgBegin; I != ArgEnd; ++I)
+          ParamTys.push_back((*I)->getType());
+        Constant* FCache = M->getOrInsertFunction(NewFn,
+                                        FunctionType::get(RetTy, ParamTys, false));
+
+        IRBuilder<> Builder(CI->getParent(), CI);
+        SmallVector<Value *, 8> Args(ArgBegin, ArgEnd);
+        CallInst *NewCI = Builder.CreateCall(FCache, Args);
+        NewCI->setName(CI->getName());
+        if (!CI->use_empty())
+          CI->replaceAllUsesWith(NewCI);
+        CI->eraseFromParent();
+        return NewCI;
+      }
+      virtual bool runOnBasicBlock(BasicBlock &BB)
+      {
+        bool changedBlock = false;
+        Module *M = BB.getParent()->getParent();
+
+        DataLayout TD(M);
+        LLVMContext &Context = BB.getContext();
+        for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
+          Instruction *Inst = DI++;
+          CallInst* CI = dyn_cast<CallInst>(Inst);
+          if(CI == NULL)
+            continue;
+
+          IRBuilder<> Builder(&BB, CI);
+          // only support memcpy and memset
+          if (Function *F = CI->getCalledFunction()) {
+            const Intrinsic::ID intrinsicID = (Intrinsic::ID) F->getIntrinsicID();
+            if (intrinsicID == 0)
+              continue;
+            switch (intrinsicID) {
+              case Intrinsic::memcpy: {
+                Type *IntPtr = TD.getIntPtrType(Context);
+                Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+                                                    /* isSigned */ false);
+                Value *Ops[3];
+                Ops[0] = CI->getArgOperand(0);
+                Ops[1] = CI->getArgOperand(1);
+                Ops[2] = Size;
+                char name[16] = "__gen_memcpy_xx";
+                name[13] = convertSpaceToName(Ops[0]);
+                name[14] = convertSpaceToName(Ops[1]);
+                replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
+                changedBlock = true;
+                break;
+              }
+              case Intrinsic::memset: {
+                Value *Op0 = CI->getArgOperand(0);
+                Value *val = Builder.CreateIntCast(CI->getArgOperand(1), IntegerType::getInt8Ty(Context),
+                                                    /* isSigned */ false);
+                Type *IntPtr = TD.getIntPtrType(Op0->getType());
+                Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+                                                    /* isSigned */ false);
+                Value *Ops[3];
+                Ops[0] = Op0;
+                // The value to set, cast to i8 above.
+                Ops[1] = val;
+                Ops[2] = Size;
+                char name[16] = "__gen_memset_x";
+                name[13] = convertSpaceToName(Ops[0]);
+                replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
+                changedBlock = true;
+                break;
+              }
+              default:
+                continue;
+            }
+          }
+        }
+        return changedBlock;
+      }
+    };
+
+    char InstrinsicLowering::ID = 0;
+
+    BasicBlockPass *createIntrinsicLoweringPass() {
+      return new InstrinsicLowering();
+    }
+} // end namespace
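The pass above only rewrites llvm.memcpy and llvm.memset; each call is replaced by a call to an address-space-suffixed helper (for example, a memcpy from a private pointer into a global one becomes __gen_memcpy_gp). A minimal sketch of how such a pass could be scheduled, assuming the LLVM 3.x legacy pass-manager API; runIntrinsicLowering is a hypothetical driver helper, not part of beignet:

    // Sketch only: run the lowering pass over a module (LLVM 3.x legacy API assumed).
    #include "llvm/Pass.h"
    #include "llvm/PassManager.h"
    #include "llvm/IR/Module.h"

    namespace gbe { llvm::BasicBlockPass *createIntrinsicLoweringPass(); }

    static void runIntrinsicLowering(llvm::Module &M) {
      llvm::PassManager PM;
      PM.add(gbe::createIntrinsicLoweringPass());  // lowers llvm.memcpy / llvm.memset
      PM.run(M);                                   // e.g. memcpy(global dst, private src, n)
    }                                              //      becomes __gen_memcpy_gp(dst, src, n)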
diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
new file mode 100644
index 0000000..4bfc7f6
--- /dev/null
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -0,0 +1,272 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Ruiling, Song <ruiling.song at intel.com>
+ *
+ * The idea: since Gen supports loads/stores of at most 4 successive DWORDs,
+ * merging successive loads/stores that are compatible is beneficial.
+ * The method for checking whether two loads/stores are compatible is borrowed
+ * from the vectorize passes in LLVM.
+ */
+
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+
+#include "llvm/Config/llvm-config.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+
+using namespace llvm;
+namespace gbe {
+  class GenLoadStoreOptimization : public BasicBlockPass {
+
+  public:
+    static char ID;
+    ScalarEvolution *SE;
+    const DataLayout *TD;
+    GenLoadStoreOptimization() : BasicBlockPass(ID) {}
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<ScalarEvolution>();
+      AU.addPreserved<ScalarEvolution>();
+      AU.setPreservesCFG();
+    }
+
+    virtual bool runOnBasicBlock(BasicBlock &BB) {
+      SE = &getAnalysis<ScalarEvolution>();
+      #if LLVM_VERSION_MINOR >= 5
+        DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+        TD = DLP ? &DLP->getDataLayout() : nullptr;
+      #else
+        TD = getAnalysisIfAvailable<DataLayout>();
+      #endif
+      return optimizeLoadStore(BB);
+    }
+    Type    *getValueType(Value *insn);
+    Value   *getPointerOperand(Value *I);
+    unsigned getAddressSpace(Value *I);
+    bool     isSimpleLoadStore(Value *I);
+    bool     optimizeLoadStore(BasicBlock &BB);
+
+    bool     isLoadStoreCompatible(Value *A, Value *B);
+    void     mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
+    void     mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
+    BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB,
+                                               SmallVector<Instruction*, 4> &merged,
+                                               BasicBlock::iterator &start,
+                                               unsigned maxLimit,
+                                               bool isLoad);
+
+    virtual const char *getPassName() const {
+      return "Merge compatible Load/stores for Gen";
+    }
+  };
+
+  char GenLoadStoreOptimization::ID = 0;
+
+  Value *GenLoadStoreOptimization::getPointerOperand(Value *I) {
+    if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand();
+    if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand();
+    return NULL;
+  }
+  unsigned GenLoadStoreOptimization::getAddressSpace(Value *I) {
+    if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace();
+    if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace();
+    return -1;
+  }
+  bool GenLoadStoreOptimization::isSimpleLoadStore(Value *I) {
+    if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->isSimple();
+    if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->isSimple();
+    return false;
+  }
+  Type *GenLoadStoreOptimization::getValueType(Value *insn) {
+    if(LoadInst *ld = dyn_cast<LoadInst>(insn)) return ld->getType();
+    if(StoreInst *st = dyn_cast<StoreInst>(insn)) return st->getValueOperand()->getType();
+
+    return NULL;
+  }
+
+  bool GenLoadStoreOptimization::isLoadStoreCompatible(Value *A, Value *B) {
+    Value *ptrA = getPointerOperand(A);
+    Value *ptrB = getPointerOperand(B);
+    unsigned ASA = getAddressSpace(A);
+    unsigned ASB = getAddressSpace(B);
+
+    // Check that the address spaces match and that the pointers are valid.
+    if (!ptrA || !ptrB || (ASA != ASB)) return false;
+
+    if(!isSimpleLoadStore(A) || !isSimpleLoadStore(B)) return false;
+    // Check that A and B are of the same type.
+    if (ptrA->getType() != ptrB->getType()) return false;
+
+    // Calculate the distance.
+    const SCEV *ptrSCEVA = SE->getSCEV(ptrA);
+    const SCEV *ptrSCEVB = SE->getSCEV(ptrB);
+    const SCEV *offsetSCEV = SE->getMinusSCEV(ptrSCEVA, ptrSCEVB);
+    const SCEVConstant *constOffSCEV = dyn_cast<SCEVConstant>(offsetSCEV);
+
+    // Non constant distance.
+    if (!constOffSCEV) return false;
+
+    int64_t offset = constOffSCEV->getValue()->getSExtValue();
+    Type *Ty = cast<PointerType>(ptrA->getType())->getElementType();
+    // The instructions are consecutive if the size of the first load/store
+    // equals the distance between the two pointers.
+    int64_t sz = TD->getTypeStoreSize(Ty);
+    return ((-offset) == sz);
+  }
+
+  void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
+    IRBuilder<> Builder(&BB);
+
+    unsigned size = merged.size();
+    SmallVector<Value *, 4> values;
+    for(unsigned i = 0; i < size; i++) {
+      values.push_back(merged[i]);
+    }
+    LoadInst *ld = cast<LoadInst>(merged[0]);
+    unsigned align = ld->getAlignment();
+    unsigned addrSpace = ld->getPointerAddressSpace();
+    // insert before first load
+    Builder.SetInsertPoint(ld);
+    VectorType *vecTy = VectorType::get(ld->getType(), size);
+    Value *vecPtr = Builder.CreateBitCast(ld->getPointerOperand(),
+                                          PointerType::get(vecTy, addrSpace));
+    LoadInst *vecValue = Builder.CreateLoad(vecPtr);
+    vecValue->setAlignment(align);
+
+    for (unsigned i = 0; i < size; ++i) {
+      Value *S = Builder.CreateExtractElement(vecValue, Builder.getInt32(i));
+      values[i]->replaceAllUsesWith(S);
+    }
+  }
+
+  BasicBlock::iterator
+  GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
+                            SmallVector<Instruction*, 4> &merged,
+                            BasicBlock::iterator &start,
+                            unsigned maxLimit,
+                            bool isLoad) {
+
+    BasicBlock::iterator stepForward = start;
+    if(!isSimpleLoadStore(start)) return stepForward;
+
+    merged.push_back(start);
+
+    BasicBlock::iterator E = BB.end();
+    BasicBlock::iterator J = ++start;
+
+    for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
+      if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
+        if(isLoadStoreCompatible(merged[merged.size()-1], J)) {
+          merged.push_back(J);
+          stepForward = ++J;
+        }
+      } else if((isLoad && isa<StoreInst>(*J)) || (!isLoad && isa<LoadInst>(*J))) {
+        // simply stop here to preserve the read/write order
+        break;
+      }
+
+      if(merged.size() >= 4) break;
+    }
+    return stepForward;
+  }
+
+  void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
+    IRBuilder<> Builder(&BB);
+
+    unsigned size = merged.size();
+    SmallVector<Value *, 4> values;
+    for(unsigned i = 0; i < size; i++) {
+      values.push_back(cast<StoreInst>(merged[i])->getValueOperand());
+    }
+    StoreInst *st = cast<StoreInst>(merged[0]);
+    unsigned addrSpace = st->getPointerAddressSpace();
+
+    unsigned align = st->getAlignment();
+    // insert before the last store
+    Builder.SetInsertPoint(merged[size-1]);
+
+    Type *dataTy = st->getValueOperand()->getType();
+    VectorType *vecTy = VectorType::get(dataTy, size);
+    Value * parent = UndefValue::get(vecTy);
+    for(unsigned i = 0; i < size; i++) {
+      parent = Builder.CreateInsertElement(parent, values[i], ConstantInt::get(IntegerType::get(st->getContext(), 32), i));
+    }
+
+    Value *newPtr = Builder.CreateBitCast(st->getPointerOperand(), PointerType::get(vecTy, addrSpace));
+    StoreInst *newST = Builder.CreateStore(parent, newPtr);
+    newST->setAlignment(align);
+  }
+
+  bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
+    bool changed = false;
+    SmallVector<Instruction*, 4> merged;
+    for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E;++BBI) {
+      if(isa<LoadInst>(*BBI) || isa<StoreInst>(*BBI)) {
+        bool isLoad = isa<LoadInst>(*BBI) ? true: false;
+        Type *ty = getValueType(BBI);
+        if(ty->isVectorTy()) continue;
+        // we only support DWORD data type merge
+        if(!ty->isFloatTy() && !ty->isIntegerTy(32)) continue;
+        BBI = findConsecutiveAccess(BB, merged, BBI, 10, isLoad);
+        if(merged.size() > 1) {
+          if(isLoad)
+            mergeLoad(BB, merged);
+          else
+            mergeStore(BB, merged);
+          // remove merged insn
+          int size = merged.size();
+          for(int i = 0; i < size; i++)
+            merged[i]->eraseFromParent();
+          changed = true;
+        }
+        merged.clear();
+      }
+    }
+    return changed;
+  }
+
+  BasicBlockPass *createLoadStoreOptimizationPass() {
+    return new GenLoadStoreOptimization();
+  }
+} // end namespace
+
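The compatibility test above reduces to simple pointer arithmetic: two accesses are merged only when the SCEV distance between their pointers equals the store size of the first access. A standalone sketch of that check (plain C++, independent of LLVM, for illustration only):

    #include <cassert>
    #include <cstdint>

    // offsetAB = byte offset of A minus byte offset of B, as the constant SCEV
    // difference reports it; typeStoreSize is the store size of A's type.
    static bool isConsecutive(int64_t offsetAB, int64_t typeStoreSize) {
      return -offsetAB == typeStoreSize;  // mirrors: return ((-offset) == sz);
    }

    int main() {
      // A 4-byte load at byte 0 followed by one at byte 4: offsetAB = 0 - 4 = -4.
      assert(isConsecutive(-4, 4));
      // The same element accessed twice (distance 0) is not consecutive.
      assert(!isConsecutive(0, 4));
      return 0;
    }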
diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
new file mode 100644
index 0000000..1a38a0c
--- /dev/null
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -0,0 +1,399 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ *         Heldge RHodin <alice.rhodin at alice-dsl.net>
+ */
+
+/**
+ * \file llvm_passes.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ * \author Heldge RHodin <alice.rhodin at alice-dsl.net>
+ */
+
+/* THIS CODE IS DERIVED FROM GPL LLVM PTX BACKEND. CODE IS HERE:
+ * http://sourceforge.net/scm/?type=git&group_id=319085
+ * Note that however, the original author, Heldge Rhodin, granted me (Benjamin
+ * Segovia) the right to use another license for it (MIT here)
+ */
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#else
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Instructions.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/InlineAsm.h"
+#else
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/InlineAsm.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/ConstantsScanner.h"
+#include "llvm/Analysis/FindUsedTypes.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
+#include "llvm/IR/Mangler.h"
+#else
+#include "llvm/Target/Mangler.h"
+#endif
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#if !defined(LLVM_VERSION_MAJOR) || (LLVM_VERSION_MINOR == 1)
+#include "llvm/Target/TargetData.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/DataLayout.h"
+#else
+#include "llvm/IR/DataLayout.h"
+#endif
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR <= 2)
+#include "llvm/Support/InstVisitor.h"
+#elif LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/InstVisitor.h"
+#else
+#include "llvm/InstVisitor.h"
+#endif
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/SourceMgr.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "ir/unit.hpp"
+#include "sys/map.hpp"
+
+using namespace llvm;
+
+namespace gbe
+{
+  bool isKernelFunction(const llvm::Function &F) {
+    const Module *module = F.getParent();
+    const Module::NamedMDListType& globalMD = module->getNamedMDList();
+    bool bKernel = false;
+    for(auto i = globalMD.begin(); i != globalMD.end(); i++) {
+      const NamedMDNode &md = *i;
+      if(strcmp(md.getName().data(), "opencl.kernels") != 0) continue;
+      uint32_t ops = md.getNumOperands();
+      for(uint32_t x = 0; x < ops; x++) {
+        MDNode* node = md.getOperand(x);
+        Value * op = node->getOperand(0);
+        if(op == &F) bKernel = true;
+      }
+    }
+    return bKernel;
+  }
+
+  uint32_t getPadding(uint32_t offset, uint32_t align) {
+    return (align - (offset % align)) % align; 
+  }
+
+  uint32_t getAlignmentByte(const ir::Unit &unit, Type* Ty)
+  {
+    switch (Ty->getTypeID()) {
+      case Type::VoidTyID: NOT_SUPPORTED;
+      case Type::VectorTyID:
+      {
+        const VectorType* VecTy = cast<VectorType>(Ty);
+        uint32_t elemNum = VecTy->getNumElements();
+        if (elemNum == 3) elemNum = 4; // OCL spec
+        return elemNum * getTypeByteSize(unit, VecTy->getElementType());
+      }
+      case Type::PointerTyID:
+      case Type::IntegerTyID:
+      case Type::FloatTyID:
+      case Type::DoubleTyID:
+      case Type::HalfTyID:
+        return getTypeBitSize(unit, Ty)/8;
+      case Type::ArrayTyID:
+        return getAlignmentByte(unit, cast<ArrayType>(Ty)->getElementType());
+      case Type::StructTyID:
+      {
+        const StructType* StrTy = cast<StructType>(Ty);
+        uint32_t maxa = 0;
+        for(uint32_t subtype = 0; subtype < StrTy->getNumElements(); subtype++)
+        {
+          maxa = std::max(getAlignmentByte(unit, StrTy->getElementType(subtype)), maxa);
+        }
+        return maxa;
+      }
+      default: NOT_SUPPORTED;
+    }
+    return 0u;
+  }
+
+  uint32_t getTypeBitSize(const ir::Unit &unit, Type* Ty)
+  {
+    switch (Ty->getTypeID()) {
+      case Type::VoidTyID:    NOT_SUPPORTED;
+      case Type::PointerTyID: return unit.getPointerSize();
+      case Type::IntegerTyID:
+      {
+        // use S16 to represent SLM bool variables.
+        int bitWidth = cast<IntegerType>(Ty)->getBitWidth();
+        return (bitWidth == 1) ? 16 : bitWidth;
+      }
+      case Type::HalfTyID:    return 16;
+      case Type::FloatTyID:   return 32;
+      case Type::DoubleTyID:  return 64;
+      case Type::VectorTyID:
+      {
+        const VectorType* VecTy = cast<VectorType>(Ty);
+        uint32_t numElem = VecTy->getNumElements();
+        if(numElem == 3) numElem = 4; // OCL spec
+        return numElem * getTypeBitSize(unit, VecTy->getElementType());
+      }
+      case Type::ArrayTyID:
+      {
+        const ArrayType* ArrTy = cast<ArrayType>(Ty);
+        Type* elementType = ArrTy->getElementType();
+        uint32_t size_element = getTypeBitSize(unit, elementType);
+        uint32_t size = ArrTy->getNumElements() * size_element;
+        uint32_t align = 8 * getAlignmentByte(unit, elementType);
+        size += (ArrTy->getNumElements()-1) * getPadding(size_element, align);
+        return size;
+      }
+      case Type::StructTyID:
+      {
+        const StructType* StrTy = cast<StructType>(Ty);
+        uint32_t size = 0;
+        for(uint32_t subtype=0; subtype < StrTy->getNumElements(); subtype++)
+        {
+          Type* elementType = StrTy->getElementType(subtype);
+          uint32_t align = 8 * getAlignmentByte(unit, elementType);
+          size += getPadding(size, align);
+          size += getTypeBitSize(unit, elementType);
+        }
+        return size;
+      }
+      default: NOT_SUPPORTED;
+    }
+    return 0u;
+  }
+
+  uint32_t getTypeByteSize(const ir::Unit &unit, Type* Ty)
+  {
+    uint32_t size_bit = getTypeBitSize(unit, Ty);
+    assert((size_bit%8==0) && "no multiple of 8");
+    return size_bit/8;
+  }
+
+  class GenRemoveGEPPasss : public BasicBlockPass
+  {
+
+   public:
+    static char ID;
+#define FORMER_VERSION 0
+#if FORMER_VERSION
+   GenRemoveGEPPasss(map<const Value *, const Value *>& 
+                                       parentCompositePointer)
+     : BasicBlockPass(ID),
+     parentPointers(parentCompositePointer) {}
+    map<const Value *, const Value *>& parentPointers;
+#else
+   GenRemoveGEPPasss(const ir::Unit &unit) :
+     BasicBlockPass(ID),
+     unit(unit) {}
+  const ir::Unit &unit;
+#endif
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+    }
+
+    virtual const char *getPassName() const {
+      return "SPIR backend: insert special spir instructions";
+    }
+
+    bool simplifyGEPInstructions(GetElementPtrInst* GEPInst);
+
+    virtual bool runOnBasicBlock(BasicBlock &BB)
+    {
+      bool changedBlock = false;
+      iplist<Instruction>::iterator I = BB.getInstList().begin();
+      for (auto nextI = I, E = --BB.getInstList().end(); I != E; I = nextI) {
+        iplist<Instruction>::iterator I = nextI++;
+        if(GetElementPtrInst* gep = dyn_cast<GetElementPtrInst>(&*I))
+          changedBlock = (simplifyGEPInstructions(gep) || changedBlock);
+      }
+      return changedBlock;
+    }
+  };
+
+  char GenRemoveGEPPasss::ID = 0;
+
+  bool GenRemoveGEPPasss::simplifyGEPInstructions(GetElementPtrInst* GEPInst)
+  {
+    const uint32_t ptrSize = unit.getPointerSize();
+    Value* parentPointer = GEPInst->getOperand(0);
+#if FORMER_VERSION
+    Value* topParent = parentPointer;
+#endif
+    CompositeType* CompTy = cast<CompositeType>(parentPointer->getType());
+
+    Value* currentAddrInst = 
+      new PtrToIntInst(parentPointer, IntegerType::get(GEPInst->getContext(), ptrSize), "", GEPInst);
+
+    uint32_t constantOffset = 0;
+
+    for(uint32_t op=1; op<GEPInst->getNumOperands(); ++op)
+    {
+      uint32_t TypeIndex;
+      // we have a constant struct/array access
+      if(ConstantInt* ConstOP = dyn_cast<ConstantInt>(GEPInst->getOperand(op)))
+      {
+        uint32_t offset = 0;
+        TypeIndex = ConstOP->getZExtValue();
+        if (op == 1) {
+          if (TypeIndex != 0) {
+            Type *elementType = (cast<PointerType>(parentPointer->getType()))->getElementType();
+            uint32_t elementSize = getTypeByteSize(unit, elementType);
+            uint32_t align = getAlignmentByte(unit, elementType);
+            elementSize += getPadding(elementSize, align);
+            offset += elementSize * TypeIndex;
+          }
+        } else {
+          for(uint32_t ty_i=0; ty_i<TypeIndex; ty_i++)
+          {
+            Type* elementType = CompTy->getTypeAtIndex(ty_i);
+            uint32_t align = getAlignmentByte(unit, elementType);
+            offset += getPadding(offset, align);
+            offset += getTypeByteSize(unit, elementType);
+          }
+
+          // add padding for the accessed type
+          const uint32_t align = getAlignmentByte(unit, CompTy->getTypeAtIndex(TypeIndex));
+          offset += getPadding(offset, align);
+        }
+
+        constantOffset += offset;
+      }
+      // non-constant index (=> only array/vector allowed)
+      else
+      {
+        // we only have array/vectors here, 
+        // therefore all elements have the same size
+        TypeIndex = 0;
+
+        Type* elementType = CompTy->getTypeAtIndex(TypeIndex);
+        uint32_t size = getTypeByteSize(unit, elementType);
+
+        //add padding
+        uint32_t align = getAlignmentByte(unit, elementType);
+        size += getPadding(size, align);
+
+        Constant* newConstSize = 
+          ConstantInt::get(IntegerType::get(GEPInst->getContext(), ptrSize), size);
+
+        Value *operand = GEPInst->getOperand(op); 
+
+        // HACK TODO: inserted by type replacement; this code could break something.
+        if(getTypeByteSize(unit, operand->getType())>4)
+        {
+          GBE_ASSERTM(false, "CHECK IT");
+          operand->dump();
+
+          // the previous instruction is a sext or zext; ignore it
+          CastInst *cast = dyn_cast<CastInst>(operand);
+          if(cast && (isa<ZExtInst>(operand) || isa<SExtInst>(operand)))
+          {
+            //hope that CastInst is a s/zext
+            operand = cast->getOperand(0);
+          }
+          else
+          {
+            // truncate
+            operand = 
+              new TruncInst(operand, 
+                  IntegerType::get(GEPInst->getContext(), 
+                    ptrSize), 
+                  "", GEPInst);
+          }
+        }
+
+        BinaryOperator* tmpMul = 
+          BinaryOperator::Create(Instruction::Mul, newConstSize, operand,
+              "", GEPInst);
+        currentAddrInst = 
+          BinaryOperator::Create(Instruction::Add, currentAddrInst, tmpMul,
+              "", GEPInst);
+      }
+
+      // step down the type hierarchy
+      CompTy = dyn_cast<CompositeType>(CompTy->getTypeAtIndex(TypeIndex));
+    }
+
+    //insert addition of new offset before GEPInst
+    Constant* newConstOffset = 
+      ConstantInt::get(IntegerType::get(GEPInst->getContext(), 
+            ptrSize),
+          constantOffset);
+    currentAddrInst = 
+      BinaryOperator::Create(Instruction::Add, currentAddrInst, 
+          newConstOffset, "", GEPInst);
+
+    //convert offset to ptr type (nop)
+    IntToPtrInst* intToPtrInst = 
+      new IntToPtrInst(currentAddrInst,GEPInst->getType(),"", GEPInst);
+
+    //replace uses of the GEP instruction with the newly calculated pointer
+    GEPInst->replaceAllUsesWith(intToPtrInst);
+    GEPInst->dropAllReferences();
+    GEPInst->eraseFromParent();
+
+#if FORMER_VERSION
+    //insert new pointer into parent list
+    while(parentPointers.find(topParent)!=parentPointers.end())
+      topParent = parentPointers.find(topParent)->second;
+    parentPointers[intToPtrInst] = topParent;
+#endif
+
+    return true;
+  }
+
+  BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit) {
+    return new GenRemoveGEPPasss(unit);
+  }
+} /* namespace gbe */
+
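simplifyGEPInstructions() above turns every GEP into explicit integer arithmetic built from getTypeByteSize(), getAlignmentByte() and getPadding(). A worked, standalone example of that arithmetic for a hypothetical struct (illustrative only, not part of the upstream sources):

    #include <cassert>
    #include <cstdint>

    // Same formula as getPadding() above.
    static uint32_t padding(uint32_t offset, uint32_t align) {
      return (align - (offset % align)) % align;
    }

    int main() {
      // Hypothetical kernel type: struct S { char a; int b; float c; };
      // Sizes/alignments as getTypeByteSize()/getAlignmentByte() would report them.
      uint32_t offset = 0;
      offset += padding(offset, 1) + 1;   // 'a': byte 0
      offset += padding(offset, 4);       // 3 bytes of padding before 'b'
      uint32_t offset_b = offset;         // GEP index 1 resolves to byte 4
      offset += 4;                        // 'b'
      offset += padding(offset, 4);       // no padding needed before 'c'
      uint32_t offset_c = offset;         // GEP index 2 resolves to byte 8
      assert(offset_b == 4 && offset_c == 8);
      return 0;
    }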
diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp
new file mode 100644
index 0000000..00e1ef8
--- /dev/null
+++ b/backend/src/llvm/llvm_printf_parser.cpp
@@ -0,0 +1,851 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * \file llvm_printf_parser.cpp
+ *
+ * When printf calls are present, we have work to do here. Because of the way
+ * the GPU works, it is relatively hard to parse and evaluate the printf
+ * format string on the device. OpenCL 1.2 restricts the format string to a
+ * constant string that can be resolved at compile time, so we add a pass here
+ * to parse the format string and check whether the parameters are valid.
+ * If all are valid, we generate the corresponding instructions to store the
+ * parameter contents into the printf buffer. If something is invalid, a
+ * warning is generated and the printf instruction is skipped in order to avoid
+ * GPU errors. We also record the relationship between the printf format and the
+ * printf content in the GPU's printf buffer here, and use the host's C standard
+ * printf to print the content after the kernel has executed.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+
+#if LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CFG.h"
+#else
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#endif
+
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Attributes.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "sys/map.hpp"
+#include "ir/printf.hpp"
+
+using namespace llvm;
+
+namespace gbe
+{
+  using namespace ir;
+
+  /* Return the conversion specifier on success, -1 on failure. */
+  static char __parse_printf_state(char *begin, char *end, char** rend, PrintfState * state)
+  {
+    const char *fmt;
+    state->left_justified = 0;
+    state->sign_symbol = 0; //0 for nothing, 1 for sign, 2 for space.
+    state->alter_form = 0;
+    state->zero_padding = 0;
+    state->vector_n = 0;
+    state->min_width = -1;
+    state->precision = -1;
+    state->length_modifier = 0;
+    state->conversion_specifier = PRINTF_CONVERSION_INVALID;
+    state->out_buf_sizeof_offset = -1;
+
+    fmt = begin;
+
+    if (*fmt != '%')
+      return -1;
+
+#define FMT_PLUS_PLUS do {                                  \
+      if (fmt + 1 <= end) fmt++;                             \
+      else {                                                \
+        printf("Error, line: %d, fmt > end\n", __LINE__);   \
+        return -1;                                          \
+      }                                                     \
+    }  while(0)
+
+    FMT_PLUS_PLUS;
+
+    // parse the flags.
+    while (*fmt == '-' || *fmt == '+' || *fmt == ' ' || *fmt == '#' || *fmt == '0')
+      switch (*fmt) {
+        case '-':
+          /* The result of the conversion is left-justified within the field. */
+          state->left_justified = 1;
+          FMT_PLUS_PLUS;
+          break;
+        case '+':
+          /* The result of a signed conversion always begins with a plus or minus sign. */
+          state->sign_symbol = 1;
+          FMT_PLUS_PLUS;
+          break;
+        case ' ':
+          /* If the first character of a signed conversion is not a sign, or if a signed
+             conversion results in no characters, a space is prefixed to the result.
+             If the space and + flags both appear,the space flag is ignored. */
+          if (state->sign_symbol == 0) state->sign_symbol = 2;
+          FMT_PLUS_PLUS;
+          break;
+        case '#':
+          /*The result is converted to an alternative form. */
+          state->alter_form = 1;
+          FMT_PLUS_PLUS;
+          break;
+        case '0':
+          if (!state->left_justified) state->zero_padding = 1;
+          FMT_PLUS_PLUS;
+          break;
+        default:
+          break;
+      }
+
+    // The minimum field width
+    while ((*fmt >= '0') && (*fmt <= '9')) {
+      if (state->min_width < 0)
+        state->min_width = 0;
+      state->min_width = state->min_width * 10 + (*fmt - '0');
+      FMT_PLUS_PLUS;
+    }
+
+    // The precision
+    if (*fmt == '.') {
+      FMT_PLUS_PLUS;
+      state->precision = 0;
+      while (*fmt >= '0' && *fmt <= '9') {
+        state->precision = state->precision * 10 + (*fmt - '0');
+        FMT_PLUS_PLUS;
+      }
+    }
+
+    // handle the vector specifier.
+    if (*fmt == 'v') {
+      FMT_PLUS_PLUS;
+      switch (*fmt) {
+        case '2':
+        case '3':
+        case '4':
+        case '8':
+          state->vector_n = *fmt - '0';
+          FMT_PLUS_PLUS;
+          break;
+        case '1':
+          FMT_PLUS_PLUS;
+          if (*fmt == '6') {
+            state->vector_n = 16;
+            FMT_PLUS_PLUS;
+          } else
+            return -1;
+          break;
+        default:
+          //Wrong vector, error.
+          return -1;
+      }
+    }
+
+    // length modifiers
+    if (*fmt == 'h') {
+      FMT_PLUS_PLUS;
+      if (*fmt == 'h') { //hh
+        state->length_modifier = PRINTF_LM_HH;
+        FMT_PLUS_PLUS;
+      } else if (*fmt == 'l') { //hl
+        state->length_modifier = PRINTF_LM_HL;
+        FMT_PLUS_PLUS;
+      } else { //h
+        state->length_modifier = PRINTF_LM_H;
+      }
+    } else if (*fmt == 'l') {
+      state->length_modifier = PRINTF_LM_L;
+      FMT_PLUS_PLUS;
+    }
+
+#define CONVERSION_SPEC_AND_RET(XXX, xxx)                           \
+    case XXX:                                                       \
+      state->conversion_specifier = PRINTF_CONVERSION_##xxx;        \
+      FMT_PLUS_PLUS;                                                \
+      *rend = (char *)fmt;                                          \
+      return XXX;                                                   \
+      break;
+
+    // conversion specifiers
+    switch (*fmt) {
+        CONVERSION_SPEC_AND_RET('d', D)
+        CONVERSION_SPEC_AND_RET('i', I)
+        CONVERSION_SPEC_AND_RET('o', O)
+        CONVERSION_SPEC_AND_RET('u', U)
+        CONVERSION_SPEC_AND_RET('x', x)
+        CONVERSION_SPEC_AND_RET('X', X)
+        CONVERSION_SPEC_AND_RET('f', f)
+        CONVERSION_SPEC_AND_RET('F', F)
+        CONVERSION_SPEC_AND_RET('e', e)
+        CONVERSION_SPEC_AND_RET('E', E)
+        CONVERSION_SPEC_AND_RET('g', g)
+        CONVERSION_SPEC_AND_RET('G', G)
+        CONVERSION_SPEC_AND_RET('a', a)
+        CONVERSION_SPEC_AND_RET('A', A)
+        CONVERSION_SPEC_AND_RET('c', C)
+        CONVERSION_SPEC_AND_RET('s', S)
+        CONVERSION_SPEC_AND_RET('p', P)
+
+      // %% has been handled
+
+      default:
+        return -1;
+    }
+  }
+
+  static PrintfSet::PrintfFmt* parser_printf_fmt(char* format, int& num)
+  {
+    char* begin;
+    char* end;
+    char* p;
+    char ret_char;
+    char* rend;
+    PrintfState state;
+    PrintfSet::PrintfFmt* printf_fmt = new PrintfSet::PrintfFmt();
+
+    p = format;
+    begin = format;
+    end = format + strlen(format);
+
+    /* Now parse it. */
+    while (*begin) {
+      p = begin;
+
+again:
+      while (p < end && *p != '%') {
+        p++;
+      }
+      if (p < end && p + 1 == end) { // String with % at end.
+        printf("string end with %%\n");
+        goto error;
+      }
+      if (p + 1 < end && *(p + 1) == '%') { // %%
+        p += 2;
+        goto again;
+      }
+
+      if (p != begin) {
+        std::string s = std::string(begin, size_t(p - begin));
+        printf_fmt->push_back(PrintfSlot(s.c_str()));
+      }
+
+      if (p == end) // finish
+        break;
+
+      /* Now parse the % start conversion_specifier. */
+      ret_char = __parse_printf_state(p, end, &rend, &state);
+      if (ret_char < 0)
+        goto error;
+
+      printf_fmt->push_back(&state);
+      num++;
+
+      if (rend == end)
+        break;
+
+      begin = rend;
+    }
+
+#if 0
+    {
+      int j = 0;
+      for (auto &s : *printf_fmt) {
+        j++;
+        if (s.type == PRINTF_SLOT_TYPE_STATE) {
+          fprintf(stderr, "---- %d ---: state : \n", j);
+          fprintf(stderr, "		     left_justified : %d\n", s.state->left_justified);
+          fprintf(stderr, "		     sign_symbol: %d\n", s.state->sign_symbol);
+          fprintf(stderr, "		     alter_form : %d\n", s.state->alter_form);
+          fprintf(stderr, "		     zero_padding : %d\n", s.state->zero_padding);
+          fprintf(stderr, "		     vector_n : %d\n", s.state->vector_n);
+          fprintf(stderr, "		     min_width : %d\n", s.state->min_width);
+          fprintf(stderr, "		     precision : %d\n", s.state->precision);
+          fprintf(stderr, "		     length_modifier : %d\n", s.state->length_modifier);
+          fprintf(stderr, "		     conversion_specifier : %d\n", s.state->conversion_specifier);
+        } else if (s.type == PRINTF_SLOT_TYPE_STRING) {
+          fprintf(stderr, "---- %d ---: string :  %s\n", j, s.str);
+        }
+      }
+    }
+#endif
+
+    return printf_fmt;
+
+error:
+    printf("error format string.\n");
+    delete printf_fmt;
+    return NULL;
+  }
+
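For illustration only (not part of the upstream sources), a specifier such as "%-12.5v4hf" would be parsed by __parse_printf_state() into the following PrintfState fields:

    // Sketch of the expected parse of "%-12.5v4hf":
    //   left_justified       = 1                    ('-')
    //   sign_symbol          = 0
    //   alter_form           = 0
    //   zero_padding         = 0
    //   min_width            = 12
    //   precision            = 5
    //   vector_n             = 4                    ('v4')
    //   length_modifier      = PRINTF_LM_H          ('h')
    //   conversion_specifier = PRINTF_CONVERSION_f  ('f')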
+  class PrintfParser : public FunctionPass
+  {
+  public:
+    static char ID;
+    typedef std::pair<Instruction*, bool> PrintfInst;
+    std::vector<PrintfInst> deadprintfs;
+    Module* module;
+    IRBuilder<>* builder;
+    Type* intTy;
+    Value* pbuf_ptr;
+    Value* index_buf_ptr;
+    int out_buf_sizeof_offset;
+    static map<CallInst*, PrintfSet::PrintfFmt*> printfs;
+    int printf_num;
+
+    PrintfParser(void) : FunctionPass(ID)
+    {
+      module = NULL;
+      builder = NULL;
+      intTy = NULL;
+      out_buf_sizeof_offset = 0;
+      printfs.clear();
+      pbuf_ptr = NULL;
+      index_buf_ptr = NULL;
+      printf_num = 0;
+    }
+
+    ~PrintfParser(void)
+    {
+      for (auto &s : printfs) {
+        delete s.second;
+        s.second = NULL;
+      }
+      printfs.clear();
+    }
+
+
+    bool parseOnePrintfInstruction(CallInst *& call);
+    bool generateOneParameterInst(PrintfSlot& slot, Value*& arg, Type*& dst_type, int& sizeof_size);
+
+    virtual const char *getPassName() const
+    {
+      return "Printf Parser";
+    }
+
+    virtual bool runOnFunction(llvm::Function &F);
+  };
+
+  bool PrintfParser::parseOnePrintfInstruction(CallInst *& call)
+  {
+    CallSite CS(call);
+    CallSite::arg_iterator CI_FMT = CS.arg_begin();
+    int param_num = 0;
+
+    llvm::Constant* arg0 = dyn_cast<llvm::ConstantExpr>(*CI_FMT);
+    if (!arg0) {
+      return false;
+    }
+    llvm::Constant* arg0_ptr = dyn_cast<llvm::Constant>(arg0->getOperand(0));
+    if (!arg0_ptr) {
+      return false;
+    }
+
+    ConstantDataSequential* fmt_arg = dyn_cast<ConstantDataSequential>(arg0_ptr->getOperand(0));
+    if (!fmt_arg || !fmt_arg->isCString()) {
+      return false;
+    }
+
+    std::string fmt = fmt_arg->getAsCString();
+
+    PrintfSet::PrintfFmt* printf_fmt = NULL;
+
+    if (!(printf_fmt = parser_printf_fmt((char *)fmt.c_str(), param_num))) { // at least print something
+      return false;
+    }
+
+    /* If there are more arguments than conversion specifiers, it is an error. */
+    /* str_fmt arg0 arg1 ... NULL */
+    if (param_num + 2 < static_cast<int>(call->getNumOperands())) {
+      delete printf_fmt;
+      return false;
+    }
+
+    /* FIXME: Because the OpenCL language does not support variadic macros, and we
+       do not want to introduce va_list, va_start and va_end into our code, we simply
+       emit the function calls needed for the offset calculation here. */
+#define BUILD_CALL_INST(name) \
+    CallInst* name = builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction( \
+                             "__gen_ocl_get_"#name,                                         \
+                             IntegerType::getInt32Ty(module->getContext()),                 \
+                             NULL)))
+
+    BUILD_CALL_INST(group_id2);
+    BUILD_CALL_INST(group_id1);
+    BUILD_CALL_INST(group_id0);
+    BUILD_CALL_INST(global_size2);
+    BUILD_CALL_INST(global_size1);
+    BUILD_CALL_INST(global_size0);
+    BUILD_CALL_INST(local_id2);
+    BUILD_CALL_INST(local_id1);
+    BUILD_CALL_INST(local_id0);
+    BUILD_CALL_INST(local_size2);
+    BUILD_CALL_INST(local_size1);
+    BUILD_CALL_INST(local_size0);
+
+#undef BUILD_CALL_INST
+
+    Value* op0 = NULL;
+    Value* val = NULL;
+    /* calculate offset for later usage.
+       offset = ((local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
+       + (local_id1 + local_size1 * group_id1) * global_size0
+       + (local_id0 + local_size0 * group_id0)) * sizeof(type)  */
+
+    // local_size2 * group_id2
+    val = builder->CreateMul(local_size2, group_id2);
+    // local_id2 + local_size2 * group_id2
+    val = builder->CreateAdd(local_id2, val);
+    // global_size1 * global_size0
+    op0 = builder->CreateMul(global_size1, global_size0);
+    // (local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
+    Value* offset1 = builder->CreateMul(val, op0);
+    // local_size1 * group_id1
+    val = builder->CreateMul(local_size1, group_id1);
+    // local_id1 + local_size1 * group_id1
+    val = builder->CreateAdd(local_id1, val);
+    // (local_id1 + local_size1 * group_id1) * global_size_0
+    Value* offset2 = builder->CreateMul(val, global_size0);
+    // local_size0 * group_id0
+    val = builder->CreateMul(local_size0, group_id0);
+    // local_id0 + local_size0 * group_id0
+    val = builder->CreateAdd(local_id0, val);
+    // The total sum
+    val = builder->CreateAdd(val, offset1);
+    Value* offset = builder->CreateAdd(val, offset2);
+
+    /////////////////////////////////////////////////////
+    /* calculate index address.
+       index_addr = (index_offset + offset )* sizeof(int) + index_buf_ptr
+       index_offset = global_size2 * global_size1 * global_size0 * printf_num */
+
+    // global_size2 * global_size1
+    op0 = builder->CreateMul(global_size2, global_size1);
+    // global_size2 * global_size1 * global_size0
+    Value* glXg2Xg3 = builder->CreateMul(op0, global_size0);
+    Value* index_offset = builder->CreateMul(glXg2Xg3, ConstantInt::get(intTy, printf_num));
+    // index_offset + offset
+    op0 = builder->CreateAdd(index_offset, offset);
+    // (index_offset + offset)* sizeof(int)
+    op0 = builder->CreateMul(op0, ConstantInt::get(intTy, sizeof(int)));
+    // Final index address = index_buf_ptr + (index_offset + offset)* sizeof(int)
+    op0 = builder->CreateAdd(index_buf_ptr, op0);
+    Value* index_addr = builder->CreateIntToPtr(op0, Type::getInt32PtrTy(module->getContext(), 1));
+    builder->CreateStore(ConstantInt::get(intTy, 1), index_addr);// The flag
+
+    int i = 1;
+    Value* data_addr = NULL;
+    for (auto &s : *printf_fmt) {
+      if (s.type == PRINTF_SLOT_TYPE_STRING)
+        continue;
+
+      assert(i < static_cast<int>(call->getNumOperands()) - 1);
+
+      Value *out_arg = call->getOperand(i);
+      Type *dst_type = NULL;
+      int sizeof_size = 0;
+      if (!generateOneParameterInst(s, out_arg, dst_type, sizeof_size)) {
+        printf("Printf: %d, parameter %d may have no result because some error\n",
+               printf_num, i - 1);
+        i++;
+        continue;
+      }
+
+      s.state->out_buf_sizeof_offset = out_buf_sizeof_offset;
+      if (!sizeof_size) {
+        i++;
+        continue;
+      }
+
+      assert(dst_type);
+
+      /////////////////////////////////////////////////////
+      /* Calculate the data address.
+      data_addr = data_offset + pbuf_ptr + offset * sizeof(specify)
+      data_offset = global_size2 * global_size1 * global_size0 * out_buf_sizeof_offset
+
+      //global_size2 * global_size1 * global_size0 * out_buf_sizeof_offset */
+      op0 = builder->CreateMul(glXg2Xg3, ConstantInt::get(intTy, out_buf_sizeof_offset));
+      //offset * sizeof(specify)
+      val = builder->CreateMul(offset, ConstantInt::get(intTy, sizeof_size));
+      //data_offset + pbuf_ptr
+      op0 = builder->CreateAdd(pbuf_ptr, op0);
+      op0 = builder->CreateAdd(op0, val);
+      data_addr = builder->CreateIntToPtr(op0, dst_type);
+      builder->CreateStore(out_arg, data_addr);
+
+      out_buf_sizeof_offset += ((sizeof_size + 3) / 4) * 4;
+      i++;
+    }
+
+    CallInst* printf_inst = builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction(
+                              "__gen_ocl_printf", Type::getVoidTy(module->getContext()),
+                              NULL)));
+    assert(printfs[printf_inst] == NULL);
+    printfs[printf_inst] = printf_fmt;
+    printf_num++;
+    return true;
+  }
+
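The chain of CreateMul/CreateAdd calls above computes the per-work-item linear index used for the stores; the per-parameter sizeof scaling is applied separately later. The same arithmetic as a standalone sketch (parameter names are illustrative, not from the sources):

    // offset = (lid2 + lsz2*gid2) * (gsz1*gsz0)
    //        + (lid1 + lsz1*gid1) * gsz0
    //        + (lid0 + lsz0*gid0)
    static inline unsigned printfWorkItemOffset(
        unsigned lid0, unsigned lid1, unsigned lid2,   // local ids
        unsigned lsz0, unsigned lsz1, unsigned lsz2,   // local sizes
        unsigned gid0, unsigned gid1, unsigned gid2,   // group ids
        unsigned gsz0, unsigned gsz1)                  // global sizes 0 and 1
    {
      return (lid2 + lsz2 * gid2) * (gsz1 * gsz0)
           + (lid1 + lsz1 * gid1) * gsz0
           + (lid0 + lsz0 * gid0);
    }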
+  bool PrintfParser::runOnFunction(llvm::Function &F)
+  {
+    bool changed = false;
+    switch (F.getCallingConv()) {
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+      case CallingConv::PTX_Device:
+        return false;
+      case CallingConv::PTX_Kernel:
+#else
+      case CallingConv::C:
+#endif
+        break;
+      default:
+        GBE_ASSERTM(false, "Unsupported calling convention");
+    }
+
+    module = F.getParent();
+    intTy = IntegerType::get(module->getContext(), 32);
+
+    // As we inline all function calls, skip non-kernel functions
+    bool bKernel = isKernelFunction(F);
+    if(!bKernel) return false;
+
+    builder = new IRBuilder<>(module->getContext());
+
+    /* Iterate over the function and find printf calls. */
+    for (llvm::Function::iterator B = F.begin(), BE = F.end(); B != BE; B++) {
+      for (BasicBlock::iterator instI = B->begin(),
+           instE = B->end(); instI != instE; ++instI) {
+
+        llvm::CallInst* call = dyn_cast<llvm::CallInst>(instI);
+        if (!call) {
+          continue;
+        }
+
+        if (call->getCalledFunction()->getIntrinsicID() != 0)
+          continue;
+
+        Value *Callee = call->getCalledValue();
+        const std::string fnName = Callee->getName();
+
+        if (fnName != "__gen_ocl_printf_stub")
+          continue;
+
+        changed = true;
+
+        builder->SetInsertPoint(call);
+
+        if (!pbuf_ptr) {
+          /* alloc a new buffer ptr to collect the print output. */
+          Type *ptrTy = Type::getInt32PtrTy(module->getContext());
+          llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_buf"), ptrTy);
+          pbuf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
+        }
+        if (!index_buf_ptr) {
+          Type *ptrTy = Type::getInt32PtrTy(module->getContext());
+          llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_index_buf"), ptrTy);
+          index_buf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
+        }
+
+        deadprintfs.push_back(PrintfInst(cast<Instruction>(call),parseOnePrintfInstruction(call)));
+      }
+    }
+
+    /* Replace the operands of any instruction that uses printf's return value. */
+    for (llvm::Function::iterator B = F.begin(), BE = F.end(); B != BE; B++) {
+      for (BasicBlock::iterator instI = B->begin(),
+           instE = B->end(); instI != instE; ++instI) {
+
+        for (unsigned i = 0; i < instI->getNumOperands(); i++) {
+          for (auto &prf : deadprintfs) {
+            if (instI->getOperand(i) == prf.first) {
+
+              if (prf.second == true) {
+                instI->setOperand(i, ConstantInt::get(intTy, 0));
+              } else {
+                instI->setOperand(i, ConstantInt::get(intTy, -1));
+              }
+            }
+          }
+        }
+      }
+    }
+
+    /* Kill the dead printf instructions. */
+    for (auto &prf : deadprintfs) {
+      prf.first->dropAllReferences();
+      if (prf.first->use_empty())
+        prf.first->eraseFromParent();
+    }
+
+    deadprintfs.clear();
+    delete builder;
+
+    return changed;
+  }
+
+  bool PrintfParser::generateOneParameterInst(PrintfSlot& slot, Value*& arg, Type*& dst_type, int& sizeof_size)
+  {
+    assert(slot.type == PRINTF_SLOT_TYPE_STATE);
+    assert(builder);
+
+    /* Check whether the arg matches the format specifier. If needed, a
+       conversion needs to be applied. */
+    switch (arg->getType()->getTypeID()) {
+      case Type::IntegerTyID: {
+        bool sign = false;
+        switch (slot.state->conversion_specifier) {
+          case PRINTF_CONVERSION_I:
+          case PRINTF_CONVERSION_D:
+            sign = true;
+          case PRINTF_CONVERSION_O:
+          case PRINTF_CONVERSION_U:
+          case PRINTF_CONVERSION_x:
+          case PRINTF_CONVERSION_X:
+            /* If the bit width changes, we need to take signedness into account. */
+            if (arg->getType() != Type::getInt32Ty(module->getContext())) {
+              arg = builder->CreateIntCast(arg, Type::getInt32Ty(module->getContext()), sign);
+            }
+
+            /* Int to Int, just store. */
+            dst_type = Type::getInt32PtrTy(module->getContext(), 1);
+            sizeof_size = sizeof(int);
+            return true;
+
+          case PRINTF_CONVERSION_C:
+            /* Int to Char, add a conversion. */
+            arg = builder->CreateIntCast(arg, Type::getInt8Ty(module->getContext()), false);
+            dst_type = Type::getInt8PtrTy(module->getContext(), 1);
+            sizeof_size = sizeof(char);
+            return true;
+
+          case PRINTF_CONVERSION_F:
+          case PRINTF_CONVERSION_f:
+          case PRINTF_CONVERSION_E:
+          case PRINTF_CONVERSION_e:
+          case PRINTF_CONVERSION_G:
+          case PRINTF_CONVERSION_g:
+          case PRINTF_CONVERSION_A:
+          case PRINTF_CONVERSION_a:
+            printf("Warning: Have a float paramter for %%d like specifier, take care of it\n");
+            arg = builder->CreateSIToFP(arg, Type::getFloatTy(module->getContext()));
+            dst_type = Type::getFloatPtrTy(module->getContext(), 1);
+            sizeof_size = sizeof(float);
+            return true;
+
+          case PRINTF_CONVERSION_S:
+            /* Here the case is printf("xxx%s", 0); we should output "(null)". */
+            sizeof_size = 0;
+            slot.state->str = "(null)";
+            return true;
+
+          default:
+            return false;
+        }
+
+        break;
+      }
+
+      case Type::DoubleTyID:
+      case Type::FloatTyID: {
+        /* Because printf is a variadic function, its extra arguments have no
+           prototype, so the compiler always promotes a float argument to the
+           widest floating-point type. Hence the value we see here is always a double. */
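+        /* Illustrative note (an assumption added for clarity, not from the original
+           source): a call such as printf("%f", 1.0f) therefore reaches this pass
+           with a double argument due to default argument promotion. */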
+        switch (slot.state->conversion_specifier) {
+          case PRINTF_CONVERSION_I:
+          case PRINTF_CONVERSION_D:
+            /* Float to Int, add a conversion. */
+            printf("Warning: Have a int paramter for %%f like specifier, take care of it\n");
+            arg = builder->CreateFPToSI(arg, Type::getInt32Ty(module->getContext()));
+            dst_type = Type::getInt32PtrTy(module->getContext(), 1);
+            sizeof_size = sizeof(int);
+            return true;
+
+          case PRINTF_CONVERSION_O:
+          case PRINTF_CONVERSION_U:
+          case PRINTF_CONVERSION_x:
+          case PRINTF_CONVERSION_X:
+            /* Float to uint, add a conversion. */
+            printf("Warning: Have a uint paramter for %%f like specifier, take care of it\n");
+            arg = builder->CreateFPToUI(arg, Type::getInt32Ty(module->getContext()));
+            dst_type = Type::getInt32PtrTy(module->getContext(), 1);
+            sizeof_size = sizeof(int);
+            return true;
+
+          case PRINTF_CONVERSION_F:
+          case PRINTF_CONVERSION_f:
+          case PRINTF_CONVERSION_E:
+          case PRINTF_CONVERSION_e:
+          case PRINTF_CONVERSION_G:
+          case PRINTF_CONVERSION_g:
+          case PRINTF_CONVERSION_A:
+          case PRINTF_CONVERSION_a:
+            arg = builder->CreateFPCast(arg, Type::getFloatTy(module->getContext()));
+            dst_type = Type::getFloatPtrTy(module->getContext(), 1);
+            sizeof_size = sizeof(float);
+            return true;
+
+          default:
+            return false;
+        }
+
+        break;
+      }
+
+      /* %p and %s */
+      case Type::PointerTyID:
+        switch (slot.state->conversion_specifier) {
+          case PRINTF_CONVERSION_S: {
+            llvm::Constant* arg0 = dyn_cast<llvm::ConstantExpr>(arg);
+            llvm::Constant* arg0_ptr = dyn_cast<llvm::Constant>(arg0->getOperand(0));
+            if (!arg0_ptr) {
+              return false;
+            }
+
+            ConstantDataSequential* fmt_arg = dyn_cast<ConstantDataSequential>(arg0_ptr->getOperand(0));
+            if (!fmt_arg || !fmt_arg->isCString()) {
+              return false;
+            }
+            sizeof_size = 0;
+            slot.state->str = fmt_arg->getAsCString();
+            return true;
+          }
+          case PRINTF_CONVERSION_P: {
+            arg = builder->CreatePtrToInt(arg, Type::getInt32Ty(module->getContext()));
+            dst_type = arg->getType()->getPointerTo(1);
+            sizeof_size = sizeof(int);
+            return true;
+          }
+          default:
+            return false;
+        }
+
+        break;
+
+      case Type::VectorTyID: {
+        Type* vect_type = arg->getType();
+        Type* elt_type = vect_type->getVectorElementType();
+        int vec_num = vect_type->getVectorNumElements();
+        bool sign = false;
+
+        if (vec_num != slot.state->vector_n) {
+          return false;
+        }
+
+        switch (slot.state->conversion_specifier) {
+          case PRINTF_CONVERSION_I:
+          case PRINTF_CONVERSION_D:
+            sign = true;
+          case PRINTF_CONVERSION_O:
+          case PRINTF_CONVERSION_U:
+          case PRINTF_CONVERSION_x:
+          case PRINTF_CONVERSION_X:
+            if (elt_type->getTypeID() != Type::IntegerTyID)
+              return false;
+
+            /* If the bit width changes, we need to take signedness into account. */
+            if (elt_type != Type::getInt32Ty(elt_type->getContext())) {
+              Value *II = NULL;
+              for (int i = 0; i < vec_num; i++) {
+                Value *vec = II ? II : UndefValue::get(VectorType::get(Type::getInt32Ty(elt_type->getContext()), vec_num));
+                Value *cv = ConstantInt::get(Type::getInt32Ty(elt_type->getContext()), i);
+                Value *org = builder->CreateExtractElement(arg, cv);
+                Value *cvt = builder->CreateIntCast(org, Type::getInt32Ty(module->getContext()), sign);
+                II = builder->CreateInsertElement(vec, cvt, cv);
+              }
+              arg = II;
+            }
+
+            dst_type = arg->getType()->getPointerTo(1);
+            sizeof_size = sizeof(int) * vec_num;
+            return true;
+
+          case PRINTF_CONVERSION_F:
+          case PRINTF_CONVERSION_f:
+          case PRINTF_CONVERSION_E:
+          case PRINTF_CONVERSION_e:
+          case PRINTF_CONVERSION_G:
+          case PRINTF_CONVERSION_g:
+          case PRINTF_CONVERSION_A:
+          case PRINTF_CONVERSION_a:
+            if (elt_type->getTypeID() != Type::DoubleTyID && elt_type->getTypeID() != Type::FloatTyID)
+              return false;
+
+            if (elt_type->getTypeID() != Type::FloatTyID) {
+              Value *II = NULL;
+              for (int i = 0; i < vec_num; i++) {
+                Value *vec = II ? II : UndefValue::get(VectorType::get(Type::getFloatTy(elt_type->getContext()), vec_num));
+                Value *cv = ConstantInt::get(Type::getInt32Ty(elt_type->getContext()), i);
+                Value *org = builder->CreateExtractElement(arg, cv);
+                Value* cvt  = builder->CreateFPCast(org, Type::getFloatTy(module->getContext()));
+                II = builder->CreateInsertElement(vec, cvt, cv);
+              }
+              arg = II;
+            }
+        }
+        dst_type = arg->getType()->getPointerTo(1);
+        sizeof_size = sizeof(int) * vec_num;
+        return true;
+      }
+
+      default:
+        return false;
+    }
+
+    return false;
+  }
+
+  map<CallInst*, PrintfSet::PrintfFmt*> PrintfParser::printfs;
+
+  void* getPrintfInfo(CallInst* inst)
+  {
+    if (PrintfParser::printfs[inst])
+      return (void*)PrintfParser::printfs[inst];
+    return NULL;
+  }
+
+  FunctionPass* createPrintfParserPass()
+  {
+    return new PrintfParser();
+  }
+  char PrintfParser::ID = 0;
+
+} // end namespace
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
new file mode 100644
index 0000000..3e48fbf
--- /dev/null
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -0,0 +1,878 @@
+/**
+ * \file llvm_scalarize.cpp
+ *
+ * This file is derived from:
+ *  https://code.google.com/p/lunarglass/source/browse/trunk/Core/Passes/Transforms/Scalarize.cpp?r=903
+ */
+
+//===- Scalarize.cpp - Scalarize LunarGLASS IR ----------------------------===//
+//
+// LunarGLASS: An Open Modular Shader Compiler Architecture
+// Copyright (C) 2010-2014 LunarG, Inc.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+//     Redistributions of source code must retain the above copyright
+//     notice, this list of conditions and the following disclaimer.
+//
+//     Redistributions in binary form must reproduce the above
+//     copyright notice, this list of conditions and the following
+//     disclaimer in the documentation and/or other materials provided
+//     with the distribution.
+//
+//     Neither the name of LunarG Inc. nor the names of its
+//     contributors may be used to endorse or promote products derived
+//     from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+// COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//===----------------------------------------------------------------------===//
+//
+// Author: Michael Ilseman, LunarG
+//
+//===----------------------------------------------------------------------===//
+//
+// Scalarize the IR.
+//   * Loads of uniforms become multiple loadComponent calls
+//
+//   * Reads/writes become read/writeComponent calls
+//
+//   * Component-wise operations become multiple ops over each component
+//
+//   * Texture calls become recomposed texture calls
+//
+//   * Vector ops disappear, with their users referring to the scalarized
+//     components
+//
+//===----------------------------------------------------------------------===//
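+//
+// Illustrative example (not part of the original header): a component-wise op like
+//     %v = fadd <4 x float> %a, %b
+// is rewritten into four scalar fadds, one per component, and later users of %v
+// refer to those per-component values instead of the vector.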
+
+#include "llvm/Config/llvm-config.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+
+#if LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CFG.h"
+#else
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#endif
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "sys/map.hpp"
+
+using namespace llvm;
+
+namespace gbe {
+
+  struct VectorValues {
+    VectorValues() : vals()
+    { }
+
+    void setComponent(int c, llvm::Value* val)
+    {
+      assert(c >= 0 && c < 16 && "Out of bounds component");
+      vals[c] = val;
+    }
+    llvm::Value* getComponent(int c)
+    {
+      assert(c >= 0 && c < 16 && "Out of bounds component");
+      assert(vals[c] && "Requesting non-existing component");
+      return vals[c];
+    }
+
+    // One Value* per vector component (up to 16), e.g. {x, y, z, w, ...}
+    llvm::Value* vals[16];
+  };
+
+  class Scalarize : public FunctionPass {
+
+  public:
+    // Standard pass stuff
+    static char ID;
+
+    Scalarize() : FunctionPass(ID)
+    {
+      initializeLoopInfoPass(*PassRegistry::getPassRegistry());
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+      initializeDominatorTreeWrapperPassPass(*PassRegistry::getPassRegistry());
+#else
+      initializeDominatorTreePass(*PassRegistry::getPassRegistry());
+#endif
+    }
+
+    virtual bool runOnFunction(Function&);
+    void print(raw_ostream&, const Module* = 0) const;
+    virtual void getAnalysisUsage(AnalysisUsage&) const;
+
+  protected:
+    // An instruction is valid post-scalarization iff it is fully scalar or it
+    // is a gla_loadn
+    bool isValid(const Instruction*);
+
+    // Take an instruction that produces a vector, and scalarize it
+    bool scalarize(Instruction*);
+    bool scalarizePerComponent(Instruction*);
+    bool scalarizeBitCast(BitCastInst *);
+    bool scalarizeFuncCall(CallInst *);
+    bool scalarizeLoad(LoadInst*);
+    bool scalarizeStore(StoreInst*);
+    //bool scalarizeIntrinsic(IntrinsicInst*);
+    bool scalarizeExtract(ExtractElementInst*);
+    bool scalarizeInsert(InsertElementInst*);
+    bool scalarizeShuffleVector(ShuffleVectorInst*);
+    bool scalarizePHI(PHINode*);
+    void scalarizeArgs(Function& F);
+    // ...
+
+    // Helpers to make the actual multiple scalar calls, one per
+    // component. Updates the given VectorValues's components with the new
+    // Values.
+    void makeScalarizedCalls(Function*, ArrayRef<Value*>, int numComponents, VectorValues&);
+
+    void makePerComponentScalarizedCalls(Instruction*, ArrayRef<Value*>);
+
+    // Makes a scalar form of the given instruction: replaces the operands
+    // and chooses a correct return type
+    Instruction* createScalarInstruction(Instruction* inst, ArrayRef<Value*>);
+
+    // Gather the specified components in the given values. Returns the
+    // component if the given value is a vector, or the scalar itself.
+    void gatherComponents(int component, ArrayRef<Value*> args, SmallVectorImpl<Value*>& componentArgs);
+
+    // Get the assigned component for that value. If the value is a scalar,
+    // returns the scalar. If it's a constant, returns that component. If
+    // it's an instruction, returns the vectorValues of that instruction for
+    // that component
+    Value* getComponent(int component, Value*);
+
+    // Used for assertion purposes. Whether we can get the component out with
+    // a getComponent call
+    bool canGetComponent(Value*);
+
+    // Used for assertion purposes. Whether for every operand we can get
+    // components with a getComponent call
+    bool canGetComponentArgs(User*);
+
+    // Delete the instructions in the deadList
+    void dce();
+
+
+    int GetConstantInt(const Value* value);
+    bool IsPerComponentOp(const Instruction* inst);
+    bool IsPerComponentOp(const Value* value);
+
+    // These functions add extract and insert instructions around loads/stores, etc.
+    void extractFromVector(Value* insn);
+    Value* InsertToVector(Value* insn, Value* vecValue);
+
+    Type* GetBasicType(Value* value) {
+      return GetBasicType(value->getType());
+    }
+
+    Type* GetBasicType(Type* type) {
+      switch(type->getTypeID()) {
+      case Type::VectorTyID:
+      case Type::ArrayTyID:
+        return GetBasicType(type->getContainedType(0));
+      default:
+        break;
+      }
+      return type;
+    }
+
+    int GetComponentCount(const Type* type)  {
+      if (type->getTypeID() == Type::VectorTyID)
+        return llvm::dyn_cast<VectorType>(type)->getNumElements();
+      else
+        return 1;
+    }
+
+    int GetComponentCount(const Value* value) {
+      return GetComponentCount(value->getType());
+    }
+
+    /* Set the builder to insert new instructions after the specified instruction. */
+    void setAppendPoint(Instruction *insn)  {
+      BasicBlock::iterator next(insn);
+      builder->SetInsertPoint(++next);
+    }
+
+    DenseMap<Value*, VectorValues> vectorVals;
+    Module* module;
+    IRBuilder<>* builder;
+
+    Type* intTy;
+    Type* floatTy;
+
+    std::vector<Instruction*> deadList;
+
+    // List of vector phis that were not completely scalarized because some
+    // of their operands had not yet been visited (i.e. loop-variant
+    // variables)
+    SmallVector<PHINode*, 16> incompletePhis;
+  };
+
+  Value* Scalarize::getComponent(int component, Value* v)
+  {
+    assert(canGetComponent(v) && "getComponent called on unhandled vector");
+
+    if (v->getType()->isVectorTy()) {
+      if (ConstantDataVector* c = dyn_cast<ConstantDataVector>(v)) {
+        return c->getElementAsConstant(component);
+      } else if (ConstantVector* c = dyn_cast<ConstantVector>(v)) {
+        return c->getOperand(component);
+      } else if (isa<ConstantAggregateZero>(v)) {
+        return Constant::getNullValue(GetBasicType(v));
+      } else if (isa<UndefValue>(v)) {
+        return UndefValue::get(GetBasicType(v));
+      } else {
+        return vectorVals[v].getComponent(component);
+      }
+    } else {
+      return v;
+    }
+  }
+
+  bool Scalarize::IsPerComponentOp(const llvm::Value* value)
+  {
+    const llvm::Instruction* inst = llvm::dyn_cast<const llvm::Instruction>(value);
+    return inst && IsPerComponentOp(inst);
+  }
+
+  bool Scalarize::IsPerComponentOp(const Instruction* inst)
+  {
+    //if (const IntrinsicInst* intr = dyn_cast<const IntrinsicInst>(inst))
+    //    return IsPerComponentOp(intr);
+
+    if (inst->isTerminator())
+        return false;
+
+    switch (inst->getOpcode()) {
+
+    // Cast ops are only per-component if they cast back to the same vector
+    // width
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::BitCast:
+      return GetComponentCount(inst->getOperand(0)) == GetComponentCount(inst);
+
+    // Vector ops
+    case Instruction::InsertElement:
+    case Instruction::ExtractElement:
+    case Instruction::ShuffleVector:
+
+    // Ways of accessing/loading/storing vectors
+    case Instruction::ExtractValue:
+    case Instruction::InsertValue:
+
+    // Memory ops
+    case Instruction::Alloca:
+    case Instruction::Load:
+    case Instruction::Store:
+    case Instruction::GetElementPtr:
+    // Phis are a little special. We consider them not to be per-component
+    // because the mechanism of choice is a single value (what path we took to
+    // get here), and doesn't choose per-component (as select would). The caller
+    // should know to handle phis specially
+    case Instruction::PHI:
+    // Call insts are conservatively treated as not per-component
+    case Instruction::Call:
+    // Misc
+    case Instruction::LandingPad:  //--- 3.0
+    case Instruction::VAArg:
+      return false;
+    } // end of switch (inst->getOpcode())
+
+    return true;
+  }
+  int Scalarize::GetConstantInt(const Value* value)
+  {
+    const ConstantInt *constantInt = dyn_cast<ConstantInt>(value);
+
+    // this might still be a constant expression, rather than a numeric constant,
+    // e.g., expression with undef's in it, so it was not folded
+    if (! constantInt)
+      NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("non-simple constant");
+
+    return constantInt->getValue().getSExtValue();
+  }
+  bool Scalarize::canGetComponent(Value* v)
+  {
+    if (v->getType()->isVectorTy()) {
+      if (isa<ConstantDataVector>(v) || isa<ConstantVector>(v) || isa<ConstantAggregateZero>(v) || isa<UndefValue>(v)) {
+        return true;
+      } else {
+        assert((isa<Instruction>(v) || isa<Argument>(v)) && "Non-constant non-instruction?");
+        return vectorVals.count(v);
+      }
+    } else {
+      return true;
+    }
+  }
+
+  bool Scalarize::canGetComponentArgs(User* u)
+  {
+    if (PHINode* phi = dyn_cast<PHINode>(u)) {
+      for (unsigned int i = 0; i < phi->getNumIncomingValues(); ++i)
+        if (!canGetComponent(phi->getIncomingValue(i)))
+          return false;
+    } else {
+      for (User::op_iterator i = u->op_begin(), e = u->op_end(); i != e; ++i)
+        if (!canGetComponent(*i))
+          return false;
+    }
+    return true;
+  }
+
+  void Scalarize::gatherComponents(int component, ArrayRef<Value*> args, SmallVectorImpl<Value*>& componentArgs)
+  {
+    componentArgs.clear();
+    for (ArrayRef<Value*>::iterator i = args.begin(), e = args.end(); i != e; ++i)
+      componentArgs.push_back(getComponent(component, *i));
+  }
+
+  Instruction* Scalarize::createScalarInstruction(Instruction* inst, ArrayRef<Value*> args)
+  {
+    // TODO: Refine the below into one large switch
+
+    unsigned op = inst->getOpcode();
+    if (inst->isCast()) {
+      assert(args.size() == 1 && "incorrect number of arguments for cast op");
+      return CastInst::Create((Instruction::CastOps)op, args[0], GetBasicType(inst));
+    }
+
+    if (inst->isBinaryOp()) {
+      assert(args.size() == 2 && "incorrect number of arguments for binary op");
+      return BinaryOperator::Create((Instruction::BinaryOps)op, args[0], args[1]);
+    }
+
+    if (PHINode* phi = dyn_cast<PHINode>(inst)) {
+      PHINode* res = PHINode::Create(GetBasicType(inst), phi->getNumIncomingValues());
+
+      // Loop over pairs of operands: [Value*, BasicBlock*]
+      for (unsigned int i = 0; i < args.size(); i++) {
+        BasicBlock* bb = phi->getIncomingBlock(i); //dyn_cast<BasicBlock>(args[i+1]);
+        //assert(bb && "Non-basic block incoming block?");
+        res->addIncoming(args[i], bb);
+      }
+
+      return res;
+    }
+
+    if (CmpInst* cmpInst = dyn_cast<CmpInst>(inst)) {
+      assert(args.size() == 2 && "incorrect number of arguments for comparison");
+      return CmpInst::Create(cmpInst->getOpcode(), cmpInst->getPredicate(), args[0], args[1]);
+    }
+
+    if (isa<SelectInst>(inst)) {
+      assert(args.size() == 3 && "incorrect number of arguments for select");
+      return SelectInst::Create(args[0], args[1], args[2]);
+    }
+
+    if (IntrinsicInst* intr = dyn_cast<IntrinsicInst>(inst)) {
+      if (! IsPerComponentOp(inst))
+        NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Scalarize instruction on a non-per-component intrinsic");
+
+      // TODO: Assumption is that all per-component intrinsics have all their
+      // arguments be overloadable. Need to find some way to assert on this
+      // assumption. This is due to how getDeclaration operates; it only takes
+      // a list of types that fit overloadable slots.
+      SmallVector<Type*, 8> tys(1, GetBasicType(inst->getType()));
+      // Call instructions have the decl as a last argument, so skip it
+      for (ArrayRef<Value*>::iterator i = args.begin(), e = args.end() - 1; i != e; ++i) {
+        tys.push_back(GetBasicType((*i)->getType()));
+      }
+
+      Function* f = Intrinsic::getDeclaration(module, intr->getIntrinsicID(), tys);
+      return CallInst::Create(f, args);
+    }
+
+    NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Currently unsupported instruction: ", inst->getOpcode(),
+                     //             inst->getOpcodeName());
+    return 0;
+
+  }
+
+
+  void Scalarize::makeScalarizedCalls(Function* f, ArrayRef<Value*> args, int count, VectorValues& vVals)
+  {
+    assert(count > 0 && count <= 16 && "invalid number of vector components");
+    for (int i = 0; i < count; ++i) {
+      Value* res;
+      SmallVector<Value*, 8> callArgs(args.begin(), args.end());
+      callArgs.push_back(ConstantInt::get(intTy, i));
+
+      res = builder->CreateCall(f, callArgs);
+      vVals.setComponent(i, res);
+    }
+  }
+
+  void Scalarize::makePerComponentScalarizedCalls(Instruction* inst, ArrayRef<Value*> args)
+  {
+    int count = GetComponentCount(inst);
+    assert(count > 0 && count <= 16 && "invalid number of vector components");
+    assert((inst->getNumOperands() == args.size() || isa<PHINode>(inst))
+           && "not enough arguments passed for instruction");
+
+    VectorValues& vVals = vectorVals[inst];
+
+    for (int i = 0; i < count; ++i) {
+      // Gather this component from each arg
+      SmallVector<Value*, 8> callArgs(args.size(), 0);
+      gatherComponents(i, args, callArgs);
+
+      Instruction* res = createScalarInstruction(inst, callArgs);
+
+      vVals.setComponent(i, res);
+      builder->Insert(res);
+    }
+  }
+
+  bool Scalarize::isValid(const Instruction* inst)
+  {
+    // The result
+    if (inst->getType()->isVectorTy())
+        return false;
+
+    // The arguments
+    for (Instruction::const_op_iterator i = inst->op_begin(), e = inst->op_end(); i != e; ++i) {
+      const Value* v = (*i);
+      assert(v);
+      if (v->getType()->isVectorTy())
+        return false;
+    }
+
+    return true;
+  }
+
+  bool Scalarize::scalarize(Instruction* inst)
+  {
+    if (isValid(inst))
+        return false;
+
+    assert(! vectorVals.count(inst) && "We've already scalarized this somehow?");
+    assert((canGetComponentArgs(inst) || isa<PHINode>(inst)) &&
+           "Scalarizing an op whose arguments haven't been scalarized ");
+    builder->SetInsertPoint(inst);
+
+    if (IsPerComponentOp(inst))
+      return scalarizePerComponent(inst);
+
+    // Not a per-component bitcast, for example <2 x i8> -> i16; handle it in the backend
+    if (BitCastInst* bt = dyn_cast<BitCastInst>(inst))
+      return scalarizeBitCast(bt);
+
+    if (LoadInst* ld = dyn_cast<LoadInst>(inst))
+      return scalarizeLoad(ld);
+
+    if (CallInst* call = dyn_cast<CallInst>(inst))
+      return scalarizeFuncCall(call);
+
+    if (ExtractElementInst* extr = dyn_cast<ExtractElementInst>(inst))
+      return scalarizeExtract(extr);
+
+    if (InsertElementInst* ins = dyn_cast<InsertElementInst>(inst))
+      return scalarizeInsert(ins);
+
+    if (ShuffleVectorInst* sv = dyn_cast<ShuffleVectorInst>(inst))
+      return scalarizeShuffleVector(sv);
+
+    if (PHINode* phi = dyn_cast<PHINode>(inst))
+      return scalarizePHI(phi);
+
+    if (isa<ExtractValueInst>(inst) || isa<InsertValueInst>(inst))
+      // TODO: need to come up with a struct/array model for scalarization
+      NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Scalarizing struct/array ops");
+
+    if (StoreInst* st = dyn_cast<StoreInst>(inst))
+      return scalarizeStore(st);
+
+    NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Currently unhandled instruction ", inst->getOpcode(), inst->getOpcodeName());
+    return false;
+  }
+
+  bool Scalarize::scalarizeShuffleVector(ShuffleVectorInst* sv)
+  {
+    //     %res = shufflevector <n x ty> %foo, <n x ty> %bar, <n x i32> <...>
+    // ==> nothing (just make a new VectorValues with the new components)
+    VectorValues& vVals = vectorVals[sv];
+
+    int size = GetComponentCount(sv);
+    int srcSize = GetComponentCount(sv->getOperand(0)->getType());
+
+    for (int i = 0; i < size; ++i) {
+      int select = sv->getMaskValue(i);
+
+      if (select < 0) {
+        vVals.setComponent(i, UndefValue::get(GetBasicType(sv->getOperand(0))));
+        continue;
+      }
+
+      // Otherwise look up the corresponding component from the correct
+      // source.
+      Value* selectee;
+      if (select < srcSize) {
+        selectee = sv->getOperand(0);
+      } else {
+        // Choose from the second operand
+        select -= srcSize;
+        selectee = sv->getOperand(1);
+      }
+
+      vVals.setComponent(i, getComponent(select, selectee));
+    }
+
+    return true;
+  }
+
+  bool Scalarize::scalarizePerComponent(Instruction* inst)
+  {
+    //     dst  = op <n x ty> %foo, <n x ty> %bar
+    // ==> dstx = op ty %foox, ty %barx
+    //     dsty = op ty %fooy, ty %bary
+    //     ...
+
+    SmallVector<Value*, 16> args(inst->op_begin(), inst->op_end());
+
+    makePerComponentScalarizedCalls(inst, args);
+
+    return true;
+  }
+
+  bool Scalarize::scalarizePHI(PHINode* phi)
+  {
+    //     dst = phi <n x ty> [ %foo, %bb1 ], [ %bar, %bb2], ...
+    // ==> dstx = phi ty [ %foox, %bb1 ], [ %barx, %bb2], ...
+    //     dsty = phi ty [ %fooy, %bb1 ], [ %bary, %bb2], ...
+
+    // If the scalar values are all known up-front, then just make the full
+    // phinode now. If they are not yet known (phinode for a loop variant
+    // variable), then defer the arguments until later
+
+    if (canGetComponentArgs(phi)) {
+      SmallVector<Value*, 8> args(phi->op_begin(), phi->op_end());
+      makePerComponentScalarizedCalls(phi, args);
+    } else {
+      makePerComponentScalarizedCalls(phi, ArrayRef<Value*>());
+      incompletePhis.push_back(phi);
+    }
+
+    return true;
+  }
+
+  void Scalarize::extractFromVector(Value* insn) {
+    VectorValues& vVals = vectorVals[insn];
+
+    for (int i = 0; i < GetComponentCount(insn); ++i) {
+      Value *cv = ConstantInt::get(intTy, i);
+      Value *EI = builder->CreateExtractElement(insn, cv);
+      vVals.setComponent(i, EI);
+    }
+  }
+
+  Value* Scalarize::InsertToVector(Value * insn, Value* vecValue) {
+    //VectorValues& vVals = vectorVals[writeValue];
+
+    // Add fake insert instructions so the scalarized components are not removed as dead
+    Value *II = NULL;
+    for (int i = 0; i < GetComponentCount(vecValue); ++i) {
+      Value *vec = II ? II : UndefValue::get(vecValue->getType());
+      Value *cv = ConstantInt::get(intTy, i);
+      II = builder->CreateInsertElement(vec, getComponent(i, vecValue), cv);
+    }
+
+    return II;
+  }
+
+  bool Scalarize::scalarizeFuncCall(CallInst* call) {
+    if (Function *F = call->getCalledFunction()) {
+      if (F->getIntrinsicID() != 0) {   //Intrinsic functions
+        NOT_IMPLEMENTED;
+      } else {
+        Value *Callee = call->getCalledValue();
+        const std::string fnName = Callee->getName();
+        auto it = instrinsicMap.map.find(fnName);
+        GBE_ASSERT(it != instrinsicMap.map.end());
+
+        // Get the function arguments
+        CallSite CS(call);
+        CallSite::arg_iterator CI = CS.arg_begin() + 2;
+
+        switch (it->second) {
+          default: break;
+          case GEN_OCL_READ_IMAGE_I_1D:
+          case GEN_OCL_READ_IMAGE_UI_1D:
+          case GEN_OCL_READ_IMAGE_F_1D:
+          case GEN_OCL_READ_IMAGE_I_2D:
+          case GEN_OCL_READ_IMAGE_UI_2D:
+          case GEN_OCL_READ_IMAGE_F_2D:
+          case GEN_OCL_READ_IMAGE_I_3D:
+          case GEN_OCL_READ_IMAGE_UI_3D:
+          case GEN_OCL_READ_IMAGE_F_3D:
+
+          case GEN_OCL_READ_IMAGE_I_1D_I:
+          case GEN_OCL_READ_IMAGE_UI_1D_I:
+          case GEN_OCL_READ_IMAGE_F_1D_I:
+          case GEN_OCL_READ_IMAGE_I_2D_I:
+          case GEN_OCL_READ_IMAGE_UI_2D_I:
+          case GEN_OCL_READ_IMAGE_F_2D_I:
+          case GEN_OCL_READ_IMAGE_I_3D_I:
+          case GEN_OCL_READ_IMAGE_UI_3D_I:
+          case GEN_OCL_READ_IMAGE_F_3D_I:
+          case GEN_OCL_GET_IMAGE_WIDTH:
+          case GEN_OCL_GET_IMAGE_HEIGHT:
+          {
+            setAppendPoint(call);
+            extractFromVector(call);
+            break;
+          }
+          case GEN_OCL_WRITE_IMAGE_I_3D:
+          case GEN_OCL_WRITE_IMAGE_UI_3D:
+          case GEN_OCL_WRITE_IMAGE_F_3D:
+            CI++;
+          case GEN_OCL_WRITE_IMAGE_I_2D:
+          case GEN_OCL_WRITE_IMAGE_UI_2D:
+          case GEN_OCL_WRITE_IMAGE_F_2D:
+            CI++;
+          case GEN_OCL_WRITE_IMAGE_I_1D:
+          case GEN_OCL_WRITE_IMAGE_UI_1D:
+          case GEN_OCL_WRITE_IMAGE_F_1D:
+          {
+            *CI = InsertToVector(call, *CI);
+            break;
+          }
+        }
+      }
+    }
+    return false;
+  }
+
+  bool Scalarize::scalarizeBitCast(BitCastInst* bt)
+  {
+    if(bt->getOperand(0)->getType()->isVectorTy())
+      bt->setOperand(0, InsertToVector(bt, bt->getOperand(0)));
+    if(bt->getType()->isVectorTy()) {
+      setAppendPoint(bt);
+      extractFromVector(bt);
+    }
+    return false;
+  }
+
+  bool Scalarize::scalarizeLoad(LoadInst* ld)
+  {
+    setAppendPoint(ld);
+    extractFromVector(ld);
+    return false;
+  }
+
+  bool Scalarize::scalarizeStore(StoreInst* st) {
+    st->setOperand(0, InsertToVector(st, st->getValueOperand()));
+    return false;
+  }
+
+  bool Scalarize::scalarizeExtract(ExtractElementInst* extr)
+  {
+    //     %res = extractelement <n X ty> %foo, %i
+    // ==> nothing (just use %foo's %ith component instead of %res)
+
+    if (! isa<Constant>(extr->getOperand(1))) {
+        // TODO: Variably referenced components. Probably handle/emulate through
+        // a series of selects.
+        NOT_IMPLEMENTED; //gla::UnsupportedFunctionality("Variably referenced vector components");
+    }
+    //if (isa<Argument>(extr->getOperand(0)))
+    //  return false;
+    int component = GetConstantInt(extr->getOperand(1));
+    Value* v = getComponent(component, extr->getOperand(0));
+    if(extr == v)
+      return false;
+    extr->replaceAllUsesWith(v);
+
+    return true;
+  }
+
+  bool Scalarize::scalarizeInsert(InsertElementInst* ins)
+  {
+    //     %res = insertelement <n x ty> %foo, ty %elt, i32 %i
+    // ==> nothing (just make a new VectorValues with the new component)
+
+    if (! isa<Constant>(ins->getOperand(2))) {
+      // TODO: Variably referenced components. Probably handle/emulate through
+      // a series of selects.
+      NOT_IMPLEMENTED;   //gla::UnsupportedFunctionality("Variably referenced vector components");
+    }
+
+    int component = GetConstantInt(ins->getOperand(2));
+
+    VectorValues& vVals = vectorVals[ins];
+    for (int i = 0; i < GetComponentCount(ins); ++i) {
+      vVals.setComponent(i, i == component ? ins->getOperand(1)
+                                           : getComponent(i, ins->getOperand(0)));
+    }
+
+    return true;
+  }
+
+  void Scalarize::scalarizeArgs(Function& F)  {
+    if (F.arg_empty())
+      return;
+    ReversePostOrderTraversal<Function*> rpot(&F);
+    BasicBlock::iterator instI = (*rpot.begin())->begin();
+    builder->SetInsertPoint(instI);
+
+    Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
+
+    for (; I != E; ++I) {
+      Type *type = I->getType();
+
+      if(type->isVectorTy())
+        extractFromVector(I);
+    }
+    return;
+  }
+
+  bool Scalarize::runOnFunction(Function& F)
+  {
+    switch (F.getCallingConv()) {
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+    case CallingConv::PTX_Device:
+      return false;
+    case CallingConv::PTX_Kernel:
+#else
+    case CallingConv::C:
+#endif
+      break;
+    default: GBE_ASSERTM(false, "Unsupported calling convention");
+    }
+
+    // Since we inline all function calls, we can skip non-kernel functions
+    bool bKernel = isKernelFunction(F);
+    if(!bKernel) return false;
+
+    bool changed = false;
+    module = F.getParent();
+    intTy = IntegerType::get(module->getContext(), 32);
+    floatTy = Type::getFloatTy(module->getContext());
+    builder = new IRBuilder<>(module->getContext());
+
+    scalarizeArgs(F);
+    typedef ReversePostOrderTraversal<Function*> RPOTType;
+    RPOTType rpot(&F);
+    for (RPOTType::rpo_iterator bbI = rpot.begin(), bbE = rpot.end(); bbI != bbE; ++bbI) {
+      for (BasicBlock::iterator instI = (*bbI)->begin(), instE = (*bbI)->end(); instI != instE; ++instI) {
+        bool scalarized = scalarize(instI);
+        if (scalarized) {
+          changed = true;
+          // TODO: uncomment when done
+          deadList.push_back(instI);
+        }
+      }
+    }
+
+    // Fill in the incomplete phis
+    for (SmallVectorImpl<PHINode*>::iterator phiI = incompletePhis.begin(), phiE = incompletePhis.end();
+       phiI != phiE; ++phiI) {
+      assert(canGetComponentArgs(*phiI) && "Phi's operands never scalarized");
+      // Fill in each component of this phi
+      VectorValues& vVals = vectorVals[*phiI];
+      for (int c = 0; c < GetComponentCount(*phiI); ++c) {
+        PHINode* compPhi = dyn_cast<PHINode>(vVals.getComponent(c));
+        assert(compPhi && "Vector phi got scalarized to non-phis?");
+
+        // Loop over pairs of operands: [Value*, BasicBlock*]
+        for (unsigned int i = 0; i < (*phiI)->getNumOperands(); i++) {
+          BasicBlock* bb = (*phiI)->getIncomingBlock(i);
+          assert(bb && "Non-basic block incoming block?");
+          compPhi->addIncoming(getComponent(c, (*phiI)->getOperand(i)), bb);
+        }
+      }
+    }
+
+    dce();
+
+    incompletePhis.clear();
+    vectorVals.clear();
+
+    delete builder;
+    builder = 0;
+
+    return changed;
+  }
+
+  void Scalarize::dce()
+  {
+    // Two delete passes are needed for some phi nodes
+    for (std::vector<Instruction*>::reverse_iterator i = deadList.rbegin(), e = deadList.rend(); i != e; ++i) {
+      (*i)->dropAllReferences();
+      if((*i)->use_empty()) {
+        (*i)->eraseFromParent();
+        (*i) = NULL;
+      }
+    }
+    for (std::vector<Instruction*>::reverse_iterator i = deadList.rbegin(), e = deadList.rend(); i != e; ++i) {
+      if((*i) && (*i)->getParent())
+        (*i)->eraseFromParent();
+    }
+    deadList.clear();
+  }
+
+  void Scalarize::getAnalysisUsage(AnalysisUsage& AU) const
+  {
+  }
+
+  void Scalarize::print(raw_ostream&, const Module*) const
+  {
+      return;
+  }
+  FunctionPass* createScalarizePass()
+  {
+    return new Scalarize();
+  }
+  char Scalarize::ID = 0;
+
+} // end namespace
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
new file mode 100644
index 0000000..84ba383
--- /dev/null
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -0,0 +1,252 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file llvm_to_gen.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/DataLayout.h"
+#else
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/DataLayout.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/ADT/Triple.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
+#include "llvm/Support/IRReader.h"
+#else
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/SourceMgr.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/Verifier.h"
+#else
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Assembly/PrintModulePass.h"
+#endif
+
+#include "llvm/Analysis/CFGPrinter.h"
+#include "llvm/llvm_gen_backend.hpp"
+#include "llvm/llvm_to_gen.hpp"
+#include "sys/cvar.hpp"
+#include "sys/platform.hpp"
+
+#include <clang/CodeGen/CodeGenAction.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <memory>
+
+namespace gbe
+{
+  BVAR(OCL_OUTPUT_LLVM, false);
+  BVAR(OCL_OUTPUT_CFG, false);
+  BVAR(OCL_OUTPUT_CFG_ONLY, false);
+  BVAR(OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS, false);
+  using namespace llvm;
+
+  void runFuntionPass(Module &mod, TargetLibraryInfo *libraryInfo, const DataLayout &DL)
+  {
+    FunctionPassManager FPM(&mod);
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+    FPM.add(new DataLayoutPass(DL));
+#else
+    FPM.add(new DataLayout(DL));
+#endif
+
+    // XXX The verifier pass is removed to work around a non-fatal error:
+    // adding this pass causes Clang to abort with the following error message:
+    // "Global is external, but doesn't have external or weak linkage"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
+    //FPM.add(createVerifierPass(true));
+#else
+    //FPM.add(createVerifierPass());
+#endif
+    FPM.add(new TargetLibraryInfo(*libraryInfo));
+    FPM.add(createTypeBasedAliasAnalysisPass());
+    FPM.add(createBasicAliasAnalysisPass());
+    FPM.add(createCFGSimplificationPass());
+    FPM.add(createSROAPass());
+    FPM.add(createEarlyCSEPass());
+    FPM.add(createLowerExpectIntrinsicPass());
+
+    FPM.doInitialization();
+    for (Module::iterator I = mod.begin(),
+           E = mod.end(); I != E; ++I)
+      if (!I->isDeclaration())
+        FPM.run(*I);
+    FPM.doFinalization();
+  }
+
+  void runModulePass(Module &mod, TargetLibraryInfo *libraryInfo, const DataLayout &DL, int optLevel)
+  {
+    llvm::PassManager MPM;
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+    MPM.add(new DataLayoutPass(DL));
+#else
+    MPM.add(new DataLayout(DL));
+#endif
+    MPM.add(new TargetLibraryInfo(*libraryInfo));
+    MPM.add(createTypeBasedAliasAnalysisPass());
+    MPM.add(createBasicAliasAnalysisPass());
+    MPM.add(createGlobalOptimizerPass());     // Optimize out global vars
+
+    MPM.add(createIPSCCPPass());              // IP SCCP
+    MPM.add(createDeadArgEliminationPass());  // Dead argument elimination
+
+    MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
+    MPM.add(createCFGSimplificationPass());   // Clean up after IPCP & DAE
+    MPM.add(createPruneEHPass());             // Remove dead EH info
+    MPM.add(createBarrierNodupPass(false));   // remove noduplicate fnAttr before inlining.
+    MPM.add(createFunctionInliningPass(200000));
+    MPM.add(createBarrierNodupPass(true));    // restore noduplicate fnAttr after inlining.
+    MPM.add(createFunctionAttrsPass());       // Set readonly/readnone attrs
+
+    //MPM.add(createScalarReplAggregatesPass(64, true, -1, -1, 64))
+    if(optLevel > 0)
+      MPM.add(createSROAPass(/*RequiresDomTree*/ false));
+    MPM.add(createEarlyCSEPass());              // Catch trivial redundancies
+    MPM.add(createJumpThreadingPass());         // Thread jumps.
+    MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
+    MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
+    MPM.add(createInstructionCombiningPass());  // Combine silly seq's
+
+    MPM.add(createTailCallEliminationPass());   // Eliminate tail calls
+    MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
+    MPM.add(createReassociatePass());           // Reassociate expressions
+    MPM.add(createLoopRotatePass());            // Rotate Loop
+    MPM.add(createLICMPass());                  // Hoist loop invariants
+    MPM.add(createLoopUnswitchPass(true));
+    MPM.add(createInstructionCombiningPass());
+    MPM.add(createIndVarSimplifyPass());        // Canonicalize indvars
+    MPM.add(createLoopIdiomPass());             // Recognize idioms like memset.
+    MPM.add(createLoopDeletionPass());          // Delete dead loops
+    MPM.add(createLoopUnrollPass());          // Unroll small loops
+    if(optLevel > 0)
+      MPM.add(createGVNPass());                 // Remove redundancies
+    MPM.add(createMemCpyOptPass());             // Remove memcpy / form memset
+    MPM.add(createSCCPPass());                  // Constant prop with SCCP
+
+    // Run instcombine after redundancy elimination to exploit opportunities
+    // opened up by them.
+    MPM.add(createInstructionCombiningPass());
+    MPM.add(createJumpThreadingPass());         // Thread jumps
+    MPM.add(createCorrelatedValuePropagationPass());
+    MPM.add(createDeadStoreEliminationPass());  // Delete dead stores
+    MPM.add(createAggressiveDCEPass());         // Delete dead instructions
+    MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+    MPM.add(createInstructionCombiningPass());  // Clean up after everything.
+    MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
+    if(optLevel > 0) {
+      MPM.add(createGlobalDCEPass());         // Remove dead fns and globals.
+      MPM.add(createConstantMergePass());     // Merge dup global constants
+    }
+
+    MPM.run(mod);
+  }
+
+  bool llvmToGen(ir::Unit &unit, const char *fileName,const void* module, int optLevel)
+  {
+    std::string errInfo;
+    std::unique_ptr<llvm::raw_fd_ostream> o = NULL;
+    if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS || OCL_OUTPUT_LLVM)
+      o = std::unique_ptr<llvm::raw_fd_ostream>(new llvm::raw_fd_ostream(fileno(stdout), false));
+
+    // Get the module from its file
+    llvm::SMDiagnostic Err;
+    std::auto_ptr<Module> M;
+    if(fileName){
+      // only when module is null, Get the global LLVM context
+      llvm::LLVMContext& c = llvm::getGlobalContext();
+      M.reset(ParseIRFile(fileName, Err, c));
+      if (M.get() == 0) return false;
+    }
+    Module &mod = (module!=NULL)?*(llvm::Module*)module:*M.get();
+    DataLayout DL(&mod);
+
+    Triple TargetTriple(mod.getTargetTriple());
+    TargetLibraryInfo *libraryInfo = new TargetLibraryInfo(TargetTriple);
+    libraryInfo->disableAllFunctions();
+
+    runFuntionPass(mod, libraryInfo, DL);
+    runModulePass(mod, libraryInfo, DL, optLevel);
+
+    llvm::PassManager passes;
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+    passes.add(new DataLayoutPass(DL));
+#else
+    passes.add(new DataLayout(DL));
+#endif
+    // Print the code before further optimizations
+    if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+      passes.add(createPrintModulePass(*o));
+#else
+      passes.add(createPrintModulePass(&*o));
+#endif
+    passes.add(createIntrinsicLoweringPass());
+    passes.add(createFunctionInliningPass(200000));
+    passes.add(createScalarReplAggregatesPass(64, true, -1, -1, 64));
+    passes.add(createLoadStoreOptimizationPass());
+    passes.add(createRemoveGEPPass(unit));
+    passes.add(createConstantPropagationPass());
+    passes.add(createLowerSwitchPass());
+    passes.add(createPromoteMemoryToRegisterPass());
+    if(optLevel > 0)
+      passes.add(createGVNPass());                  // Remove redundancies
+    passes.add(createPrintfParserPass());
+    passes.add(createScalarizePass());        // Expand all vector ops
+    passes.add(createDeadInstEliminationPass());  // Remove simplified instructions
+    passes.add(createCFGSimplificationPass());     // Merge & remove BBs
+    passes.add(createScalarizePass());        // Expand all vector ops
+
+    if(OCL_OUTPUT_CFG)
+      passes.add(createCFGPrinterPass());
+    if(OCL_OUTPUT_CFG_ONLY)
+      passes.add(createCFGOnlyPrinterPass());
+    passes.add(createGenPass(unit));
+
+    // Print the code after the extra optimization passes
+    if (OCL_OUTPUT_LLVM)
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+      passes.add(createPrintModulePass(*o));
+#else
+      passes.add(createPrintModulePass(&*o));
+#endif
+    passes.run(mod);
+    return true;
+  }
+} /* namespace gbe */
diff --git a/backend/src/llvm/llvm_to_gen.hpp b/backend/src/llvm/llvm_to_gen.hpp
new file mode 100644
index 0000000..41e3477
--- /dev/null
+++ b/backend/src/llvm/llvm_to_gen.hpp
@@ -0,0 +1,40 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file llvm_to_gen.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_IR_LLVM_TO_GEN_HPP__
+#define __GBE_IR_LLVM_TO_GEN_HPP__
+
+namespace gbe {
+  namespace ir {
+    // The code is output into an IR unit
+    class Unit;
+  } /* namespace ir */
+
+  /*! Convert the LLVM IR code to GEN IR code.
+      optLevel 0 corresponds to clang -O1 and 1 corresponds to clang -O2. */
+  bool llvmToGen(ir::Unit &unit, const char *fileName, const void* module, int optLevel);
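+
+  /* Hypothetical usage sketch (names and values are illustrative, not from the
+     original header):
+       gbe::ir::Unit unit;
+       bool ok = gbe::llvmToGen(unit, "kernel.ll", NULL, 1); // parse an IR file, -O2-like level
+     Alternatively pass fileName == NULL and a live llvm::Module* through the
+     module argument. */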
+
+} /* namespace gbe */
+
+#endif /* __GBE_IR_LLVM_TO_GEN_HPP__ */
+
diff --git a/backend/src/ocl_as.h b/backend/src/ocl_as.h
new file mode 100644
index 0000000..692e892
--- /dev/null
+++ b/backend/src/ocl_as.h
@@ -0,0 +1,3086 @@
+// This file is autogenerated by gen_as.sh.
+// Don't modify it manually.
+union _type_cast_1_b {
+  char _char;
+  uchar _uchar;
+};
+
+INLINE OVERLOADABLE uchar as_uchar(char v) {
+  union _type_cast_1_b u;
+  u._char = v;
+  return u._uchar;
+}
+
+INLINE OVERLOADABLE char as_char(uchar v) {
+  union _type_cast_1_b u;
+  u._uchar = v;
+  return u._char;
+}
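+
+// Illustrative note (added for clarity, not generated by gen_as.sh): these
+// helpers reinterpret the argument's bit pattern through the union, with no
+// value conversion; e.g. as_uchar((char)-1) == 255 and as_char((uchar)255) == -1.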
+
+union _type_cast_2_b {
+  short _short;
+  ushort _ushort;
+  char2 _char2;
+  uchar2 _uchar2;
+};
+
+INLINE OVERLOADABLE ushort as_ushort(short v) {
+  union _type_cast_2_b u;
+  u._short = v;
+  return u._ushort;
+}
+
+INLINE OVERLOADABLE char2 as_char2(short v) {
+  union _type_cast_2_b u;
+  u._short = v;
+  return u._char2;
+}
+
+INLINE OVERLOADABLE uchar2 as_uchar2(short v) {
+  union _type_cast_2_b u;
+  u._short = v;
+  return u._uchar2;
+}
+
+INLINE OVERLOADABLE short as_short(ushort v) {
+  union _type_cast_2_b u;
+  u._ushort = v;
+  return u._short;
+}
+
+INLINE OVERLOADABLE char2 as_char2(ushort v) {
+  union _type_cast_2_b u;
+  u._ushort = v;
+  return u._char2;
+}
+
+INLINE OVERLOADABLE uchar2 as_uchar2(ushort v) {
+  union _type_cast_2_b u;
+  u._ushort = v;
+  return u._uchar2;
+}
+
+INLINE OVERLOADABLE short as_short(char2 v) {
+  union _type_cast_2_b u;
+  u._char2 = v;
+  return u._short;
+}
+
+INLINE OVERLOADABLE ushort as_ushort(char2 v) {
+  union _type_cast_2_b u;
+  u._char2 = v;
+  return u._ushort;
+}
+
+INLINE OVERLOADABLE uchar2 as_uchar2(char2 v) {
+  union _type_cast_2_b u;
+  u._char2 = v;
+  return u._uchar2;
+}
+
+INLINE OVERLOADABLE short as_short(uchar2 v) {
+  union _type_cast_2_b u;
+  u._uchar2 = v;
+  return u._short;
+}
+
+INLINE OVERLOADABLE ushort as_ushort(uchar2 v) {
+  union _type_cast_2_b u;
+  u._uchar2 = v;
+  return u._ushort;
+}
+
+INLINE OVERLOADABLE char2 as_char2(uchar2 v) {
+  union _type_cast_2_b u;
+  u._uchar2 = v;
+  return u._char2;
+}
+
+union _type_cast_4_b {
+  int _int;
+  uint _uint;
+  short2 _short2;
+  ushort2 _ushort2;
+  char3 _char3;
+  char4 _char4;
+  uchar3 _uchar3;
+  uchar4 _uchar4;
+  float _float;
+};
+
+INLINE OVERLOADABLE uint as_uint(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE char3 as_char3(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._char3;
+}
+
+INLINE OVERLOADABLE char4 as_char4(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE uchar3 as_uchar3(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._uchar3;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(int v) {
+  union _type_cast_4_b u;
+  u._int = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE short2 as_short2(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE char3 as_char3(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._char3;
+}
+
+INLINE OVERLOADABLE char4 as_char4(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE uchar3 as_uchar3(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._uchar3;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(uint v) {
+  union _type_cast_4_b u;
+  u._uint = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE char3 as_char3(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._char3;
+}
+
+INLINE OVERLOADABLE char4 as_char4(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE uchar3 as_uchar3(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._uchar3;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(short2 v) {
+  union _type_cast_4_b u;
+  u._short2 = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE char3 as_char3(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._char3;
+}
+
+INLINE OVERLOADABLE char4 as_char4(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE uchar3 as_uchar3(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._uchar3;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(ushort2 v) {
+  union _type_cast_4_b u;
+  u._ushort2 = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(char3 v) {
+  union _type_cast_4_b u;
+  u._char3 = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(char3 v) {
+  union _type_cast_4_b u;
+  u._char3 = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(char3 v) {
+  union _type_cast_4_b u;
+  u._char3 = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(char3 v) {
+  union _type_cast_4_b u;
+  u._char3 = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE uchar3 as_uchar3(char3 v) {
+  union _type_cast_4_b u;
+  u._char3 = v;
+  return u._uchar3;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(char3 v) {
+  union _type_cast_4_b u;
+  u._char3 = v;
+  return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(char3 v) {
+  union _type_cast_4_b u;
+  u._char3 = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE uchar3 as_uchar3(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._uchar3;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._uchar4;
+}
+
+INLINE OVERLOADABLE float as_float(char4 v) {
+  union _type_cast_4_b u;
+  u._char4 = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(uchar3 v) {
+  union _type_cast_4_b u;
+  u._uchar3 = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(uchar3 v) {
+  union _type_cast_4_b u;
+  u._uchar3 = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(uchar3 v) {
+  union _type_cast_4_b u;
+  u._uchar3 = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(uchar3 v) {
+  union _type_cast_4_b u;
+  u._uchar3 = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE char3 as_char3(uchar3 v) {
+  union _type_cast_4_b u;
+  u._uchar3 = v;
+  return u._char3;
+}
+
+INLINE OVERLOADABLE char4 as_char4(uchar3 v) {
+  union _type_cast_4_b u;
+  u._uchar3 = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE float as_float(uchar3 v) {
+  union _type_cast_4_b u;
+  u._uchar3 = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE char3 as_char3(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._char3;
+}
+
+INLINE OVERLOADABLE char4 as_char4(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE float as_float(uchar4 v) {
+  union _type_cast_4_b u;
+  u._uchar4 = v;
+  return u._float;
+}
+
+INLINE OVERLOADABLE int as_int(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._int;
+}
+
+INLINE OVERLOADABLE uint as_uint(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._uint;
+}
+
+INLINE OVERLOADABLE short2 as_short2(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._short2;
+}
+
+INLINE OVERLOADABLE ushort2 as_ushort2(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._ushort2;
+}
+
+INLINE OVERLOADABLE char3 as_char3(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._char3;
+}
+
+INLINE OVERLOADABLE char4 as_char4(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._char4;
+}
+
+INLINE OVERLOADABLE uchar3 as_uchar3(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._uchar3;
+}
+
+INLINE OVERLOADABLE uchar4 as_uchar4(float v) {
+  union _type_cast_4_b u;
+  u._float = v;
+  return u._uchar4;
+}
+
+union _type_cast_8_b {
+  long _long;
+  ulong _ulong;
+  int2 _int2;
+  uint2 _uint2;
+  short3 _short3;
+  short4 _short4;
+  ushort3 _ushort3;
+  ushort4 _ushort4;
+  char8 _char8;
+  uchar8 _uchar8;
+  double _double;
+  float2 _float2;
+};
+
+INLINE OVERLOADABLE ulong as_ulong(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(long v) {
+  union _type_cast_8_b u;
+  u._long = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE int2 as_int2(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(ulong v) {
+  union _type_cast_8_b u;
+  u._ulong = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(int2 v) {
+  union _type_cast_8_b u;
+  u._int2 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(uint2 v) {
+  union _type_cast_8_b u;
+  u._uint2 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(short3 v) {
+  union _type_cast_8_b u;
+  u._short3 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(short3 v) {
+  union _type_cast_8_b u;
+  u._short3 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(short3 v) {
+  union _type_cast_8_b u;
+  u._short3 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(short3 v) {
+  union _type_cast_8_b u;
+  u._short3 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(short3 v) {
+  union _type_cast_8_b u;
+  u._short3 = v;
+  return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(short3 v) {
+  union _type_cast_8_b u;
+  u._short3 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(short3 v) {
+  union _type_cast_8_b u;
+  u._short3 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(short3 v) {
+  union _type_cast_8_b u;
+  u._short3 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(short3 v) {
+  union _type_cast_8_b u;
+  u._short3 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(short3 v) {
+  union _type_cast_8_b u;
+  u._short3 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(short4 v) {
+  union _type_cast_8_b u;
+  u._short4 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(ushort3 v) {
+  union _type_cast_8_b u;
+  u._ushort3 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(ushort4 v) {
+  union _type_cast_8_b u;
+  u._ushort4 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(char8 v) {
+  union _type_cast_8_b u;
+  u._char8 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE double as_double(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._double;
+}
+
+INLINE OVERLOADABLE float2 as_float2(uchar8 v) {
+  union _type_cast_8_b u;
+  u._uchar8 = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE float2 as_float2(double v) {
+  union _type_cast_8_b u;
+  u._double = v;
+  return u._float2;
+}
+
+INLINE OVERLOADABLE long as_long(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._long;
+}
+
+INLINE OVERLOADABLE ulong as_ulong(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._ulong;
+}
+
+INLINE OVERLOADABLE int2 as_int2(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._int2;
+}
+
+INLINE OVERLOADABLE uint2 as_uint2(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._uint2;
+}
+
+INLINE OVERLOADABLE short3 as_short3(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._short3;
+}
+
+INLINE OVERLOADABLE short4 as_short4(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._short4;
+}
+
+INLINE OVERLOADABLE ushort3 as_ushort3(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._ushort3;
+}
+
+INLINE OVERLOADABLE ushort4 as_ushort4(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._ushort4;
+}
+
+INLINE OVERLOADABLE char8 as_char8(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._char8;
+}
+
+INLINE OVERLOADABLE uchar8 as_uchar8(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._uchar8;
+}
+
+INLINE OVERLOADABLE double as_double(float2 v) {
+  union _type_cast_8_b u;
+  u._float2 = v;
+  return u._double;
+}
+
+union _type_cast_16_b {
+  long2 _long2;
+  ulong2 _ulong2;
+  int3 _int3;
+  int4 _int4;
+  uint3 _uint3;
+  uint4 _uint4;
+  short8 _short8;
+  ushort8 _ushort8;
+  char16 _char16;
+  uchar16 _uchar16;
+  double2 _double2;
+  float3 _float3;
+  float4 _float4;
+};
+
+INLINE OVERLOADABLE ulong2 as_ulong2(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(long2 v) {
+  union _type_cast_16_b u;
+  u._long2 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(ulong2 v) {
+  union _type_cast_16_b u;
+  u._ulong2 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(int3 v) {
+  union _type_cast_16_b u;
+  u._int3 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(int4 v) {
+  union _type_cast_16_b u;
+  u._int4 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(uint3 v) {
+  union _type_cast_16_b u;
+  u._uint3 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(uint3 v) {
+  union _type_cast_16_b u;
+  u._uint3 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(uint3 v) {
+  union _type_cast_16_b u;
+  u._uint3 = v;
+  return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(uint3 v) {
+  union _type_cast_16_b u;
+  u._uint3 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(uint3 v) {
+  union _type_cast_16_b u;
+  u._uint3 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(uint3 v) {
+  union _type_cast_16_b u;
+  u._uint3 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(uint3 v) {
+  union _type_cast_16_b u;
+  u._uint3 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(uint3 v) {
+  union _type_cast_16_b u;
+  u._uint3 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(uint3 v) {
+  union _type_cast_16_b u;
+  u._uint3 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(uint3 v) {
+  union _type_cast_16_b u;
+  u._uint3 = v;
+  return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(uint3 v) {
+  union _type_cast_16_b u;
+  u._uint3 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(uint4 v) {
+  union _type_cast_16_b u;
+  u._uint4 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(short8 v) {
+  union _type_cast_16_b u;
+  u._short8 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(ushort8 v) {
+  union _type_cast_16_b u;
+  u._ushort8 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(char16 v) {
+  union _type_cast_16_b u;
+  u._char16 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE float3 as_float3(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(uchar16 v) {
+  union _type_cast_16_b u;
+  u._uchar16 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE float3 as_float3(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._float3;
+}
+
+INLINE OVERLOADABLE float4 as_float4(double2 v) {
+  union _type_cast_16_b u;
+  u._double2 = v;
+  return u._float4;
+}
+
+INLINE OVERLOADABLE long2 as_long2(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(float3 v) {
+  union _type_cast_16_b u;
+  u._float3 = v;
+  return u._double2;
+}
+
+INLINE OVERLOADABLE long2 as_long2(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._long2;
+}
+
+INLINE OVERLOADABLE ulong2 as_ulong2(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._ulong2;
+}
+
+INLINE OVERLOADABLE int3 as_int3(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._int3;
+}
+
+INLINE OVERLOADABLE int4 as_int4(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._int4;
+}
+
+INLINE OVERLOADABLE uint3 as_uint3(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._uint3;
+}
+
+INLINE OVERLOADABLE uint4 as_uint4(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._uint4;
+}
+
+INLINE OVERLOADABLE short8 as_short8(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._short8;
+}
+
+INLINE OVERLOADABLE ushort8 as_ushort8(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._ushort8;
+}
+
+INLINE OVERLOADABLE char16 as_char16(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._char16;
+}
+
+INLINE OVERLOADABLE uchar16 as_uchar16(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._uchar16;
+}
+
+INLINE OVERLOADABLE double2 as_double2(float4 v) {
+  union _type_cast_16_b u;
+  u._float4 = v;
+  return u._double2;
+}
+
+union _type_cast_32_b {
+  long3 _long3;
+  long4 _long4;
+  ulong3 _ulong3;
+  ulong4 _ulong4;
+  int8 _int8;
+  uint8 _uint8;
+  short16 _short16;
+  ushort16 _ushort16;
+  double3 _double3;
+  double4 _double4;
+  float8 _float8;
+};
+
+INLINE OVERLOADABLE ulong3 as_ulong3(long3 v) {
+  union _type_cast_32_b u;
+  u._long3 = v;
+  return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(long3 v) {
+  union _type_cast_32_b u;
+  u._long3 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(long3 v) {
+  union _type_cast_32_b u;
+  u._long3 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(long3 v) {
+  union _type_cast_32_b u;
+  u._long3 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(long3 v) {
+  union _type_cast_32_b u;
+  u._long3 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(long3 v) {
+  union _type_cast_32_b u;
+  u._long3 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(long3 v) {
+  union _type_cast_32_b u;
+  u._long3 = v;
+  return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(long3 v) {
+  union _type_cast_32_b u;
+  u._long3 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(long3 v) {
+  union _type_cast_32_b u;
+  u._long3 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(long4 v) {
+  union _type_cast_32_b u;
+  u._long4 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(ulong3 v) {
+  union _type_cast_32_b u;
+  u._ulong3 = v;
+  return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(ulong3 v) {
+  union _type_cast_32_b u;
+  u._ulong3 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(ulong3 v) {
+  union _type_cast_32_b u;
+  u._ulong3 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(ulong3 v) {
+  union _type_cast_32_b u;
+  u._ulong3 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(ulong3 v) {
+  union _type_cast_32_b u;
+  u._ulong3 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(ulong3 v) {
+  union _type_cast_32_b u;
+  u._ulong3 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(ulong3 v) {
+  union _type_cast_32_b u;
+  u._ulong3 = v;
+  return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(ulong3 v) {
+  union _type_cast_32_b u;
+  u._ulong3 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(ulong3 v) {
+  union _type_cast_32_b u;
+  u._ulong3 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(ulong4 v) {
+  union _type_cast_32_b u;
+  u._ulong4 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(int8 v) {
+  union _type_cast_32_b u;
+  u._int8 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(uint8 v) {
+  union _type_cast_32_b u;
+  u._uint8 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(short16 v) {
+  union _type_cast_32_b u;
+  u._short16 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._double4;
+}
+
+INLINE OVERLOADABLE float8 as_float8(ushort16 v) {
+  union _type_cast_32_b u;
+  u._ushort16 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(double3 v) {
+  union _type_cast_32_b u;
+  u._double3 = v;
+  return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(double3 v) {
+  union _type_cast_32_b u;
+  u._double3 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(double3 v) {
+  union _type_cast_32_b u;
+  u._double3 = v;
+  return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(double3 v) {
+  union _type_cast_32_b u;
+  u._double3 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(double3 v) {
+  union _type_cast_32_b u;
+  u._double3 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(double3 v) {
+  union _type_cast_32_b u;
+  u._double3 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(double3 v) {
+  union _type_cast_32_b u;
+  u._double3 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(double3 v) {
+  union _type_cast_32_b u;
+  u._double3 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE float8 as_float8(double3 v) {
+  union _type_cast_32_b u;
+  u._double3 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE float8 as_float8(double4 v) {
+  union _type_cast_32_b u;
+  u._double4 = v;
+  return u._float8;
+}
+
+INLINE OVERLOADABLE long3 as_long3(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._long3;
+}
+
+INLINE OVERLOADABLE long4 as_long4(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._long4;
+}
+
+INLINE OVERLOADABLE ulong3 as_ulong3(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._ulong3;
+}
+
+INLINE OVERLOADABLE ulong4 as_ulong4(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._ulong4;
+}
+
+INLINE OVERLOADABLE int8 as_int8(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._int8;
+}
+
+INLINE OVERLOADABLE uint8 as_uint8(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._uint8;
+}
+
+INLINE OVERLOADABLE short16 as_short16(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._short16;
+}
+
+INLINE OVERLOADABLE ushort16 as_ushort16(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._ushort16;
+}
+
+INLINE OVERLOADABLE double3 as_double3(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._double3;
+}
+
+INLINE OVERLOADABLE double4 as_double4(float8 v) {
+  union _type_cast_32_b u;
+  u._float8 = v;
+  return u._double4;
+}
+
+union _type_cast_64_b {
+  long8 _long8;
+  ulong8 _ulong8;
+  int16 _int16;
+  uint16 _uint16;
+  double8 _double8;
+  float16 _float16;
+};
+
+INLINE OVERLOADABLE ulong8 as_ulong8(long8 v) {
+  union _type_cast_64_b u;
+  u._long8 = v;
+  return u._ulong8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(long8 v) {
+  union _type_cast_64_b u;
+  u._long8 = v;
+  return u._int16;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(long8 v) {
+  union _type_cast_64_b u;
+  u._long8 = v;
+  return u._uint16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(long8 v) {
+  union _type_cast_64_b u;
+  u._long8 = v;
+  return u._double8;
+}
+
+INLINE OVERLOADABLE float16 as_float16(long8 v) {
+  union _type_cast_64_b u;
+  u._long8 = v;
+  return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(ulong8 v) {
+  union _type_cast_64_b u;
+  u._ulong8 = v;
+  return u._long8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(ulong8 v) {
+  union _type_cast_64_b u;
+  u._ulong8 = v;
+  return u._int16;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(ulong8 v) {
+  union _type_cast_64_b u;
+  u._ulong8 = v;
+  return u._uint16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(ulong8 v) {
+  union _type_cast_64_b u;
+  u._ulong8 = v;
+  return u._double8;
+}
+
+INLINE OVERLOADABLE float16 as_float16(ulong8 v) {
+  union _type_cast_64_b u;
+  u._ulong8 = v;
+  return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(int16 v) {
+  union _type_cast_64_b u;
+  u._int16 = v;
+  return u._long8;
+}
+
+INLINE OVERLOADABLE ulong8 as_ulong8(int16 v) {
+  union _type_cast_64_b u;
+  u._int16 = v;
+  return u._ulong8;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(int16 v) {
+  union _type_cast_64_b u;
+  u._int16 = v;
+  return u._uint16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(int16 v) {
+  union _type_cast_64_b u;
+  u._int16 = v;
+  return u._double8;
+}
+
+INLINE OVERLOADABLE float16 as_float16(int16 v) {
+  union _type_cast_64_b u;
+  u._int16 = v;
+  return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(uint16 v) {
+  union _type_cast_64_b u;
+  u._uint16 = v;
+  return u._long8;
+}
+
+INLINE OVERLOADABLE ulong8 as_ulong8(uint16 v) {
+  union _type_cast_64_b u;
+  u._uint16 = v;
+  return u._ulong8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(uint16 v) {
+  union _type_cast_64_b u;
+  u._uint16 = v;
+  return u._int16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(uint16 v) {
+  union _type_cast_64_b u;
+  u._uint16 = v;
+  return u._double8;
+}
+
+INLINE OVERLOADABLE float16 as_float16(uint16 v) {
+  union _type_cast_64_b u;
+  u._uint16 = v;
+  return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(double8 v) {
+  union _type_cast_64_b u;
+  u._double8 = v;
+  return u._long8;
+}
+
+INLINE OVERLOADABLE ulong8 as_ulong8(double8 v) {
+  union _type_cast_64_b u;
+  u._double8 = v;
+  return u._ulong8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(double8 v) {
+  union _type_cast_64_b u;
+  u._double8 = v;
+  return u._int16;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(double8 v) {
+  union _type_cast_64_b u;
+  u._double8 = v;
+  return u._uint16;
+}
+
+INLINE OVERLOADABLE float16 as_float16(double8 v) {
+  union _type_cast_64_b u;
+  u._double8 = v;
+  return u._float16;
+}
+
+INLINE OVERLOADABLE long8 as_long8(float16 v) {
+  union _type_cast_64_b u;
+  u._float16 = v;
+  return u._long8;
+}
+
+INLINE OVERLOADABLE ulong8 as_ulong8(float16 v) {
+  union _type_cast_64_b u;
+  u._float16 = v;
+  return u._ulong8;
+}
+
+INLINE OVERLOADABLE int16 as_int16(float16 v) {
+  union _type_cast_64_b u;
+  u._float16 = v;
+  return u._int16;
+}
+
+INLINE OVERLOADABLE uint16 as_uint16(float16 v) {
+  union _type_cast_64_b u;
+  u._float16 = v;
+  return u._uint16;
+}
+
+INLINE OVERLOADABLE double8 as_double8(float16 v) {
+  union _type_cast_64_b u;
+  u._float16 = v;
+  return u._double8;
+}
+
+union _type_cast_128_b {
+  long16 _long16;
+  ulong16 _ulong16;
+  double16 _double16;
+};
+
+INLINE OVERLOADABLE ulong16 as_ulong16(long16 v) {
+  union _type_cast_128_b u;
+  u._long16 = v;
+  return u._ulong16;
+}
+
+INLINE OVERLOADABLE double16 as_double16(long16 v) {
+  union _type_cast_128_b u;
+  u._long16 = v;
+  return u._double16;
+}
+
+INLINE OVERLOADABLE long16 as_long16(ulong16 v) {
+  union _type_cast_128_b u;
+  u._ulong16 = v;
+  return u._long16;
+}
+
+INLINE OVERLOADABLE double16 as_double16(ulong16 v) {
+  union _type_cast_128_b u;
+  u._ulong16 = v;
+  return u._double16;
+}
+
+INLINE OVERLOADABLE long16 as_long16(double16 v) {
+  union _type_cast_128_b u;
+  u._double16 = v;
+  return u._long16;
+}
+
+INLINE OVERLOADABLE ulong16 as_ulong16(double16 v) {
+  union _type_cast_128_b u;
+  u._double16 = v;
+  return u._ulong16;
+}
+
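The as_<type>() overloads generated above implement OpenCL's reinterpret casts: a union of equally sized members lets the source bits be stored through one member and read back unchanged through another. A minimal sketch of how a kernel would exercise these built-ins (hypothetical kernel, not part of this patch):

  /* Hypothetical kernel: round-trips a float through its 32-bit pattern
     using the as_uint/as_float overloads defined above. */
  __kernel void bits_demo(__global const float *in,
                          __global uint *bits,
                          __global float *roundtrip) {
    size_t i = get_global_id(0);
    bits[i] = as_uint(in[i]);         /* same 32 bits, viewed as uint   */
    roundtrip[i] = as_float(bits[i]); /* reinterpret back, value intact */
  }
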
diff --git a/backend/src/ocl_barrier.ll b/backend/src/ocl_barrier.ll
new file mode 100644
index 0000000..4e55fcb
--- /dev/null
+++ b/backend/src/ocl_barrier.ll
@@ -0,0 +1,39 @@
+;XXX FIXME as LLVM can't use macros, the flag values 3, 1, 2 are
+;hardcoded here; we may need a more graceful way to handle these
+;values later.
+;#define CLK_LOCAL_MEM_FENCE  (1 << 0)
+;#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+
+declare i32 @_get_local_mem_fence() nounwind alwaysinline
+declare i32 @_get_global_mem_fence() nounwind alwaysinline
+declare void @__gen_ocl_barrier_local() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_barrier_global() nounwind alwaysinline noduplicate
+declare void @__gen_ocl_barrier_local_and_global() nounwind alwaysinline noduplicate
+
+define void @barrier(i32 %flags) nounwind noduplicate alwaysinline {
+  %1 = icmp eq i32 %flags, 3
+  br i1 %1, label %barrier_local_global, label %barrier_local_check
+
+barrier_local_global:
+  call void @__gen_ocl_barrier_local_and_global()
+  br label %done
+
+barrier_local_check:
+  %2 = icmp eq i32 %flags, 1
+  br i1 %2, label %barrier_local, label %barrier_global_check
+
+barrier_local:
+  call void @__gen_ocl_barrier_local()
+  br label %done
+
+barrier_global_check:
+  %3 = icmp eq i32 %flags, 2
+  br i1 %3, label %barrier_global, label %done
+
+barrier_global:
+  call void @__gen_ocl_barrier_global()
+  br label %done
+
+done:
+  ret void
+}
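To illustrate the dispatch above (again, an illustrative note rather than part of the diff): a kernel calling barrier(CLK_LOCAL_MEM_FENCE) passes flags == (1 << 0) == 1, so the barrier_local branch is taken and __gen_ocl_barrier_local is used. A minimal sketch; the kernel and argument names are hypothetical:

  __kernel void reverse_tile(__global const int *in,
                             __global int *out,
                             __local int *tile) {
    size_t lid = get_local_id(0);
    size_t gid = get_global_id(0);
    tile[lid] = in[gid];
    barrier(CLK_LOCAL_MEM_FENCE);   /* flags == 1 -> __gen_ocl_barrier_local */
    out[gid] = tile[get_local_size(0) - 1 - lid];
  }

A call with CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE would pass flags == 3 and take the combined barrier_local_global path instead.
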
diff --git a/backend/src/ocl_common_defines.h b/backend/src/ocl_common_defines.h
new file mode 100644
index 0000000..52f5365
--- /dev/null
+++ b/backend/src/ocl_common_defines.h
@@ -0,0 +1,126 @@
+// This file includes defines that are common to both kernel code and
+// the NVPTX back-end.
+#ifndef __OCL_COMMON_DEFINES__
+#define __OCL_COMMON_DEFINES__
+//
+// Common defines for Image intrinsics
+// Channel order
+#define CLK_HAS_ALPHA(color) (color == CLK_A || color == CLK_RA || color == CLK_RGBA || color == CLK_BGRA || color == CLK_ARGB)
+enum {
+  CLK_R = 0x10B0,
+  CLK_A = 0x10B1,
+  CLK_RG = 0x10B2,
+  CLK_RA = 0x10B3,
+  CLK_RGB = 0x10B4,
+  CLK_RGBA = 0x10B5,
+  CLK_BGRA = 0x10B6,
+  CLK_ARGB = 0x10B7,
+
+#if (__NV_CL_C_VERSION == __NV_CL_C_VERSION_1_0)
+  CLK_xRGB = 0x10B7,
+#endif
+
+  CLK_INTENSITY = 0x10B8,
+  CLK_LUMINANCE = 0x10B9
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+  ,
+  CLK_Rx = 0x10BA,
+  CLK_RGx = 0x10BB,
+  CLK_RGBx = 0x10BC
+#endif
+};
+
+
+typedef enum clk_channel_type {
+  // valid formats for float return types
+  CLK_SNORM_INT8 = 0x10D0,            // four channel RGBA snorm8
+  CLK_SNORM_INT16 = 0x10D1,           // four channel RGBA snorm16
+  CLK_UNORM_INT8 = 0x10D2,            // four channel RGBA unorm8
+  CLK_UNORM_INT16 = 0x10D3,           // four channel RGBA unorm16
+  CLK_HALF_FLOAT = 0x10DD,            // four channel RGBA half
+  CLK_FLOAT = 0x10DE,                 // four channel RGBA float
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+  CLK_UNORM_SHORT_565 = 0x10D4,
+  CLK_UNORM_SHORT_555 = 0x10D5,
+  CLK_UNORM_INT_101010 = 0x10D6,
+#endif
+
+  // valid only for integer return types
+  CLK_SIGNED_INT8 =  0x10D7,
+  CLK_SIGNED_INT16 = 0x10D8,
+  CLK_SIGNED_INT32 = 0x10D9,
+  CLK_UNSIGNED_INT8 = 0x10DA,
+  CLK_UNSIGNED_INT16 = 0x10DB,
+  CLK_UNSIGNED_INT32 = 0x10DC,
+
+  // CI SPI for CPU
+  __CLK_UNORM_INT8888 ,         // four channel ARGB unorm8
+  __CLK_UNORM_INT8888R,        // four channel BGRA unorm8
+
+  __CLK_VALID_IMAGE_TYPE_COUNT,
+  __CLK_INVALID_IMAGE_TYPE = __CLK_VALID_IMAGE_TYPE_COUNT,
+  __CLK_VALID_IMAGE_TYPE_MASK_BITS = 4,         // number of bits required to
+                                                // represent any image type
+  __CLK_VALID_IMAGE_TYPE_MASK = ( 1 << __CLK_VALID_IMAGE_TYPE_MASK_BITS ) - 1
+}clk_channel_type;
+
+typedef enum clk_sampler_type {
+    __CLK_ADDRESS_BASE             = 0,
+    CLK_ADDRESS_NONE               = (0 << __CLK_ADDRESS_BASE),
+    CLK_ADDRESS_CLAMP              = (1 << __CLK_ADDRESS_BASE),
+    CLK_ADDRESS_CLAMP_TO_EDGE      = (2 << __CLK_ADDRESS_BASE),
+    CLK_ADDRESS_REPEAT             = (3 << __CLK_ADDRESS_BASE),
+    CLK_ADDRESS_MIRROR             = (4 << __CLK_ADDRESS_BASE),
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+    CLK_ADDRESS_MIRRORED_REPEAT    = CLK_ADDRESS_MIRROR,
+#endif
+    __CLK_ADDRESS_MASK             = (CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP |
+                                     CLK_ADDRESS_CLAMP_TO_EDGE |
+                                     CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR),
+    __CLK_ADDRESS_BITS             = 3,        // number of bits required to
+                                               // represent address info
+
+    __CLK_NORMALIZED_BASE          = __CLK_ADDRESS_BITS,
+    CLK_NORMALIZED_COORDS_FALSE    = 0,
+    CLK_NORMALIZED_COORDS_TRUE     = (1 << __CLK_NORMALIZED_BASE),
+    __CLK_NORMALIZED_MASK          = (CLK_NORMALIZED_COORDS_FALSE |
+                                      CLK_NORMALIZED_COORDS_TRUE),
+    __CLK_NORMALIZED_BITS          = 1,        // number of bits required to
+                                               // represent normalization
+    __CLK_FILTER_BASE              = (__CLK_NORMALIZED_BASE +  __CLK_NORMALIZED_BITS),
+    CLK_FILTER_NEAREST             = (0 << __CLK_FILTER_BASE),
+    CLK_FILTER_LINEAR              = (1 << __CLK_FILTER_BASE),
+    CLK_FILTER_ANISOTROPIC         = (2 << __CLK_FILTER_BASE),
+    __CLK_FILTER_MASK              = (CLK_FILTER_NEAREST | CLK_FILTER_LINEAR |
+                                     CLK_FILTER_ANISOTROPIC),
+    __CLK_FILTER_BITS              = 2,        // number of bits required to
+                                               // represent filter info
+
+    __CLK_MIP_BASE                 = (__CLK_FILTER_BASE + __CLK_FILTER_BITS),
+    CLK_MIP_NEAREST                = (0 << __CLK_MIP_BASE),
+    CLK_MIP_LINEAR                 = (1 << __CLK_MIP_BASE),
+    CLK_MIP_ANISOTROPIC            = (2 << __CLK_MIP_BASE),
+    __CLK_MIP_MASK                 = (CLK_MIP_NEAREST | CLK_MIP_LINEAR |
+                                     CLK_MIP_ANISOTROPIC),
+    __CLK_MIP_BITS                 = 2,
+
+    __CLK_SAMPLER_BITS             = (__CLK_MIP_BASE + __CLK_MIP_BITS),
+    __CLK_SAMPLER_MASK             = (__CLK_MIP_MASK | __CLK_FILTER_MASK |
+                                      __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK),
+
+    __CLK_SAMPLER_ARG_BASE         = (__CLK_MIP_BASE + __CLK_SAMPLER_BITS),
+    __CLK_SAMPLER_ARG_BITS         = 8,
+    __CLK_SAMPLER_ARG_MASK         = (((1 << __CLK_SAMPLER_ARG_BITS) - 1) << __CLK_SAMPLER_ARG_BASE),
+    __CLK_SAMPLER_ARG_KEY_BIT      = (1 << (__CLK_SAMPLER_ARG_BASE + __CLK_SAMPLER_ARG_BITS)),
+    __CLK_SAMPLER_ARG_KEY_BITS     = 1,
+
+} clk_sampler_type;
+
+// Memory synchronization
+#define CLK_LOCAL_MEM_FENCE     (1 << 0)
+#define CLK_GLOBAL_MEM_FENCE    (1 << 1)
+
+#endif   /* __OCL_COMMON_DEFINES__ */
\ No newline at end of file
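Under the field layout of clk_sampler_type above (illustrative note, not part of the diff), a sampler constant is simply the OR of one value per field; for example CLK_NORMALIZED_COORDS_FALSE (0) | CLK_ADDRESS_CLAMP (1 << 0) | CLK_FILTER_NEAREST (0 << 4) packs to 0x1. A small OpenCL C sketch; the kernel and image names are made up:

  __constant sampler_t smp = CLK_NORMALIZED_COORDS_FALSE |
                             CLK_ADDRESS_CLAMP |
                             CLK_FILTER_NEAREST;

  __kernel void fetch_texel(__read_only image2d_t src,
                            __global float4 *dst,
                            int2 coord) {
    dst[0] = read_imagef(src, smp, coord);
  }

The numeric packing quoted here assumes the bit positions defined in this header; code should only ever use the symbolic CLK_* names.
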
diff --git a/backend/src/ocl_convert.h b/backend/src/ocl_convert.h
new file mode 100644
index 0000000..8326768
--- /dev/null
+++ b/backend/src/ocl_convert.h
@@ -0,0 +1,17415 @@
+// This file is autogenerated by gen_convert.sh.
+// Don't modify it manually.
+INLINE OVERLOADABLE long convert_long(long v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(long v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(long v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(long v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(long v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(long v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(long v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(long v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(long v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(long v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(ulong v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(ulong v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(ulong v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(ulong v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(ulong v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(ulong v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(ulong v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(ulong v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(ulong v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(ulong v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(int v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(int v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(int v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(int v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(int v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(int v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(int v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(int v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(int v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(int v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(uint v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(uint v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(uint v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(uint v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(uint v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(uint v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(uint v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(uint v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(uint v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(uint v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(short v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(short v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(short v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(short v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(short v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(short v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(short v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(short v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(short v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(short v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(ushort v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(ushort v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(ushort v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(ushort v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(ushort v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(ushort v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(ushort v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(ushort v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(ushort v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(ushort v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(char v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(char v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(char v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(char v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(char v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(char v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(char v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(char v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(char v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(char v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(uchar v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(uchar v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(uchar v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(uchar v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(uchar v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(uchar v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(uchar v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(uchar v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(uchar v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(uchar v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(double v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(double v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(double v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(double v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(double v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(double v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(double v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(double v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(double v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(double v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long convert_long(float v) {
+  return (long)v;
+}
+
+INLINE OVERLOADABLE ulong convert_ulong(float v) {
+  return (ulong)v;
+}
+
+INLINE OVERLOADABLE int convert_int(float v) {
+  return (int)v;
+}
+
+INLINE OVERLOADABLE uint convert_uint(float v) {
+  return (uint)v;
+}
+
+INLINE OVERLOADABLE short convert_short(float v) {
+  return (short)v;
+}
+
+INLINE OVERLOADABLE ushort convert_ushort(float v) {
+  return (ushort)v;
+}
+
+INLINE OVERLOADABLE char convert_char(float v) {
+  return (char)v;
+}
+
+INLINE OVERLOADABLE uchar convert_uchar(float v) {
+  return (uchar)v;
+}
+
+INLINE OVERLOADABLE double convert_double(float v) {
+  return (double)v;
+}
+
+INLINE OVERLOADABLE float convert_float(float v) {
+  return (float)v;
+}
+
+INLINE OVERLOADABLE long2 convert_long2(long2 v) { return v; }
+INLINE OVERLOADABLE ulong2 convert_ulong2(long2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(long2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(long2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(long2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(long2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(long2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(long2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(long2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(long2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(ulong2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(ulong2 v) { return v; }
+INLINE OVERLOADABLE int2 convert_int2(ulong2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(ulong2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(ulong2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(ulong2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(ulong2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(ulong2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(ulong2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(ulong2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(int2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(int2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(int2 v) { return v; }
+INLINE OVERLOADABLE uint2 convert_uint2(int2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(int2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(int2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(int2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(int2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(int2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(int2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(uint2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(uint2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(uint2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(uint2 v) { return v; }
+INLINE OVERLOADABLE short2 convert_short2(uint2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(uint2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(uint2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(uint2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(uint2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(uint2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(short2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(short2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(short2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(short2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(short2 v) { return v; }
+INLINE OVERLOADABLE ushort2 convert_ushort2(short2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(short2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(short2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(short2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(short2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(ushort2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(ushort2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(ushort2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(ushort2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(ushort2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(ushort2 v) { return v; }
+INLINE OVERLOADABLE char2 convert_char2(ushort2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(ushort2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(ushort2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(ushort2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(char2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(char2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(char2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(char2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(char2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(char2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(char2 v) { return v; }
+INLINE OVERLOADABLE uchar2 convert_uchar2(char2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(char2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(char2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(uchar2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(uchar2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(uchar2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(uchar2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(uchar2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(uchar2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(uchar2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(uchar2 v) { return v; }
+INLINE OVERLOADABLE double2 convert_double2(uchar2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(uchar2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(double2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(double2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(double2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(double2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(double2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(double2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(double2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(double2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(double2 v) { return v; }
+INLINE OVERLOADABLE float2 convert_float2(double2 v) {
+  return (float2)((float)(v.s0), (float)(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2(float2 v) {
+  return (long2)((long)(v.s0), (long)(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2(float2 v) {
+  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2(float2 v) {
+  return (int2)((int)(v.s0), (int)(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2(float2 v) {
+  return (uint2)((uint)(v.s0), (uint)(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2(float2 v) {
+  return (short2)((short)(v.s0), (short)(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2(float2 v) {
+  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2(float2 v) {
+  return (char2)((char)(v.s0), (char)(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2(float2 v) {
+  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
+}
+
+INLINE OVERLOADABLE double2 convert_double2(float2 v) {
+  return (double2)((double)(v.s0), (double)(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2(float2 v) { return v; }
+INLINE OVERLOADABLE long3 convert_long3(long3 v) { return v; }
+INLINE OVERLOADABLE ulong3 convert_ulong3(long3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(long3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(long3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(long3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(long3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(long3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(long3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(long3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(long3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(ulong3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(ulong3 v) { return v; }
+INLINE OVERLOADABLE int3 convert_int3(ulong3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(ulong3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(ulong3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(ulong3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(ulong3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(ulong3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(ulong3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(ulong3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(int3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(int3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(int3 v) { return v; }
+INLINE OVERLOADABLE uint3 convert_uint3(int3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(int3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(int3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(int3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(int3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(int3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(int3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(uint3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(uint3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(uint3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(uint3 v) { return v; }
+INLINE OVERLOADABLE short3 convert_short3(uint3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(uint3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(uint3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(uint3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(uint3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(uint3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(short3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(short3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(short3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(short3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(short3 v) { return v; }
+INLINE OVERLOADABLE ushort3 convert_ushort3(short3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(short3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(short3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(short3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(short3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(ushort3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(ushort3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(ushort3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(ushort3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(ushort3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(ushort3 v) { return v; }
+INLINE OVERLOADABLE char3 convert_char3(ushort3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(ushort3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(ushort3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(ushort3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(char3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(char3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(char3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(char3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(char3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(char3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(char3 v) { return v; }
+INLINE OVERLOADABLE uchar3 convert_uchar3(char3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(char3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(char3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(uchar3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(uchar3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(uchar3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(uchar3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(uchar3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(uchar3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(uchar3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(uchar3 v) { return v; }
+INLINE OVERLOADABLE double3 convert_double3(uchar3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(uchar3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(double3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(double3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(double3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(double3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(double3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(double3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(double3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(double3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(double3 v) { return v; }
+INLINE OVERLOADABLE float3 convert_float3(double3 v) {
+  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3(float3 v) {
+  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3(float3 v) {
+  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3(float3 v) {
+  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3(float3 v) {
+  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3(float3 v) {
+  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3(float3 v) {
+  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3(float3 v) {
+  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3(float3 v) {
+  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
+}
+
+INLINE OVERLOADABLE double3 convert_double3(float3 v) {
+  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3(float3 v) { return v; }
+INLINE OVERLOADABLE long4 convert_long4(long4 v) { return v; }
+INLINE OVERLOADABLE ulong4 convert_ulong4(long4 v) {
+  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(long4 v) {
+  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(long4 v) {
+  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(long4 v) {
+  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(long4 v) {
+  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(long4 v) {
+  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(long4 v) {
+  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(long4 v) {
+  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(long4 v) {
+  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(ulong4 v) {
+  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(ulong4 v) { return v; }
+INLINE OVERLOADABLE int4 convert_int4(ulong4 v) {
+  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(ulong4 v) {
+  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(ulong4 v) {
+  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(ulong4 v) {
+  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(ulong4 v) {
+  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(ulong4 v) {
+  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(ulong4 v) {
+  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(ulong4 v) {
+  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(int4 v) {
+  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(int4 v) {
+  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(int4 v) { return v; }
+INLINE OVERLOADABLE uint4 convert_uint4(int4 v) {
+  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(int4 v) {
+  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(int4 v) {
+  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(int4 v) {
+  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(int4 v) {
+  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(int4 v) {
+  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(int4 v) {
+  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(uint4 v) {
+  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(uint4 v) {
+  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(uint4 v) {
+  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(uint4 v) { return v; }
+INLINE OVERLOADABLE short4 convert_short4(uint4 v) {
+  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(uint4 v) {
+  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(uint4 v) {
+  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(uint4 v) {
+  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(uint4 v) {
+  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(uint4 v) {
+  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(short4 v) {
+  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(short4 v) {
+  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(short4 v) {
+  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(short4 v) {
+  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(short4 v) { return v; }
+INLINE OVERLOADABLE ushort4 convert_ushort4(short4 v) {
+  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(short4 v) {
+  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(short4 v) {
+  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(short4 v) {
+  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(short4 v) {
+  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(ushort4 v) {
+  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(ushort4 v) {
+  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(ushort4 v) {
+  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(ushort4 v) {
+  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(ushort4 v) {
+  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(ushort4 v) { return v; }
+INLINE OVERLOADABLE char4 convert_char4(ushort4 v) {
+  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(ushort4 v) {
+  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(ushort4 v) {
+  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(ushort4 v) {
+  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(char4 v) {
+  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(char4 v) {
+  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(char4 v) {
+  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(char4 v) {
+  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(char4 v) {
+  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(char4 v) {
+  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(char4 v) { return v; }
+INLINE OVERLOADABLE uchar4 convert_uchar4(char4 v) {
+  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(char4 v) {
+  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(char4 v) {
+  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(uchar4 v) {
+  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(uchar4 v) {
+  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(uchar4 v) {
+  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(uchar4 v) {
+  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(uchar4 v) {
+  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(uchar4 v) {
+  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(uchar4 v) {
+  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(uchar4 v) { return v; }
+INLINE OVERLOADABLE double4 convert_double4(uchar4 v) {
+  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(uchar4 v) {
+  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(double4 v) {
+  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(double4 v) {
+  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(double4 v) {
+  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(double4 v) {
+  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(double4 v) {
+  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(double4 v) {
+  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(double4 v) {
+  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(double4 v) {
+  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(double4 v) { return v; }
+INLINE OVERLOADABLE float4 convert_float4(double4 v) {
+  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4(float4 v) {
+  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4(float4 v) {
+  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4(float4 v) {
+  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4(float4 v) {
+  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4(float4 v) {
+  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4(float4 v) {
+  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4(float4 v) {
+  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4(float4 v) {
+  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
+}
+
+INLINE OVERLOADABLE double4 convert_double4(float4 v) {
+  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4(float4 v) { return v; }
+INLINE OVERLOADABLE long8 convert_long8(long8 v) { return v; }
+INLINE OVERLOADABLE ulong8 convert_ulong8(long8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(long8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(long8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(long8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(long8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(long8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(long8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(long8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(long8 v) {
+  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(ulong8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(ulong8 v) { return v; }
+INLINE OVERLOADABLE int8 convert_int8(ulong8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(ulong8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(ulong8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(ulong8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(ulong8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(ulong8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(ulong8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(ulong8 v) {
+  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(int8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(int8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(int8 v) { return v; }
+INLINE OVERLOADABLE uint8 convert_uint8(int8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(int8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(int8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(int8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(int8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(int8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(int8 v) {
+  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(uint8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(uint8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(uint8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(uint8 v) { return v; }
+INLINE OVERLOADABLE short8 convert_short8(uint8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(uint8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(uint8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(uint8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(uint8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(uint8 v) {
+  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(short8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(short8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(short8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(short8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(short8 v) { return v; }
+INLINE OVERLOADABLE ushort8 convert_ushort8(short8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(short8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(short8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(short8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(short8 v) {
+  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(ushort8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(ushort8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(ushort8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(ushort8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(ushort8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(ushort8 v) { return v; }
+INLINE OVERLOADABLE char8 convert_char8(ushort8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(ushort8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(ushort8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(ushort8 v) {
+  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(char8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(char8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(char8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(char8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(char8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(char8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(char8 v) { return v; }
+INLINE OVERLOADABLE uchar8 convert_uchar8(char8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(char8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(char8 v) {
+  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(uchar8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(uchar8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(uchar8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(uchar8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(uchar8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(uchar8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(uchar8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(uchar8 v) { return v; }
+INLINE OVERLOADABLE double8 convert_double8(uchar8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(uchar8 v) {
+  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(double8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(double8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(double8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(double8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(double8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(double8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(double8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(double8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(double8 v) { return v; }
+INLINE OVERLOADABLE float8 convert_float8(double8 v) {
+  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8(float8 v) {
+  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8(float8 v) {
+  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8(float8 v) {
+  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8(float8 v) {
+  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8(float8 v) {
+  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8(float8 v) {
+  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8(float8 v) {
+  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8(float8 v) {
+  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
+}
+
+INLINE OVERLOADABLE double8 convert_double8(float8 v) {
+  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8(float8 v) { return v; }
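+/* 16-element vector conversions: same element-wise cast pattern, extended to
+ * lanes s0..sF. */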
+INLINE OVERLOADABLE long16 convert_long16(long16 v) { return v; }
+INLINE OVERLOADABLE ulong16 convert_ulong16(long16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(long16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(long16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(long16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(long16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(long16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(long16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(long16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(long16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(ulong16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(ulong16 v) { return v; }
+INLINE OVERLOADABLE int16 convert_int16(ulong16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(ulong16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(ulong16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(ulong16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(ulong16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(ulong16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(ulong16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(ulong16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(int16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(int16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(int16 v) { return v; }
+INLINE OVERLOADABLE uint16 convert_uint16(int16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(int16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(int16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(int16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(int16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(int16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(int16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(uint16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(uint16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(uint16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(uint16 v) { return v; }
+INLINE OVERLOADABLE short16 convert_short16(uint16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(uint16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(uint16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(uint16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(uint16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(uint16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(short16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(short16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(short16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(short16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(short16 v) { return v; }
+INLINE OVERLOADABLE ushort16 convert_ushort16(short16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(short16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(short16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(short16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(short16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(ushort16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(ushort16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(ushort16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(ushort16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(ushort16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(ushort16 v) { return v; }
+INLINE OVERLOADABLE char16 convert_char16(ushort16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(ushort16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(ushort16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(ushort16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(char16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(char16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(char16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(char16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(char16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(char16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(char16 v) { return v; }
+INLINE OVERLOADABLE uchar16 convert_uchar16(char16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(char16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(char16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(uchar16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(uchar16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(uchar16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(uchar16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(uchar16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(uchar16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(uchar16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(uchar16 v) { return v; }
+INLINE OVERLOADABLE double16 convert_double16(uchar16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(uchar16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(double16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(double16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(double16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(double16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(double16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(double16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(double16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(double16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(double16 v) { return v; }
+INLINE OVERLOADABLE float16 convert_float16(double16 v) {
+  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16(float16 v) {
+  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16(float16 v) {
+  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16(float16 v) {
+  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16(float16 v) {
+  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16(float16 v) {
+  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16(float16 v) {
+  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16(float16 v) {
+  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16(float16 v) {
+  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
+}
+
+INLINE OVERLOADABLE double16 convert_double16(float16 v) {
+  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16(float16 v) { return v; }
+
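+/* Saturated conversions (convert_<type>_sat): source values outside the
+ * destination type's representable range are clamped to that range instead of
+ * being truncated.  The scalar variants declared and defined below are the
+ * building blocks for the vector forms further down. */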
+#define DEF(DSTTYPE, SRCTYPE) \
+  OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);
+DEF(char, uchar);
+DEF(char, short);
+DEF(char, ushort);
+DEF(char, int);
+DEF(char, uint);
+DEF(char, float);
+DEF(uchar, char);
+DEF(uchar, short);
+DEF(uchar, ushort);
+DEF(uchar, int);
+DEF(uchar, uint);
+DEF(uchar, float);
+DEF(short, ushort);
+DEF(short, int);
+DEF(short, uint);
+DEF(short, float);
+DEF(ushort, short);
+DEF(ushort, int);
+DEF(ushort, uint);
+DEF(ushort, float);
+DEF(int, uint);
+DEF(int, float);
+DEF(uint, int);
+DEF(uint, float);
+#undef DEF
+
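+/* Two-sided clamp: sources that can fall outside the destination range in
+ * either direction (e.g. long -> char, float -> long) are clamped to
+ * [MIN, MAX].  Illustrative example: with the definitions below,
+ * convert_char_sat((long)300) yields 127 and convert_char_sat((long)-300)
+ * yields -128. */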
+#define DEF(DSTTYPE, SRCTYPE, MIN, MAX) \
+  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+    return x >= MAX ? (DSTTYPE)MAX : x <= MIN ? (DSTTYPE)MIN : x; \
+  }
+DEF(char, long, -128, 127);
+DEF(uchar, long, 0, 255);
+DEF(short, long, -32768, 32767);
+DEF(ushort, long, 0, 65535);
+DEF(int, long, -0x7fffffff-1, 0x7fffffff);
+DEF(uint, long, 0, 0xffffffffu);
+DEF(long, float, -9.223372036854776e+18f, 9.223372036854776e+18f);
+DEF(ulong, float, 0, 1.8446744073709552e+19f);
+#undef DEF
+
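+/* Unsigned sources can never be below the destination minimum, so only the
+ * upper bound needs clamping. */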
+#define DEF(DSTTYPE, SRCTYPE, MAX) \
+  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+    return x >= MAX ? (DSTTYPE)MAX : x; \
+  }
+DEF(char, ulong, 127);
+DEF(uchar, ulong, 255);
+DEF(short, ulong, 32767);
+DEF(ushort, ulong, 65535);
+DEF(int, ulong, 0x7fffffff);
+DEF(uint, ulong, 0xffffffffu);
+#undef DEF
+
+INLINE_OVERLOADABLE long convert_long_sat(ulong x) {
+  ulong MAX = 0x7ffffffffffffffful;
+  return x >= MAX ? MAX : x;
+}
+
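+/* Signed source into an unsigned destination of equal or greater width: the
+ * only out-of-range case is a negative value, which saturates to 0. */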
+#define DEF(DSTTYPE, SRCTYPE) \
+  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+    return x <= 0 ? 0 : x; \
+  }
+DEF(ushort, char);
+DEF(uint, char);
+DEF(uint, short);
+DEF(ulong, char);
+DEF(ulong, short);
+DEF(ulong, int);
+DEF(ulong, long);
+#undef DEF
+
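+/* Conversions that can never go out of range (identity or value-preserving
+ * widening): saturation degenerates to a plain implicit conversion. */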
+#define DEF(DSTTYPE, SRCTYPE) \
+  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+    return x; \
+  }
+DEF(char, char);
+DEF(uchar, uchar);
+DEF(short, char);
+DEF(short, uchar);
+DEF(short, short);
+DEF(ushort, uchar);
+DEF(ushort, ushort);
+DEF(int, char);
+DEF(int, uchar);
+DEF(int, short);
+DEF(int, ushort);
+DEF(int, int);
+DEF(uint, uchar);
+DEF(uint, ushort);
+DEF(uint, uint);
+DEF(long, char);
+DEF(long, uchar);
+DEF(long, short);
+DEF(long, ushort);
+DEF(long, int);
+DEF(long, uint);
+DEF(long, long);
+DEF(ulong, uchar);
+DEF(ulong, ushort);
+DEF(ulong, uint);
+DEF(ulong, ulong);
+#undef DEF
+
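+/* Vector saturated conversions are built element-wise from the scalar
+ * convert_<type>_sat functions above. */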
+INLINE OVERLOADABLE long2 convert_long2_sat(long2 v) {
+  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(long2 v) {
+  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(long2 v) {
+  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(long2 v) {
+  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(long2 v) {
+  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(long2 v) {
+  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(long2 v) {
+  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(long2 v) {
+  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(ulong2 v) {
+  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(ulong2 v) {
+  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(ulong2 v) {
+  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(ulong2 v) {
+  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(ulong2 v) {
+  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(ulong2 v) {
+  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(ulong2 v) {
+  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(ulong2 v) {
+  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(int2 v) {
+  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(int2 v) {
+  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(int2 v) {
+  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(int2 v) {
+  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(int2 v) {
+  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(int2 v) {
+  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(int2 v) {
+  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(int2 v) {
+  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(uint2 v) {
+  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(uint2 v) {
+  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(uint2 v) {
+  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(uint2 v) {
+  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(uint2 v) {
+  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(uint2 v) {
+  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(uint2 v) {
+  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(uint2 v) {
+  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(short2 v) {
+  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(short2 v) {
+  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(short2 v) {
+  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(short2 v) {
+  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(short2 v) {
+  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(short2 v) {
+  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(short2 v) {
+  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(short2 v) {
+  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(ushort2 v) {
+  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(ushort2 v) {
+  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(ushort2 v) {
+  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(ushort2 v) {
+  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(ushort2 v) {
+  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(ushort2 v) {
+  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(ushort2 v) {
+  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(ushort2 v) {
+  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(char2 v) {
+  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(char2 v) {
+  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(char2 v) {
+  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(char2 v) {
+  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(char2 v) {
+  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(char2 v) {
+  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(char2 v) {
+  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(char2 v) {
+  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(uchar2 v) {
+  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(uchar2 v) {
+  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(uchar2 v) {
+  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(uchar2 v) {
+  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(uchar2 v) {
+  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(uchar2 v) {
+  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(uchar2 v) {
+  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(uchar2 v) {
+  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat(float2 v) {
+  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat(float2 v) {
+  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat(float2 v) {
+  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat(float2 v) {
+  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat(float2 v) {
+  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat(float2 v) {
+  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat(float2 v) {
+  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat(float2 v) {
+  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(long3 v) {
+  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(long3 v) {
+  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(long3 v) {
+  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(long3 v) {
+  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(long3 v) {
+  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(long3 v) {
+  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(long3 v) {
+  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(long3 v) {
+  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(ulong3 v) {
+  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(ulong3 v) {
+  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(ulong3 v) {
+  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(ulong3 v) {
+  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(ulong3 v) {
+  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(ulong3 v) {
+  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(ulong3 v) {
+  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(ulong3 v) {
+  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(int3 v) {
+  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(int3 v) {
+  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(int3 v) {
+  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(int3 v) {
+  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(int3 v) {
+  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(int3 v) {
+  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(int3 v) {
+  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(int3 v) {
+  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(uint3 v) {
+  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(uint3 v) {
+  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(uint3 v) {
+  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(uint3 v) {
+  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(uint3 v) {
+  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(uint3 v) {
+  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(uint3 v) {
+  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(uint3 v) {
+  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(short3 v) {
+  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(short3 v) {
+  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(short3 v) {
+  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(short3 v) {
+  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(short3 v) {
+  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(short3 v) {
+  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(short3 v) {
+  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(short3 v) {
+  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(ushort3 v) {
+  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(ushort3 v) {
+  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(ushort3 v) {
+  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(ushort3 v) {
+  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(ushort3 v) {
+  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(ushort3 v) {
+  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(ushort3 v) {
+  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(ushort3 v) {
+  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(char3 v) {
+  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(char3 v) {
+  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(char3 v) {
+  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(char3 v) {
+  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(char3 v) {
+  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(char3 v) {
+  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(char3 v) {
+  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(char3 v) {
+  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(uchar3 v) {
+  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(uchar3 v) {
+  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(uchar3 v) {
+  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(uchar3 v) {
+  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(uchar3 v) {
+  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(uchar3 v) {
+  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(uchar3 v) {
+  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(uchar3 v) {
+  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat(float3 v) {
+  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat(float3 v) {
+  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat(float3 v) {
+  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat(float3 v) {
+  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat(float3 v) {
+  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat(float3 v) {
+  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat(float3 v) {
+  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat(float3 v) {
+  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
+}
+
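+/* Saturated conversions for 4-element vectors: each overload applies the scalar convert_<type>_sat to components .s0 through .s3. */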
+INLINE OVERLOADABLE long4 convert_long4_sat(long4 v) {
+  return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(long4 v) {
+  return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(long4 v) {
+  return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(long4 v) {
+  return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(long4 v) {
+  return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(long4 v) {
+  return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(long4 v) {
+  return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(long4 v) {
+  return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat(ulong4 v) {
+  return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(ulong4 v) {
+  return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(ulong4 v) {
+  return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(ulong4 v) {
+  return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(ulong4 v) {
+  return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(ulong4 v) {
+  return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(ulong4 v) {
+  return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(ulong4 v) {
+  return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat(int4 v) {
+  return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(int4 v) {
+  return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(int4 v) {
+  return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(int4 v) {
+  return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(int4 v) {
+  return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(int4 v) {
+  return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(int4 v) {
+  return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(int4 v) {
+  return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat(uint4 v) {
+  return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(uint4 v) {
+  return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(uint4 v) {
+  return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(uint4 v) {
+  return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(uint4 v) {
+  return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(uint4 v) {
+  return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(uint4 v) {
+  return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(uint4 v) {
+  return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat(short4 v) {
+  return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(short4 v) {
+  return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(short4 v) {
+  return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(short4 v) {
+  return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(short4 v) {
+  return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(short4 v) {
+  return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(short4 v) {
+  return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(short4 v) {
+  return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat(ushort4 v) {
+  return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(ushort4 v) {
+  return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(ushort4 v) {
+  return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(ushort4 v) {
+  return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(ushort4 v) {
+  return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(ushort4 v) {
+  return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(ushort4 v) {
+  return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(ushort4 v) {
+  return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat(char4 v) {
+  return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(char4 v) {
+  return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(char4 v) {
+  return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(char4 v) {
+  return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(char4 v) {
+  return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(char4 v) {
+  return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(char4 v) {
+  return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(char4 v) {
+  return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat(uchar4 v) {
+  return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(uchar4 v) {
+  return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(uchar4 v) {
+  return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(uchar4 v) {
+  return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(uchar4 v) {
+  return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(uchar4 v) {
+  return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(uchar4 v) {
+  return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(uchar4 v) {
+  return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat(float4 v) {
+  return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat(float4 v) {
+  return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat(float4 v) {
+  return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat(float4 v) {
+  return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat(float4 v) {
+  return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat(float4 v) {
+  return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat(float4 v) {
+  return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat(float4 v) {
+  return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
+}
+
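+/* Saturated conversions for 8-element vectors: each overload applies the scalar convert_<type>_sat to components .s0 through .s7. */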
+INLINE OVERLOADABLE long8 convert_long8_sat(long8 v) {
+  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(long8 v) {
+  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(long8 v) {
+  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(long8 v) {
+  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(long8 v) {
+  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(long8 v) {
+  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(long8 v) {
+  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(long8 v) {
+  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat(ulong8 v) {
+  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(ulong8 v) {
+  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(ulong8 v) {
+  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(ulong8 v) {
+  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(ulong8 v) {
+  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(ulong8 v) {
+  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(ulong8 v) {
+  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(ulong8 v) {
+  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat(int8 v) {
+  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(int8 v) {
+  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(int8 v) {
+  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(int8 v) {
+  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(int8 v) {
+  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(int8 v) {
+  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(int8 v) {
+  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(int8 v) {
+  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat(uint8 v) {
+  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(uint8 v) {
+  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(uint8 v) {
+  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(uint8 v) {
+  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(uint8 v) {
+  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(uint8 v) {
+  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(uint8 v) {
+  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(uint8 v) {
+  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat(short8 v) {
+  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(short8 v) {
+  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(short8 v) {
+  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(short8 v) {
+  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(short8 v) {
+  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(short8 v) {
+  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(short8 v) {
+  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(short8 v) {
+  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat(ushort8 v) {
+  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(ushort8 v) {
+  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(ushort8 v) {
+  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(ushort8 v) {
+  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(ushort8 v) {
+  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(ushort8 v) {
+  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(ushort8 v) {
+  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(ushort8 v) {
+  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat(char8 v) {
+  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(char8 v) {
+  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(char8 v) {
+  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(char8 v) {
+  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(char8 v) {
+  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(char8 v) {
+  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(char8 v) {
+  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(char8 v) {
+  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat(uchar8 v) {
+  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(uchar8 v) {
+  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(uchar8 v) {
+  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(uchar8 v) {
+  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(uchar8 v) {
+  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(uchar8 v) {
+  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(uchar8 v) {
+  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(uchar8 v) {
+  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat(float8 v) {
+  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat(float8 v) {
+  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat(float8 v) {
+  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat(float8 v) {
+  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat(float8 v) {
+  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat(float8 v) {
+  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat(float8 v) {
+  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat(float8 v) {
+  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
+}
+
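+/* Saturated conversions for 16-element vectors: each overload applies the scalar convert_<type>_sat to components .s0 through .sF. */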
+INLINE OVERLOADABLE long16 convert_long16_sat(long16 v) {
+  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(long16 v) {
+  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(long16 v) {
+  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(long16 v) {
+  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(long16 v) {
+  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(long16 v) {
+  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(long16 v) {
+  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(long16 v) {
+  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(ulong16 v) {
+  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(ulong16 v) {
+  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(ulong16 v) {
+  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(ulong16 v) {
+  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(ulong16 v) {
+  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(ulong16 v) {
+  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(ulong16 v) {
+  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(ulong16 v) {
+  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(int16 v) {
+  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(int16 v) {
+  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(int16 v) {
+  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(int16 v) {
+  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(int16 v) {
+  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(int16 v) {
+  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(int16 v) {
+  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(int16 v) {
+  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(uint16 v) {
+  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(uint16 v) {
+  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(uint16 v) {
+  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(uint16 v) {
+  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(uint16 v) {
+  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(uint16 v) {
+  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(uint16 v) {
+  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(uint16 v) {
+  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(short16 v) {
+  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(short16 v) {
+  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(short16 v) {
+  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(short16 v) {
+  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(short16 v) {
+  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(short16 v) {
+  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(short16 v) {
+  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(short16 v) {
+  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(ushort16 v) {
+  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(ushort16 v) {
+  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(ushort16 v) {
+  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(ushort16 v) {
+  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(ushort16 v) {
+  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(ushort16 v) {
+  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(ushort16 v) {
+  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(ushort16 v) {
+  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(char16 v) {
+  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(char16 v) {
+  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(char16 v) {
+  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(char16 v) {
+  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(char16 v) {
+  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(char16 v) {
+  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(char16 v) {
+  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(char16 v) {
+  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(uchar16 v) {
+  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(uchar16 v) {
+  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(uchar16 v) {
+  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(uchar16 v) {
+  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(uchar16 v) {
+  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(uchar16 v) {
+  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(uchar16 v) {
+  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(uchar16 v) {
+  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat(float16 v) {
+  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat(float16 v) {
+  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat(float16 v) {
+  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat(float16 v) {
+  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat(float16 v) {
+  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat(float16 v) {
+  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat(float16 v) {
+  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat(float16 v) {
+  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
+}
+
+
+float __gen_ocl_rndz(float x);
+float __gen_ocl_rnde(float x);
+float __gen_ocl_rndu(float x);
+float __gen_ocl_rndd(float x);
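+/*
+ * Explicitly rounded integer-to-float conversions.  The __convert_float_rtz/
+ * rtp/rtn helpers below first convert with the default round-to-nearest-even
+ * conversion (u.f = x) and then, when that result overshoots or undershoots
+ * the original integer, step the float by one ulp in the required direction
+ * by adding or subtracting 1 on its raw bit pattern through the uint member
+ * of the union.  For the unsigned sources rtn is the same as rtz, so those
+ * overloads simply forward to __convert_float_rtz.
+ */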
+INLINE_OVERLOADABLE float __convert_float_rtz(long x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long l = u.f;
+  if((l > x && x > 0) || x >= 0x7fffffc000000000 ||
+     (l < x && x < 0)) {
+      u.u -= 1;
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(long x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long l = u.f;  // compare through a long copy of the rounded value; u.f < x would convert x back to float and lose the low bits
+  if(l < x && x < 0x7fffffc000000000) {
+    if(x > 0)
+      u.u = u.u + 1;
+    else
+      u.u = u.u - 1;
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(long x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long l = u.f;  // u.f may have rounded up to 2^63, which no longer fits in a long; the explicit range check below covers that case
+  if(l > x || x >= 0x7fffffc000000000) {
+    if(x > 0)
+      u.u = u.u - 1;
+    else
+      u.u = u.u + 1;
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtz(ulong x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  ulong l = u.f;
+  if(l > x  || x >= 0xffffff8000000000)
+      u.u -= 1;
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(ulong x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  ulong l = u.f;  // compare through a ulong copy of the rounded value; u.f < x would convert x back to float and lose the low bits
+  if(l < x && x < 0xffffff8000000000)
+    u.u = u.u + 1;
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(ulong x)
+{
+  return __convert_float_rtz(x);
+}
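+/* Example for the int overload below: __convert_float_rtz(0x7fffffff) first
+ * rounds to 2147483648.0f (bit pattern 0x4f000000); since that overshoots a
+ * positive input, the pattern is decremented to 0x4effffff, i.e.
+ * 2147483520.0f, the largest float that does not exceed INT_MAX. */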
+INLINE_OVERLOADABLE float __convert_float_rtz(int x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long i = u.f;
+  if((i > x && x > 0) ||
+     (i < x && x < 0)) {
+      u.u -= 1;
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(int x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  int i = u.f;
+  if(i < x) {
+    if(x > 0)
+      u.u += 1;
+    else
+      u.u -= 1;
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(int x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long i = u.f;  // read back as long: INT_MAX rounds up to 2147483648.0f, which would not fit in an int
+  if(i > x) {
+    if(x > 0)
+      u.u = u.u - 1;
+    else
+      u.u = u.u + 1;
+  }
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtz(uint x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  ulong i = u.f;
+  if(i > x)
+    u.u -= 1;
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtp(uint x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  uint i = u.f;
+  if(i < x)
+    u.u += 1;
+  return u.f;
+}
+INLINE_OVERLOADABLE float __convert_float_rtn(uint x)
+{
+  return __convert_float_rtz(x);
+}
+
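+/*
+ * Explicitly rounded scalar conversions.  For integer destination types the
+ * rounding mode cannot matter, so the *_rte/rtz/rtp/rtn overloads below just
+ * return the argument and let the implicit conversion narrow it.  For a float
+ * destination, the rte overloads rely on round-to-nearest-even being the
+ * default conversion behaviour; the rtz/rtp/rtn overloads for long, ulong,
+ * int and uint use the __convert_float_* helpers above, while the narrower
+ * sources convert exactly and also just return x.
+ */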
+INLINE_OVERLOADABLE long convert_long_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(long x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(long x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(long x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(long x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtz(long x)
+{ return __convert_float_rtz(x); }
+INLINE_OVERLOADABLE float convert_float_rtp(long x)
+{ return __convert_float_rtp(x); }
+INLINE_OVERLOADABLE float convert_float_rtn(long x)
+{ return __convert_float_rtn(x); }
+INLINE_OVERLOADABLE long convert_long_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(ulong x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtz(ulong x)
+{ return __convert_float_rtz(x); }
+INLINE_OVERLOADABLE float convert_float_rtp(ulong x)
+{ return __convert_float_rtp(x); }
+INLINE_OVERLOADABLE float convert_float_rtn(ulong x)
+{ return __convert_float_rtn(x); }
+INLINE_OVERLOADABLE long convert_long_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(int x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(int x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(int x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(int x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtz(int x)
+{ return __convert_float_rtz(x); }
+INLINE_OVERLOADABLE float convert_float_rtp(int x)
+{ return __convert_float_rtp(x); }
+INLINE_OVERLOADABLE float convert_float_rtn(int x)
+{ return __convert_float_rtn(x); }
+INLINE_OVERLOADABLE long convert_long_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(uint x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(uint x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(uint x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtz(uint x)
+{ return __convert_float_rtz(x); }
+INLINE_OVERLOADABLE float convert_float_rtp(uint x)
+{ return __convert_float_rtp(x); }
+INLINE_OVERLOADABLE float convert_float_rtn(uint x)
+{ return __convert_float_rtn(x); }
+INLINE_OVERLOADABLE long convert_long_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(short x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtz(short x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtp(short x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtn(short x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtz(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtp(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtn(ushort x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(char x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtz(char x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtp(char x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtn(char x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE long convert_long_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE int convert_int_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uint convert_uint_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE short convert_short_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE char convert_char_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rte(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtz(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtp(uchar x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtn(uchar x)
+{ return x; }
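+/*
+ * Float sources: the explicitly rounded conversions round the float first,
+ * mapping rte to __gen_ocl_rnde, rtz to __gen_ocl_rndz, rtp to __gen_ocl_rndu
+ * and rtn to __gen_ocl_rndd (declared above); the implicit conversion on
+ * return then narrows the rounded value to the destination integer type.
+ * Presumably these built-ins are lowered to the hardware round instructions
+ * by the backend.
+ */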
+INLINE_OVERLOADABLE long convert_long_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE long convert_long_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE long convert_long_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE long convert_long_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE int convert_int_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE int convert_int_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE int convert_int_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE int convert_int_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE uint convert_uint_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE uint convert_uint_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE uint convert_uint_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE uint convert_uint_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE short convert_short_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE short convert_short_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE short convert_short_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE short convert_short_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE char convert_char_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE char convert_char_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE char convert_char_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE char convert_char_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_rte(float x)
+{ return __gen_ocl_rnde(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_rtz(float x)
+{ return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_rtp(float x)
+{ return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_rtn(float x)
+{ return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE float convert_float_rte(float x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtz(float x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtp(float x)
+{ return x; }
+INLINE_OVERLOADABLE float convert_float_rtn(float x)
+{ return x; }
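+/* The vector overloads below apply the matching scalar conversion to each
+ * component of the source vector. */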
+INLINE OVERLOADABLE long2 convert_long2_rte(long2 v) {
+  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(long2 v) {
+  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(long2 v) {
+  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(long2 v) {
+  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(long2 v) {
+  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(long2 v) {
+  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(long2 v) {
+  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(long2 v) {
+  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(long2 v) {
+  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(long2 v) {
+  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(long2 v) {
+  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(long2 v) {
+  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(long2 v) {
+  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(long2 v) {
+  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(long2 v) {
+  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(long2 v) {
+  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(long2 v) {
+  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(long2 v) {
+  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(long2 v) {
+  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(long2 v) {
+  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(long2 v) {
+  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(long2 v) {
+  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(long2 v) {
+  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(long2 v) {
+  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(long2 v) {
+  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(long2 v) {
+  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(long2 v) {
+  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(long2 v) {
+  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(long2 v) {
+  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(long2 v) {
+  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(long2 v) {
+  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(long2 v) {
+  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(long2 v) {
+  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(long2 v) {
+  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(long2 v) {
+  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(long2 v) {
+  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rte(ulong2 v) {
+  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(ulong2 v) {
+  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(ulong2 v) {
+  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(ulong2 v) {
+  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(ulong2 v) {
+  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(ulong2 v) {
+  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(ulong2 v) {
+  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(ulong2 v) {
+  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(ulong2 v) {
+  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(ulong2 v) {
+  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(ulong2 v) {
+  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(ulong2 v) {
+  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(ulong2 v) {
+  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(ulong2 v) {
+  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(ulong2 v) {
+  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(ulong2 v) {
+  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(ulong2 v) {
+  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(ulong2 v) {
+  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(ulong2 v) {
+  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(ulong2 v) {
+  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(ulong2 v) {
+  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(ulong2 v) {
+  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(ulong2 v) {
+  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(ulong2 v) {
+  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(ulong2 v) {
+  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(ulong2 v) {
+  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(ulong2 v) {
+  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(ulong2 v) {
+  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(ulong2 v) {
+  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(ulong2 v) {
+  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(ulong2 v) {
+  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(ulong2 v) {
+  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(ulong2 v) {
+  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(ulong2 v) {
+  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(ulong2 v) {
+  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(ulong2 v) {
+  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rte(int2 v) {
+  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(int2 v) {
+  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(int2 v) {
+  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(int2 v) {
+  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(int2 v) {
+  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(int2 v) {
+  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(int2 v) {
+  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(int2 v) {
+  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(int2 v) {
+  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(int2 v) {
+  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(int2 v) {
+  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(int2 v) {
+  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(int2 v) {
+  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(int2 v) {
+  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(int2 v) {
+  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(int2 v) {
+  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(int2 v) {
+  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(int2 v) {
+  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(int2 v) {
+  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(int2 v) {
+  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(int2 v) {
+  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(int2 v) {
+  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(int2 v) {
+  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(int2 v) {
+  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(int2 v) {
+  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(int2 v) {
+  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(int2 v) {
+  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(int2 v) {
+  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(int2 v) {
+  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(int2 v) {
+  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(int2 v) {
+  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(int2 v) {
+  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(int2 v) {
+  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(int2 v) {
+  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(int2 v) {
+  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(int2 v) {
+  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rte(uint2 v) {
+  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(uint2 v) {
+  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(uint2 v) {
+  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(uint2 v) {
+  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(uint2 v) {
+  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(uint2 v) {
+  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(uint2 v) {
+  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(uint2 v) {
+  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(uint2 v) {
+  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(uint2 v) {
+  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(uint2 v) {
+  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(uint2 v) {
+  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(uint2 v) {
+  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(uint2 v) {
+  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(uint2 v) {
+  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(uint2 v) {
+  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(uint2 v) {
+  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(uint2 v) {
+  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(uint2 v) {
+  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(uint2 v) {
+  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(uint2 v) {
+  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(uint2 v) {
+  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(uint2 v) {
+  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(uint2 v) {
+  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(uint2 v) {
+  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(uint2 v) {
+  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(uint2 v) {
+  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(uint2 v) {
+  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(uint2 v) {
+  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(uint2 v) {
+  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(uint2 v) {
+  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(uint2 v) {
+  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(uint2 v) {
+  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(uint2 v) {
+  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(uint2 v) {
+  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(uint2 v) {
+  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rte(short2 v) {
+  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(short2 v) {
+  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(short2 v) {
+  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(short2 v) {
+  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(short2 v) {
+  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(short2 v) {
+  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(short2 v) {
+  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(short2 v) {
+  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(short2 v) {
+  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(short2 v) {
+  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(short2 v) {
+  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(short2 v) {
+  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(short2 v) {
+  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(short2 v) {
+  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(short2 v) {
+  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(short2 v) {
+  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(short2 v) {
+  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(short2 v) {
+  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(short2 v) {
+  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(short2 v) {
+  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(short2 v) {
+  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(short2 v) {
+  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(short2 v) {
+  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(short2 v) {
+  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(short2 v) {
+  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(short2 v) {
+  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(short2 v) {
+  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(short2 v) {
+  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(short2 v) {
+  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(short2 v) {
+  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(short2 v) {
+  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(short2 v) {
+  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(short2 v) {
+  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(short2 v) {
+  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(short2 v) {
+  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(short2 v) {
+  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rte(ushort2 v) {
+  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(ushort2 v) {
+  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(ushort2 v) {
+  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(ushort2 v) {
+  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(ushort2 v) {
+  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(ushort2 v) {
+  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(ushort2 v) {
+  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(ushort2 v) {
+  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(ushort2 v) {
+  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(ushort2 v) {
+  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(ushort2 v) {
+  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(ushort2 v) {
+  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(ushort2 v) {
+  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(ushort2 v) {
+  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(ushort2 v) {
+  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(ushort2 v) {
+  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(ushort2 v) {
+  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(ushort2 v) {
+  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(ushort2 v) {
+  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(ushort2 v) {
+  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(ushort2 v) {
+  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(ushort2 v) {
+  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(ushort2 v) {
+  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(ushort2 v) {
+  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(ushort2 v) {
+  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(ushort2 v) {
+  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(ushort2 v) {
+  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(ushort2 v) {
+  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(ushort2 v) {
+  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(ushort2 v) {
+  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(ushort2 v) {
+  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(ushort2 v) {
+  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(ushort2 v) {
+  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(ushort2 v) {
+  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(ushort2 v) {
+  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(ushort2 v) {
+  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rte(char2 v) {
+  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(char2 v) {
+  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(char2 v) {
+  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(char2 v) {
+  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(char2 v) {
+  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(char2 v) {
+  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(char2 v) {
+  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(char2 v) {
+  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(char2 v) {
+  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(char2 v) {
+  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(char2 v) {
+  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(char2 v) {
+  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(char2 v) {
+  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(char2 v) {
+  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(char2 v) {
+  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(char2 v) {
+  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(char2 v) {
+  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(char2 v) {
+  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(char2 v) {
+  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(char2 v) {
+  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(char2 v) {
+  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(char2 v) {
+  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(char2 v) {
+  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(char2 v) {
+  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(char2 v) {
+  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(char2 v) {
+  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(char2 v) {
+  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(char2 v) {
+  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(char2 v) {
+  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(char2 v) {
+  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(char2 v) {
+  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(char2 v) {
+  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(char2 v) {
+  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(char2 v) {
+  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(char2 v) {
+  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(char2 v) {
+  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rte(uchar2 v) {
+  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(uchar2 v) {
+  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(uchar2 v) {
+  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(uchar2 v) {
+  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(uchar2 v) {
+  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(uchar2 v) {
+  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(uchar2 v) {
+  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(uchar2 v) {
+  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(uchar2 v) {
+  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(uchar2 v) {
+  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(uchar2 v) {
+  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(uchar2 v) {
+  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(uchar2 v) {
+  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(uchar2 v) {
+  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(uchar2 v) {
+  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(uchar2 v) {
+  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(uchar2 v) {
+  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(uchar2 v) {
+  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(uchar2 v) {
+  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(uchar2 v) {
+  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(uchar2 v) {
+  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(uchar2 v) {
+  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(uchar2 v) {
+  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(uchar2 v) {
+  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(uchar2 v) {
+  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(uchar2 v) {
+  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(uchar2 v) {
+  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(uchar2 v) {
+  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(uchar2 v) {
+  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(uchar2 v) {
+  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(uchar2 v) {
+  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(uchar2 v) {
+  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(uchar2 v) {
+  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(uchar2 v) {
+  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(uchar2 v) {
+  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(uchar2 v) {
+  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
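+/* The float2-source overloads that follow are the ones where the rounding
+ * suffix actually matters: _rte rounds to nearest even, _rtz toward zero,
+ * _rtp toward +infinity and _rtn toward -infinity when the float element is
+ * converted to an integer type.  For the integer-source overloads above the
+ * conversion is exact, so the suffix does not change the result. */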
+INLINE OVERLOADABLE long2 convert_long2_rte(float2 v) {
+  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtz(float2 v) {
+  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtp(float2 v) {
+  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_rtn(float2 v) {
+  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rte(float2 v) {
+  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(float2 v) {
+  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(float2 v) {
+  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(float2 v) {
+  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rte(float2 v) {
+  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtz(float2 v) {
+  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtp(float2 v) {
+  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_rtn(float2 v) {
+  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rte(float2 v) {
+  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtz(float2 v) {
+  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtp(float2 v) {
+  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_rtn(float2 v) {
+  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rte(float2 v) {
+  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtz(float2 v) {
+  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtp(float2 v) {
+  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_rtn(float2 v) {
+  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rte(float2 v) {
+  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(float2 v) {
+  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(float2 v) {
+  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(float2 v) {
+  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rte(float2 v) {
+  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtz(float2 v) {
+  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtp(float2 v) {
+  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_rtn(float2 v) {
+  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rte(float2 v) {
+  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(float2 v) {
+  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(float2 v) {
+  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(float2 v) {
+  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rte(float2 v) {
+  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtz(float2 v) {
+  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtp(float2 v) {
+  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE float2 convert_float2_rtn(float2 v) {
+  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
+}
+
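+/* 3-component variants: same componentwise pattern as the 2-component
+ * overloads above, delegating to the scalar convert_<type>_<mode> helpers
+ * for each of .s0, .s1 and .s2. */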
+INLINE OVERLOADABLE long3 convert_long3_rte(long3 v) {
+  return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(long3 v) {
+  return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(long3 v) {
+  return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(long3 v) {
+  return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(long3 v) {
+  return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(long3 v) {
+  return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(long3 v) {
+  return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(long3 v) {
+  return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(long3 v) {
+  return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(long3 v) {
+  return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(long3 v) {
+  return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(long3 v) {
+  return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(long3 v) {
+  return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(long3 v) {
+  return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(long3 v) {
+  return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(long3 v) {
+  return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(long3 v) {
+  return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(long3 v) {
+  return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(long3 v) {
+  return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(long3 v) {
+  return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(long3 v) {
+  return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(long3 v) {
+  return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(long3 v) {
+  return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(long3 v) {
+  return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(long3 v) {
+  return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(long3 v) {
+  return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(long3 v) {
+  return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(long3 v) {
+  return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(long3 v) {
+  return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(long3 v) {
+  return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(long3 v) {
+  return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(long3 v) {
+  return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(long3 v) {
+  return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(long3 v) {
+  return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(long3 v) {
+  return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(long3 v) {
+  return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(ulong3 v) {
+  return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(ulong3 v) {
+  return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(ulong3 v) {
+  return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(ulong3 v) {
+  return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(ulong3 v) {
+  return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(ulong3 v) {
+  return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(ulong3 v) {
+  return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(ulong3 v) {
+  return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(ulong3 v) {
+  return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(ulong3 v) {
+  return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(ulong3 v) {
+  return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(ulong3 v) {
+  return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(ulong3 v) {
+  return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(ulong3 v) {
+  return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(ulong3 v) {
+  return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(ulong3 v) {
+  return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(ulong3 v) {
+  return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(ulong3 v) {
+  return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(ulong3 v) {
+  return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(ulong3 v) {
+  return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(ulong3 v) {
+  return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(ulong3 v) {
+  return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(ulong3 v) {
+  return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(ulong3 v) {
+  return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(ulong3 v) {
+  return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(ulong3 v) {
+  return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(ulong3 v) {
+  return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(ulong3 v) {
+  return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(ulong3 v) {
+  return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(ulong3 v) {
+  return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(ulong3 v) {
+  return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(ulong3 v) {
+  return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(ulong3 v) {
+  return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(ulong3 v) {
+  return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(ulong3 v) {
+  return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(ulong3 v) {
+  return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(int3 v) {
+  return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(int3 v) {
+  return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(int3 v) {
+  return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(int3 v) {
+  return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(int3 v) {
+  return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(int3 v) {
+  return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(int3 v) {
+  return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(int3 v) {
+  return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(int3 v) {
+  return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(int3 v) {
+  return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(int3 v) {
+  return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(int3 v) {
+  return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(int3 v) {
+  return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(int3 v) {
+  return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(int3 v) {
+  return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(int3 v) {
+  return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(int3 v) {
+  return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(int3 v) {
+  return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(int3 v) {
+  return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(int3 v) {
+  return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(int3 v) {
+  return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(int3 v) {
+  return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(int3 v) {
+  return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(int3 v) {
+  return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(int3 v) {
+  return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(int3 v) {
+  return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(int3 v) {
+  return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(int3 v) {
+  return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(int3 v) {
+  return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(int3 v) {
+  return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(int3 v) {
+  return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(int3 v) {
+  return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(int3 v) {
+  return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(int3 v) {
+  return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(int3 v) {
+  return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(int3 v) {
+  return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(uint3 v) {
+  return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(uint3 v) {
+  return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(uint3 v) {
+  return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(uint3 v) {
+  return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(uint3 v) {
+  return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(uint3 v) {
+  return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(uint3 v) {
+  return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(uint3 v) {
+  return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(uint3 v) {
+  return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(uint3 v) {
+  return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(uint3 v) {
+  return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(uint3 v) {
+  return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(uint3 v) {
+  return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(uint3 v) {
+  return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(uint3 v) {
+  return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(uint3 v) {
+  return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(uint3 v) {
+  return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(uint3 v) {
+  return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(uint3 v) {
+  return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(uint3 v) {
+  return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(uint3 v) {
+  return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(uint3 v) {
+  return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(uint3 v) {
+  return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(uint3 v) {
+  return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(uint3 v) {
+  return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(uint3 v) {
+  return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(uint3 v) {
+  return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(uint3 v) {
+  return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(uint3 v) {
+  return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(uint3 v) {
+  return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(uint3 v) {
+  return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(uint3 v) {
+  return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(uint3 v) {
+  return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(uint3 v) {
+  return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(uint3 v) {
+  return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(uint3 v) {
+  return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(short3 v) {
+  return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(short3 v) {
+  return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(short3 v) {
+  return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(short3 v) {
+  return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(short3 v) {
+  return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(short3 v) {
+  return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(short3 v) {
+  return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(short3 v) {
+  return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(short3 v) {
+  return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(short3 v) {
+  return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(short3 v) {
+  return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(short3 v) {
+  return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(short3 v) {
+  return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(short3 v) {
+  return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(short3 v) {
+  return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(short3 v) {
+  return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(short3 v) {
+  return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(short3 v) {
+  return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(short3 v) {
+  return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(short3 v) {
+  return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(short3 v) {
+  return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(short3 v) {
+  return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(short3 v) {
+  return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(short3 v) {
+  return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(short3 v) {
+  return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(short3 v) {
+  return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(short3 v) {
+  return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(short3 v) {
+  return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(short3 v) {
+  return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(short3 v) {
+  return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(short3 v) {
+  return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(short3 v) {
+  return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(short3 v) {
+  return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(short3 v) {
+  return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(short3 v) {
+  return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(short3 v) {
+  return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(ushort3 v) {
+  return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(ushort3 v) {
+  return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(ushort3 v) {
+  return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(ushort3 v) {
+  return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(ushort3 v) {
+  return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(ushort3 v) {
+  return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(ushort3 v) {
+  return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(ushort3 v) {
+  return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(ushort3 v) {
+  return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(ushort3 v) {
+  return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(ushort3 v) {
+  return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(ushort3 v) {
+  return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(ushort3 v) {
+  return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(ushort3 v) {
+  return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(ushort3 v) {
+  return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(ushort3 v) {
+  return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(ushort3 v) {
+  return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(ushort3 v) {
+  return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(ushort3 v) {
+  return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(ushort3 v) {
+  return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(ushort3 v) {
+  return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(ushort3 v) {
+  return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(ushort3 v) {
+  return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(ushort3 v) {
+  return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(ushort3 v) {
+  return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(ushort3 v) {
+  return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(ushort3 v) {
+  return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(ushort3 v) {
+  return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(ushort3 v) {
+  return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(ushort3 v) {
+  return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(ushort3 v) {
+  return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(ushort3 v) {
+  return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(ushort3 v) {
+  return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(ushort3 v) {
+  return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(ushort3 v) {
+  return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(ushort3 v) {
+  return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(char3 v) {
+  return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(char3 v) {
+  return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(char3 v) {
+  return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(char3 v) {
+  return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(char3 v) {
+  return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(char3 v) {
+  return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(char3 v) {
+  return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(char3 v) {
+  return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(char3 v) {
+  return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(char3 v) {
+  return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(char3 v) {
+  return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(char3 v) {
+  return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(char3 v) {
+  return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(char3 v) {
+  return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(char3 v) {
+  return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(char3 v) {
+  return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(char3 v) {
+  return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(char3 v) {
+  return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(char3 v) {
+  return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(char3 v) {
+  return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(char3 v) {
+  return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(char3 v) {
+  return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(char3 v) {
+  return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(char3 v) {
+  return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(char3 v) {
+  return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(char3 v) {
+  return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(char3 v) {
+  return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(char3 v) {
+  return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(char3 v) {
+  return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(char3 v) {
+  return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(char3 v) {
+  return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(char3 v) {
+  return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(char3 v) {
+  return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(char3 v) {
+  return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(char3 v) {
+  return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(char3 v) {
+  return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(uchar3 v) {
+  return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(uchar3 v) {
+  return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(uchar3 v) {
+  return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(uchar3 v) {
+  return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(uchar3 v) {
+  return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(uchar3 v) {
+  return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(uchar3 v) {
+  return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(uchar3 v) {
+  return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(uchar3 v) {
+  return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(uchar3 v) {
+  return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(uchar3 v) {
+  return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(uchar3 v) {
+  return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(uchar3 v) {
+  return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(uchar3 v) {
+  return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(uchar3 v) {
+  return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(uchar3 v) {
+  return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(uchar3 v) {
+  return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(uchar3 v) {
+  return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(uchar3 v) {
+  return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(uchar3 v) {
+  return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(uchar3 v) {
+  return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(uchar3 v) {
+  return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(uchar3 v) {
+  return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(uchar3 v) {
+  return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(uchar3 v) {
+  return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(uchar3 v) {
+  return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(uchar3 v) {
+  return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(uchar3 v) {
+  return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(uchar3 v) {
+  return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(uchar3 v) {
+  return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(uchar3 v) {
+  return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(uchar3 v) {
+  return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(uchar3 v) {
+  return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(uchar3 v) {
+  return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(uchar3 v) {
+  return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(uchar3 v) {
+  return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rte(float3 v) {
+  return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtz(float3 v) {
+  return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtp(float3 v) {
+  return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_rtn(float3 v) {
+  return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rte(float3 v) {
+  return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(float3 v) {
+  return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(float3 v) {
+  return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(float3 v) {
+  return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rte(float3 v) {
+  return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtz(float3 v) {
+  return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtp(float3 v) {
+  return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_rtn(float3 v) {
+  return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rte(float3 v) {
+  return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtz(float3 v) {
+  return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtp(float3 v) {
+  return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_rtn(float3 v) {
+  return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rte(float3 v) {
+  return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtz(float3 v) {
+  return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtp(float3 v) {
+  return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_rtn(float3 v) {
+  return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rte(float3 v) {
+  return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(float3 v) {
+  return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(float3 v) {
+  return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(float3 v) {
+  return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rte(float3 v) {
+  return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtz(float3 v) {
+  return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtp(float3 v) {
+  return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_rtn(float3 v) {
+  return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rte(float3 v) {
+  return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(float3 v) {
+  return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(float3 v) {
+  return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(float3 v) {
+  return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rte(float3 v) {
+  return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtz(float3 v) {
+  return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtp(float3 v) {
+  return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE float3 convert_float3_rtn(float3 v) {
+  return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(long4 v) {
+  return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(long4 v) {
+  return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(long4 v) {
+  return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(long4 v) {
+  return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(long4 v) {
+  return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(long4 v) {
+  return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(long4 v) {
+  return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(long4 v) {
+  return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(long4 v) {
+  return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(long4 v) {
+  return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(long4 v) {
+  return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(long4 v) {
+  return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(long4 v) {
+  return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(long4 v) {
+  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(long4 v) {
+  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(long4 v) {
+  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(long4 v) {
+  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(long4 v) {
+  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(long4 v) {
+  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(long4 v) {
+  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(long4 v) {
+  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(long4 v) {
+  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(long4 v) {
+  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(long4 v) {
+  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(long4 v) {
+  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(long4 v) {
+  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(long4 v) {
+  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(long4 v) {
+  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(long4 v) {
+  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(long4 v) {
+  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(long4 v) {
+  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(long4 v) {
+  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(long4 v) {
+  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(long4 v) {
+  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(long4 v) {
+  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(long4 v) {
+  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(ulong4 v) {
+  return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(ulong4 v) {
+  return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(ulong4 v) {
+  return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(ulong4 v) {
+  return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(ulong4 v) {
+  return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(ulong4 v) {
+  return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(ulong4 v) {
+  return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(ulong4 v) {
+  return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(ulong4 v) {
+  return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(ulong4 v) {
+  return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(ulong4 v) {
+  return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(ulong4 v) {
+  return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(ulong4 v) {
+  return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(ulong4 v) {
+  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(ulong4 v) {
+  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(ulong4 v) {
+  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(ulong4 v) {
+  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(ulong4 v) {
+  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(ulong4 v) {
+  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(ulong4 v) {
+  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(ulong4 v) {
+  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(ulong4 v) {
+  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(ulong4 v) {
+  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(ulong4 v) {
+  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(ulong4 v) {
+  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(ulong4 v) {
+  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(ulong4 v) {
+  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(ulong4 v) {
+  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(ulong4 v) {
+  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(ulong4 v) {
+  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(ulong4 v) {
+  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(ulong4 v) {
+  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(ulong4 v) {
+  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(ulong4 v) {
+  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(ulong4 v) {
+  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(ulong4 v) {
+  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(int4 v) {
+  return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(int4 v) {
+  return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(int4 v) {
+  return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(int4 v) {
+  return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(int4 v) {
+  return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(int4 v) {
+  return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(int4 v) {
+  return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(int4 v) {
+  return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(int4 v) {
+  return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(int4 v) {
+  return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(int4 v) {
+  return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(int4 v) {
+  return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(int4 v) {
+  return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(int4 v) {
+  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(int4 v) {
+  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(int4 v) {
+  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(int4 v) {
+  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(int4 v) {
+  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(int4 v) {
+  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(int4 v) {
+  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(int4 v) {
+  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(int4 v) {
+  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(int4 v) {
+  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(int4 v) {
+  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(int4 v) {
+  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(int4 v) {
+  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(int4 v) {
+  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(int4 v) {
+  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(int4 v) {
+  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(int4 v) {
+  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(int4 v) {
+  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(int4 v) {
+  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(int4 v) {
+  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(int4 v) {
+  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(int4 v) {
+  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(int4 v) {
+  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(uint4 v) {
+  return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(uint4 v) {
+  return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(uint4 v) {
+  return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(uint4 v) {
+  return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(uint4 v) {
+  return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(uint4 v) {
+  return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(uint4 v) {
+  return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(uint4 v) {
+  return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(uint4 v) {
+  return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(uint4 v) {
+  return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(uint4 v) {
+  return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(uint4 v) {
+  return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(uint4 v) {
+  return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(uint4 v) {
+  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(uint4 v) {
+  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(uint4 v) {
+  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(uint4 v) {
+  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(uint4 v) {
+  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(uint4 v) {
+  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(uint4 v) {
+  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(uint4 v) {
+  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(uint4 v) {
+  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(uint4 v) {
+  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(uint4 v) {
+  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(uint4 v) {
+  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(uint4 v) {
+  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(uint4 v) {
+  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(uint4 v) {
+  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(uint4 v) {
+  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(uint4 v) {
+  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(uint4 v) {
+  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(uint4 v) {
+  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(uint4 v) {
+  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(uint4 v) {
+  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(uint4 v) {
+  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(uint4 v) {
+  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(short4 v) {
+  return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(short4 v) {
+  return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(short4 v) {
+  return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(short4 v) {
+  return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(short4 v) {
+  return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(short4 v) {
+  return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(short4 v) {
+  return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(short4 v) {
+  return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(short4 v) {
+  return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(short4 v) {
+  return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(short4 v) {
+  return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(short4 v) {
+  return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(short4 v) {
+  return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(short4 v) {
+  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(short4 v) {
+  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(short4 v) {
+  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(short4 v) {
+  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(short4 v) {
+  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(short4 v) {
+  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(short4 v) {
+  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(short4 v) {
+  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(short4 v) {
+  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(short4 v) {
+  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(short4 v) {
+  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(short4 v) {
+  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(short4 v) {
+  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(short4 v) {
+  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(short4 v) {
+  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(short4 v) {
+  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(short4 v) {
+  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(short4 v) {
+  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(short4 v) {
+  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(short4 v) {
+  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(short4 v) {
+  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(short4 v) {
+  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(short4 v) {
+  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(ushort4 v) {
+  return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(ushort4 v) {
+  return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(ushort4 v) {
+  return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(ushort4 v) {
+  return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(ushort4 v) {
+  return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(ushort4 v) {
+  return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(ushort4 v) {
+  return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(ushort4 v) {
+  return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(ushort4 v) {
+  return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(ushort4 v) {
+  return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(ushort4 v) {
+  return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(ushort4 v) {
+  return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(ushort4 v) {
+  return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(ushort4 v) {
+  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(ushort4 v) {
+  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(ushort4 v) {
+  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(ushort4 v) {
+  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(ushort4 v) {
+  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(ushort4 v) {
+  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(ushort4 v) {
+  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(ushort4 v) {
+  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(ushort4 v) {
+  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(ushort4 v) {
+  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(ushort4 v) {
+  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(ushort4 v) {
+  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(ushort4 v) {
+  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(ushort4 v) {
+  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(ushort4 v) {
+  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(ushort4 v) {
+  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(ushort4 v) {
+  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(ushort4 v) {
+  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(ushort4 v) {
+  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(ushort4 v) {
+  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(ushort4 v) {
+  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(ushort4 v) {
+  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(ushort4 v) {
+  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(char4 v) {
+  return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(char4 v) {
+  return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(char4 v) {
+  return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(char4 v) {
+  return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(char4 v) {
+  return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(char4 v) {
+  return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(char4 v) {
+  return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(char4 v) {
+  return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(char4 v) {
+  return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(char4 v) {
+  return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(char4 v) {
+  return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(char4 v) {
+  return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(char4 v) {
+  return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(char4 v) {
+  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(char4 v) {
+  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(char4 v) {
+  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(char4 v) {
+  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(char4 v) {
+  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(char4 v) {
+  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(char4 v) {
+  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(char4 v) {
+  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(char4 v) {
+  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(char4 v) {
+  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(char4 v) {
+  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(char4 v) {
+  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(char4 v) {
+  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(char4 v) {
+  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(char4 v) {
+  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(char4 v) {
+  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(char4 v) {
+  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(char4 v) {
+  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(char4 v) {
+  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(char4 v) {
+  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(char4 v) {
+  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(char4 v) {
+  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(char4 v) {
+  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(uchar4 v) {
+  return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(uchar4 v) {
+  return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(uchar4 v) {
+  return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(uchar4 v) {
+  return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(uchar4 v) {
+  return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(uchar4 v) {
+  return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(uchar4 v) {
+  return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(uchar4 v) {
+  return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(uchar4 v) {
+  return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(uchar4 v) {
+  return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(uchar4 v) {
+  return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(uchar4 v) {
+  return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(uchar4 v) {
+  return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(uchar4 v) {
+  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(uchar4 v) {
+  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(uchar4 v) {
+  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(uchar4 v) {
+  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(uchar4 v) {
+  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(uchar4 v) {
+  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(uchar4 v) {
+  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(uchar4 v) {
+  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(uchar4 v) {
+  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(uchar4 v) {
+  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(uchar4 v) {
+  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(uchar4 v) {
+  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(uchar4 v) {
+  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(uchar4 v) {
+  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(uchar4 v) {
+  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(uchar4 v) {
+  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(uchar4 v) {
+  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(uchar4 v) {
+  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(uchar4 v) {
+  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(uchar4 v) {
+  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(uchar4 v) {
+  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(uchar4 v) {
+  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(uchar4 v) {
+  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rte(float4 v) {
+  return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtz(float4 v) {
+  return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtp(float4 v) {
+  return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_rtn(float4 v) {
+  return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rte(float4 v) {
+  return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(float4 v) {
+  return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(float4 v) {
+  return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(float4 v) {
+  return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rte(float4 v) {
+  return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtz(float4 v) {
+  return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtp(float4 v) {
+  return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_rtn(float4 v) {
+  return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rte(float4 v) {
+  return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtz(float4 v) {
+  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtp(float4 v) {
+  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_rtn(float4 v) {
+  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rte(float4 v) {
+  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtz(float4 v) {
+  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtp(float4 v) {
+  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_rtn(float4 v) {
+  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rte(float4 v) {
+  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(float4 v) {
+  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(float4 v) {
+  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(float4 v) {
+  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rte(float4 v) {
+  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtz(float4 v) {
+  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtp(float4 v) {
+  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_rtn(float4 v) {
+  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rte(float4 v) {
+  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(float4 v) {
+  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(float4 v) {
+  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(float4 v) {
+  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rte(float4 v) {
+  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtz(float4 v) {
+  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtp(float4 v) {
+  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE float4 convert_float4_rtn(float4 v) {
+  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(long8 v) {
+  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(long8 v) {
+  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(long8 v) {
+  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(long8 v) {
+  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(long8 v) {
+  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(long8 v) {
+  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(long8 v) {
+  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(long8 v) {
+  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(long8 v) {
+  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(long8 v) {
+  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(long8 v) {
+  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(long8 v) {
+  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(long8 v) {
+  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(long8 v) {
+  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(long8 v) {
+  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(long8 v) {
+  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(long8 v) {
+  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(long8 v) {
+  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(long8 v) {
+  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(long8 v) {
+  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(long8 v) {
+  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(long8 v) {
+  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(long8 v) {
+  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(long8 v) {
+  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(long8 v) {
+  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(long8 v) {
+  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(long8 v) {
+  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(long8 v) {
+  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(long8 v) {
+  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(long8 v) {
+  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(long8 v) {
+  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(long8 v) {
+  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(long8 v) {
+  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(long8 v) {
+  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(long8 v) {
+  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(long8 v) {
+  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(ulong8 v) {
+  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(ulong8 v) {
+  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(ulong8 v) {
+  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(ulong8 v) {
+  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(ulong8 v) {
+  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(ulong8 v) {
+  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(ulong8 v) {
+  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(ulong8 v) {
+  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(ulong8 v) {
+  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(ulong8 v) {
+  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(ulong8 v) {
+  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(ulong8 v) {
+  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(ulong8 v) {
+  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(ulong8 v) {
+  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(ulong8 v) {
+  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(ulong8 v) {
+  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(ulong8 v) {
+  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(ulong8 v) {
+  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(ulong8 v) {
+  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(ulong8 v) {
+  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(ulong8 v) {
+  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(ulong8 v) {
+  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(ulong8 v) {
+  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(ulong8 v) {
+  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(ulong8 v) {
+  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(ulong8 v) {
+  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(ulong8 v) {
+  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(ulong8 v) {
+  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(ulong8 v) {
+  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(ulong8 v) {
+  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(ulong8 v) {
+  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(ulong8 v) {
+  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(ulong8 v) {
+  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(ulong8 v) {
+  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(ulong8 v) {
+  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(ulong8 v) {
+  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(int8 v) {
+  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(int8 v) {
+  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(int8 v) {
+  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(int8 v) {
+  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(int8 v) {
+  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(int8 v) {
+  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(int8 v) {
+  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(int8 v) {
+  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(int8 v) {
+  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(int8 v) {
+  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(int8 v) {
+  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(int8 v) {
+  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(int8 v) {
+  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(int8 v) {
+  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(int8 v) {
+  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(int8 v) {
+  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(int8 v) {
+  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(int8 v) {
+  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(int8 v) {
+  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(int8 v) {
+  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(int8 v) {
+  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(int8 v) {
+  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(int8 v) {
+  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(int8 v) {
+  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(int8 v) {
+  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(int8 v) {
+  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(int8 v) {
+  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(int8 v) {
+  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(int8 v) {
+  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(int8 v) {
+  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(int8 v) {
+  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(int8 v) {
+  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(int8 v) {
+  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(int8 v) {
+  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(int8 v) {
+  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(int8 v) {
+  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(uint8 v) {
+  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(uint8 v) {
+  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(uint8 v) {
+  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(uint8 v) {
+  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(uint8 v) {
+  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(uint8 v) {
+  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(uint8 v) {
+  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(uint8 v) {
+  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(uint8 v) {
+  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(uint8 v) {
+  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(uint8 v) {
+  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(uint8 v) {
+  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(uint8 v) {
+  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(uint8 v) {
+  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(uint8 v) {
+  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(uint8 v) {
+  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(uint8 v) {
+  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(uint8 v) {
+  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(uint8 v) {
+  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(uint8 v) {
+  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(uint8 v) {
+  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(uint8 v) {
+  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(uint8 v) {
+  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(uint8 v) {
+  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(uint8 v) {
+  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(uint8 v) {
+  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(uint8 v) {
+  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(uint8 v) {
+  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(uint8 v) {
+  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(uint8 v) {
+  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(uint8 v) {
+  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(uint8 v) {
+  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(uint8 v) {
+  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(uint8 v) {
+  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(uint8 v) {
+  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(uint8 v) {
+  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(short8 v) {
+  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(short8 v) {
+  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(short8 v) {
+  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(short8 v) {
+  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(short8 v) {
+  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(short8 v) {
+  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(short8 v) {
+  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(short8 v) {
+  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(short8 v) {
+  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(short8 v) {
+  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(short8 v) {
+  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(short8 v) {
+  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(short8 v) {
+  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(short8 v) {
+  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(short8 v) {
+  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(short8 v) {
+  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(short8 v) {
+  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(short8 v) {
+  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(short8 v) {
+  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(short8 v) {
+  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(short8 v) {
+  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(short8 v) {
+  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(short8 v) {
+  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(short8 v) {
+  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(short8 v) {
+  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(short8 v) {
+  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(short8 v) {
+  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(short8 v) {
+  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(short8 v) {
+  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(short8 v) {
+  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(short8 v) {
+  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(short8 v) {
+  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(short8 v) {
+  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(short8 v) {
+  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(short8 v) {
+  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(short8 v) {
+  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(ushort8 v) {
+  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(ushort8 v) {
+  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(ushort8 v) {
+  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(ushort8 v) {
+  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(ushort8 v) {
+  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(ushort8 v) {
+  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(ushort8 v) {
+  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(ushort8 v) {
+  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(ushort8 v) {
+  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(ushort8 v) {
+  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(ushort8 v) {
+  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(ushort8 v) {
+  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(ushort8 v) {
+  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(ushort8 v) {
+  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(ushort8 v) {
+  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(ushort8 v) {
+  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(ushort8 v) {
+  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(ushort8 v) {
+  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(ushort8 v) {
+  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(ushort8 v) {
+  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(ushort8 v) {
+  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(ushort8 v) {
+  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(ushort8 v) {
+  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(ushort8 v) {
+  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(ushort8 v) {
+  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(ushort8 v) {
+  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(ushort8 v) {
+  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(ushort8 v) {
+  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(ushort8 v) {
+  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(ushort8 v) {
+  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(ushort8 v) {
+  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(ushort8 v) {
+  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(ushort8 v) {
+  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(ushort8 v) {
+  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(ushort8 v) {
+  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(ushort8 v) {
+  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(char8 v) {
+  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(char8 v) {
+  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(char8 v) {
+  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(char8 v) {
+  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(char8 v) {
+  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(char8 v) {
+  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(char8 v) {
+  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(char8 v) {
+  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(char8 v) {
+  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(char8 v) {
+  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(char8 v) {
+  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(char8 v) {
+  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(char8 v) {
+  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(char8 v) {
+  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(char8 v) {
+  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(char8 v) {
+  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(char8 v) {
+  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(char8 v) {
+  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(char8 v) {
+  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(char8 v) {
+  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(char8 v) {
+  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(char8 v) {
+  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(char8 v) {
+  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(char8 v) {
+  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(char8 v) {
+  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(char8 v) {
+  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(char8 v) {
+  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(char8 v) {
+  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(char8 v) {
+  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(char8 v) {
+  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(char8 v) {
+  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(char8 v) {
+  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(char8 v) {
+  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(char8 v) {
+  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(char8 v) {
+  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(char8 v) {
+  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(uchar8 v) {
+  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(uchar8 v) {
+  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(uchar8 v) {
+  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(uchar8 v) {
+  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(uchar8 v) {
+  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(uchar8 v) {
+  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(uchar8 v) {
+  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(uchar8 v) {
+  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(uchar8 v) {
+  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(uchar8 v) {
+  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(uchar8 v) {
+  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(uchar8 v) {
+  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(uchar8 v) {
+  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(uchar8 v) {
+  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(uchar8 v) {
+  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(uchar8 v) {
+  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(uchar8 v) {
+  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(uchar8 v) {
+  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(uchar8 v) {
+  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(uchar8 v) {
+  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(uchar8 v) {
+  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(uchar8 v) {
+  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(uchar8 v) {
+  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(uchar8 v) {
+  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(uchar8 v) {
+  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(uchar8 v) {
+  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(uchar8 v) {
+  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(uchar8 v) {
+  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(uchar8 v) {
+  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(uchar8 v) {
+  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(uchar8 v) {
+  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(uchar8 v) {
+  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(uchar8 v) {
+  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(uchar8 v) {
+  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(uchar8 v) {
+  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(uchar8 v) {
+  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rte(float8 v) {
+  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtz(float8 v) {
+  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtp(float8 v) {
+  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_rtn(float8 v) {
+  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rte(float8 v) {
+  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(float8 v) {
+  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(float8 v) {
+  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(float8 v) {
+  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rte(float8 v) {
+  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtz(float8 v) {
+  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtp(float8 v) {
+  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_rtn(float8 v) {
+  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rte(float8 v) {
+  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtz(float8 v) {
+  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtp(float8 v) {
+  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_rtn(float8 v) {
+  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rte(float8 v) {
+  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtz(float8 v) {
+  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtp(float8 v) {
+  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_rtn(float8 v) {
+  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rte(float8 v) {
+  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(float8 v) {
+  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(float8 v) {
+  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(float8 v) {
+  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rte(float8 v) {
+  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtz(float8 v) {
+  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtp(float8 v) {
+  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_rtn(float8 v) {
+  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rte(float8 v) {
+  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(float8 v) {
+  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(float8 v) {
+  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(float8 v) {
+  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rte(float8 v) {
+  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtz(float8 v) {
+  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtp(float8 v) {
+  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE float8 convert_float8_rtn(float8 v) {
+  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rte(long16 v) {
+  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(long16 v) {
+  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(long16 v) {
+  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(long16 v) {
+  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(long16 v) {
+  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(long16 v) {
+  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(long16 v) {
+  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(long16 v) {
+  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(long16 v) {
+  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(long16 v) {
+  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(long16 v) {
+  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(long16 v) {
+  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(long16 v) {
+  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(long16 v) {
+  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(long16 v) {
+  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(long16 v) {
+  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(long16 v) {
+  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(long16 v) {
+  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(long16 v) {
+  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(long16 v) {
+  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(long16 v) {
+  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(long16 v) {
+  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(long16 v) {
+  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(long16 v) {
+  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(long16 v) {
+  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(long16 v) {
+  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(long16 v) {
+  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(long16 v) {
+  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(long16 v) {
+  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(long16 v) {
+  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(long16 v) {
+  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(long16 v) {
+  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(long16 v) {
+  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(long16 v) {
+  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(long16 v) {
+  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(long16 v) {
+  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
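+/* The same pattern repeated for ulong16 sources: each of the sixteen components is
+ * converted with the corresponding scalar rounding-mode conversion. */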
+INLINE OVERLOADABLE long16 convert_long16_rte(ulong16 v) {
+  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(ulong16 v) {
+  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(ulong16 v) {
+  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(ulong16 v) {
+  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(ulong16 v) {
+  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(ulong16 v) {
+  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(ulong16 v) {
+  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(ulong16 v) {
+  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(ulong16 v) {
+  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(ulong16 v) {
+  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(ulong16 v) {
+  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(ulong16 v) {
+  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(ulong16 v) {
+  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(ulong16 v) {
+  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(ulong16 v) {
+  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(ulong16 v) {
+  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(ulong16 v) {
+  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(ulong16 v) {
+  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(ulong16 v) {
+  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(ulong16 v) {
+  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(ulong16 v) {
+  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(ulong16 v) {
+  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(ulong16 v) {
+  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(ulong16 v) {
+  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(ulong16 v) {
+  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(ulong16 v) {
+  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(ulong16 v) {
+  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(ulong16 v) {
+  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(ulong16 v) {
+  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(ulong16 v) {
+  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(ulong16 v) {
+  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(ulong16 v) {
+  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(ulong16 v) {
+  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(ulong16 v) {
+  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(ulong16 v) {
+  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(ulong16 v) {
+  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
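+/* Component-wise rounding-mode conversions taking int16 sources. */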
+INLINE OVERLOADABLE long16 convert_long16_rte(int16 v) {
+  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(int16 v) {
+  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(int16 v) {
+  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(int16 v) {
+  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(int16 v) {
+  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(int16 v) {
+  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(int16 v) {
+  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(int16 v) {
+  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(int16 v) {
+  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(int16 v) {
+  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(int16 v) {
+  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(int16 v) {
+  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(int16 v) {
+  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(int16 v) {
+  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(int16 v) {
+  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(int16 v) {
+  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(int16 v) {
+  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(int16 v) {
+  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(int16 v) {
+  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(int16 v) {
+  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(int16 v) {
+  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(int16 v) {
+  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(int16 v) {
+  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(int16 v) {
+  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(int16 v) {
+  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(int16 v) {
+  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(int16 v) {
+  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(int16 v) {
+  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(int16 v) {
+  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(int16 v) {
+  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(int16 v) {
+  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(int16 v) {
+  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(int16 v) {
+  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(int16 v) {
+  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(int16 v) {
+  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(int16 v) {
+  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
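+/* Component-wise rounding-mode conversions taking uint16 sources. */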
+INLINE OVERLOADABLE long16 convert_long16_rte(uint16 v) {
+  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(uint16 v) {
+  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(uint16 v) {
+  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(uint16 v) {
+  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(uint16 v) {
+  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(uint16 v) {
+  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(uint16 v) {
+  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(uint16 v) {
+  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(uint16 v) {
+  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(uint16 v) {
+  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(uint16 v) {
+  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(uint16 v) {
+  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(uint16 v) {
+  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(uint16 v) {
+  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(uint16 v) {
+  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(uint16 v) {
+  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(uint16 v) {
+  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(uint16 v) {
+  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(uint16 v) {
+  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(uint16 v) {
+  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(uint16 v) {
+  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(uint16 v) {
+  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(uint16 v) {
+  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(uint16 v) {
+  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(uint16 v) {
+  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(uint16 v) {
+  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(uint16 v) {
+  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(uint16 v) {
+  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(uint16 v) {
+  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(uint16 v) {
+  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(uint16 v) {
+  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(uint16 v) {
+  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(uint16 v) {
+  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(uint16 v) {
+  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(uint16 v) {
+  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(uint16 v) {
+  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
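+/* Conversions from short16: each of the 16 lanes is passed through the scalar convert_<dst>_<mode> overload for the same destination type and rounding mode (rte, rtz, rtp, rtn). */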
+INLINE OVERLOADABLE long16 convert_long16_rte(short16 v) {
+  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(short16 v) {
+  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(short16 v) {
+  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(short16 v) {
+  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(short16 v) {
+  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(short16 v) {
+  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(short16 v) {
+  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(short16 v) {
+  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(short16 v) {
+  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(short16 v) {
+  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(short16 v) {
+  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(short16 v) {
+  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(short16 v) {
+  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(short16 v) {
+  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(short16 v) {
+  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(short16 v) {
+  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(short16 v) {
+  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(short16 v) {
+  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(short16 v) {
+  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(short16 v) {
+  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(short16 v) {
+  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(short16 v) {
+  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(short16 v) {
+  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(short16 v) {
+  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(short16 v) {
+  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(short16 v) {
+  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(short16 v) {
+  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(short16 v) {
+  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(short16 v) {
+  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(short16 v) {
+  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(short16 v) {
+  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(short16 v) {
+  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(short16 v) {
+  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(short16 v) {
+  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(short16 v) {
+  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(short16 v) {
+  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
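+/* Conversions from ushort16, following the same per-lane pattern as above. */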
+INLINE OVERLOADABLE long16 convert_long16_rte(ushort16 v) {
+  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(ushort16 v) {
+  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(ushort16 v) {
+  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(ushort16 v) {
+  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(ushort16 v) {
+  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(ushort16 v) {
+  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(ushort16 v) {
+  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(ushort16 v) {
+  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(ushort16 v) {
+  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(ushort16 v) {
+  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(ushort16 v) {
+  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(ushort16 v) {
+  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(ushort16 v) {
+  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(ushort16 v) {
+  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(ushort16 v) {
+  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(ushort16 v) {
+  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(ushort16 v) {
+  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(ushort16 v) {
+  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(ushort16 v) {
+  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(ushort16 v) {
+  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(ushort16 v) {
+  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(ushort16 v) {
+  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(ushort16 v) {
+  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(ushort16 v) {
+  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(ushort16 v) {
+  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(ushort16 v) {
+  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(ushort16 v) {
+  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(ushort16 v) {
+  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(ushort16 v) {
+  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(ushort16 v) {
+  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(ushort16 v) {
+  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(ushort16 v) {
+  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(ushort16 v) {
+  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(ushort16 v) {
+  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(ushort16 v) {
+  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(ushort16 v) {
+  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
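+/* Conversions from char16, following the same per-lane pattern as above. */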
+INLINE OVERLOADABLE long16 convert_long16_rte(char16 v) {
+  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(char16 v) {
+  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(char16 v) {
+  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(char16 v) {
+  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(char16 v) {
+  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(char16 v) {
+  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(char16 v) {
+  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(char16 v) {
+  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(char16 v) {
+  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(char16 v) {
+  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(char16 v) {
+  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(char16 v) {
+  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(char16 v) {
+  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(char16 v) {
+  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(char16 v) {
+  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(char16 v) {
+  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(char16 v) {
+  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(char16 v) {
+  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(char16 v) {
+  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(char16 v) {
+  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(char16 v) {
+  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(char16 v) {
+  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(char16 v) {
+  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(char16 v) {
+  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(char16 v) {
+  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(char16 v) {
+  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(char16 v) {
+  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(char16 v) {
+  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(char16 v) {
+  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(char16 v) {
+  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(char16 v) {
+  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(char16 v) {
+  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(char16 v) {
+  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(char16 v) {
+  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(char16 v) {
+  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(char16 v) {
+  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rte(uchar16 v) {
+  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(uchar16 v) {
+  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(uchar16 v) {
+  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(uchar16 v) {
+  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(uchar16 v) {
+  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(uchar16 v) {
+  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(uchar16 v) {
+  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(uchar16 v) {
+  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(uchar16 v) {
+  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(uchar16 v) {
+  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(uchar16 v) {
+  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(uchar16 v) {
+  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(uchar16 v) {
+  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(uchar16 v) {
+  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(uchar16 v) {
+  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(uchar16 v) {
+  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(uchar16 v) {
+  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(uchar16 v) {
+  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(uchar16 v) {
+  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(uchar16 v) {
+  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(uchar16 v) {
+  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(uchar16 v) {
+  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(uchar16 v) {
+  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(uchar16 v) {
+  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(uchar16 v) {
+  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(uchar16 v) {
+  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(uchar16 v) {
+  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(uchar16 v) {
+  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(uchar16 v) {
+  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(uchar16 v) {
+  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(uchar16 v) {
+  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(uchar16 v) {
+  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(uchar16 v) {
+  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(uchar16 v) {
+  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(uchar16 v) {
+  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(uchar16 v) {
+  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
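+/* Descriptive note (editorial, not part of the original generator output):
+ * the 16-wide overloads below take a floating-point source and expand
+ * component-wise (.s0 .. .sF) to the matching scalar
+ * convert_<dst>_rte/rtz/rtp/rtn builtin, so the rounding suffix decides how
+ * fractional float values map to the integer destination types. */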
+INLINE OVERLOADABLE long16 convert_long16_rte(float16 v) {
+  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtz(float16 v) {
+  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtp(float16 v) {
+  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_rtn(float16 v) {
+  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rte(float16 v) {
+  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(float16 v) {
+  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(float16 v) {
+  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(float16 v) {
+  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rte(float16 v) {
+  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtz(float16 v) {
+  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtp(float16 v) {
+  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_rtn(float16 v) {
+  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rte(float16 v) {
+  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtz(float16 v) {
+  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtp(float16 v) {
+  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_rtn(float16 v) {
+  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rte(float16 v) {
+  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtz(float16 v) {
+  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtp(float16 v) {
+  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_rtn(float16 v) {
+  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rte(float16 v) {
+  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(float16 v) {
+  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(float16 v) {
+  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(float16 v) {
+  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rte(float16 v) {
+  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtz(float16 v) {
+  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtp(float16 v) {
+  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_rtn(float16 v) {
+  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rte(float16 v) {
+  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(float16 v) {
+  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(float16 v) {
+  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(float16 v) {
+  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rte(float16 v) {
+  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtz(float16 v) {
+  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtp(float16 v) {
+  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE float16 convert_float16_rtn(float16 v) {
+  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
+}
+
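+/* Descriptive note (editorial, not part of the original generator output):
+ * these scalar saturated conversions take integer sources, where the rounding
+ * mode cannot change the result, so each _rte/_rtz/_rtp/_rtn variant simply
+ * forwards to the corresponding convert_<dst>_sat() overload. */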
+INLINE_OVERLOADABLE long convert_long_sat_rte(long x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(long x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(long x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(long x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(long x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(long x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(long x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(long x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(long x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(long x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(long x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(long x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(long x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(long x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(long x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(long x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(long x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(long x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(long x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(long x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(long x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(long x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(long x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(long x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(long x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(long x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(long x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(long x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(long x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(long x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(long x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(long x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(ulong x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(ulong x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(ulong x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(ulong x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(ulong x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(ulong x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(ulong x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(ulong x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(ulong x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(ulong x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(ulong x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(ulong x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(ulong x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(ulong x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(ulong x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(ulong x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(ulong x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(ulong x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(ulong x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(ulong x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(ulong x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(ulong x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(ulong x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(ulong x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(ulong x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(ulong x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(ulong x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(ulong x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(ulong x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(ulong x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(ulong x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(ulong x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(int x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(int x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(int x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(int x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(int x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(int x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(int x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(int x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(int x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(int x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(int x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(int x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(int x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(int x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(int x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(int x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(int x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(int x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(int x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(int x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(int x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(int x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(int x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(int x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(int x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(int x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(int x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(int x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(int x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(int x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(int x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(int x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(uint x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(uint x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(uint x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(uint x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(uint x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(uint x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(uint x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(uint x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(uint x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(uint x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(uint x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(uint x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(uint x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(uint x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(uint x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(uint x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(uint x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(uint x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(uint x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(uint x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(uint x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(uint x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(uint x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(uint x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(uint x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(uint x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(uint x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(uint x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(uint x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(uint x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(uint x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(uint x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(short x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(short x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(short x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(short x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(short x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(short x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(short x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(short x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(short x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(short x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(short x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(short x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(short x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(short x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(short x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(short x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(short x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(short x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(short x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(short x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(short x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(short x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(short x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(short x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(short x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(short x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(short x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(short x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(short x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(short x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(short x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(short x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(ushort x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(ushort x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(ushort x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(ushort x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(ushort x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(ushort x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(ushort x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(ushort x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(ushort x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(ushort x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(ushort x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(ushort x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(ushort x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(ushort x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(ushort x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(ushort x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(ushort x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(ushort x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(ushort x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(ushort x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(ushort x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(ushort x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(ushort x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(ushort x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(ushort x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(ushort x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(ushort x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(ushort x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(ushort x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(ushort x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(ushort x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(ushort x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(char x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(char x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(char x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(char x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(char x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(char x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(char x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(char x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(char x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(char x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(char x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(char x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(char x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(char x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(char x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(char x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(char x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(char x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(char x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(char x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(char x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(char x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(char x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(char x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(char x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(char x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(char x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(char x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(char x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(char x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(char x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(char x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rte(uchar x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(uchar x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(uchar x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(uchar x)
+{ return convert_long_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(uchar x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(uchar x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(uchar x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(uchar x)
+{ return convert_ulong_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(uchar x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(uchar x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(uchar x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(uchar x)
+{ return convert_int_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(uchar x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(uchar x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(uchar x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(uchar x)
+{ return convert_uint_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(uchar x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(uchar x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(uchar x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(uchar x)
+{ return convert_short_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(uchar x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(uchar x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(uchar x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(uchar x)
+{ return convert_ushort_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(uchar x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(uchar x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(uchar x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(uchar x)
+{ return convert_char_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(uchar x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(uchar x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(uchar x)
+{ return convert_uchar_sat(x); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(uchar x)
+{ return convert_uchar_sat(x); }
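+/* Float sources: each rounding variant applies the matching __gen_ocl_rnd*
+   intrinsic (rnde/rndz/rndu/rndd for rte/rtz/rtp/rtn) and then performs the
+   saturating conversion on the rounded value. */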
+INLINE_OVERLOADABLE long convert_long_sat_rte(float x)
+{ return convert_long_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE long convert_long_sat_rtz(float x)
+{ return convert_long_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE long convert_long_sat_rtp(float x)
+{ return convert_long_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE long convert_long_sat_rtn(float x)
+{ return convert_long_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(float x)
+{ return convert_ulong_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(float x)
+{ return convert_ulong_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(float x)
+{ return convert_ulong_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(float x)
+{ return convert_ulong_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE int convert_int_sat_rte(float x)
+{ return convert_int_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE int convert_int_sat_rtz(float x)
+{ return convert_int_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE int convert_int_sat_rtp(float x)
+{ return convert_int_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE int convert_int_sat_rtn(float x)
+{ return convert_int_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rte(float x)
+{ return convert_uint_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtz(float x)
+{ return convert_uint_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtp(float x)
+{ return convert_uint_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE uint convert_uint_sat_rtn(float x)
+{ return convert_uint_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE short convert_short_sat_rte(float x)
+{ return convert_short_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE short convert_short_sat_rtz(float x)
+{ return convert_short_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE short convert_short_sat_rtp(float x)
+{ return convert_short_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE short convert_short_sat_rtn(float x)
+{ return convert_short_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(float x)
+{ return convert_ushort_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(float x)
+{ return convert_ushort_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(float x)
+{ return convert_ushort_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(float x)
+{ return convert_ushort_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE char convert_char_sat_rte(float x)
+{ return convert_char_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE char convert_char_sat_rtz(float x)
+{ return convert_char_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE char convert_char_sat_rtp(float x)
+{ return convert_char_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE char convert_char_sat_rtn(float x)
+{ return convert_char_sat(__gen_ocl_rndd(x)); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(float x)
+{ return convert_uchar_sat(__gen_ocl_rnde(x)); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(float x)
+{ return convert_uchar_sat(__gen_ocl_rndz(x)); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(float x)
+{ return convert_uchar_sat(__gen_ocl_rndu(x)); }
+INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(float x)
+{ return convert_uchar_sat(__gen_ocl_rndd(x)); }
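+/* Two-element vector variants: apply the corresponding scalar
+   convert_*_sat_rt* conversion to each component. */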
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(long2 v) {
+  return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(long2 v) {
+  return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(long2 v) {
+  return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(long2 v) {
+  return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(long2 v) {
+  return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(long2 v) {
+  return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(long2 v) {
+  return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(long2 v) {
+  return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(long2 v) {
+  return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(long2 v) {
+  return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(long2 v) {
+  return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(long2 v) {
+  return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(long2 v) {
+  return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(long2 v) {
+  return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(long2 v) {
+  return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(long2 v) {
+  return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(long2 v) {
+  return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(long2 v) {
+  return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(long2 v) {
+  return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(long2 v) {
+  return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(long2 v) {
+  return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(long2 v) {
+  return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(long2 v) {
+  return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(long2 v) {
+  return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(long2 v) {
+  return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(long2 v) {
+  return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(long2 v) {
+  return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(long2 v) {
+  return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(long2 v) {
+  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(long2 v) {
+  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(long2 v) {
+  return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(long2 v) {
+  return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(ulong2 v) {
+  return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(ulong2 v) {
+  return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(ulong2 v) {
+  return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(ulong2 v) {
+  return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(ulong2 v) {
+  return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(ulong2 v) {
+  return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(ulong2 v) {
+  return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(ulong2 v) {
+  return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(ulong2 v) {
+  return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(ulong2 v) {
+  return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(ulong2 v) {
+  return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(ulong2 v) {
+  return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(ulong2 v) {
+  return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(ulong2 v) {
+  return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(ulong2 v) {
+  return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(ulong2 v) {
+  return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(ulong2 v) {
+  return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(ulong2 v) {
+  return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(ulong2 v) {
+  return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(ulong2 v) {
+  return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(ulong2 v) {
+  return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(ulong2 v) {
+  return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(ulong2 v) {
+  return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(ulong2 v) {
+  return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(ulong2 v) {
+  return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(ulong2 v) {
+  return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(ulong2 v) {
+  return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(ulong2 v) {
+  return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(ulong2 v) {
+  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(ulong2 v) {
+  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(ulong2 v) {
+  return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(ulong2 v) {
+  return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(int2 v) {
+  return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(int2 v) {
+  return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(int2 v) {
+  return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(int2 v) {
+  return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(int2 v) {
+  return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(int2 v) {
+  return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(int2 v) {
+  return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(int2 v) {
+  return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(int2 v) {
+  return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(int2 v) {
+  return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(int2 v) {
+  return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(int2 v) {
+  return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(int2 v) {
+  return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(int2 v) {
+  return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(int2 v) {
+  return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(int2 v) {
+  return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(int2 v) {
+  return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(int2 v) {
+  return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(int2 v) {
+  return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(int2 v) {
+  return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(int2 v) {
+  return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(int2 v) {
+  return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(int2 v) {
+  return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(int2 v) {
+  return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(int2 v) {
+  return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(int2 v) {
+  return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(int2 v) {
+  return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(int2 v) {
+  return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(int2 v) {
+  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(int2 v) {
+  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(int2 v) {
+  return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(int2 v) {
+  return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(uint2 v) {
+  return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(uint2 v) {
+  return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(uint2 v) {
+  return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(uint2 v) {
+  return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(uint2 v) {
+  return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(uint2 v) {
+  return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(uint2 v) {
+  return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(uint2 v) {
+  return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(uint2 v) {
+  return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(uint2 v) {
+  return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(uint2 v) {
+  return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(uint2 v) {
+  return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(uint2 v) {
+  return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(uint2 v) {
+  return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(uint2 v) {
+  return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(uint2 v) {
+  return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(uint2 v) {
+  return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(uint2 v) {
+  return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(uint2 v) {
+  return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(uint2 v) {
+  return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(uint2 v) {
+  return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(uint2 v) {
+  return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(uint2 v) {
+  return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(uint2 v) {
+  return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(uint2 v) {
+  return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(uint2 v) {
+  return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(uint2 v) {
+  return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(uint2 v) {
+  return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(uint2 v) {
+  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(uint2 v) {
+  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(uint2 v) {
+  return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(uint2 v) {
+  return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(short2 v) {
+  return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(short2 v) {
+  return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(short2 v) {
+  return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(short2 v) {
+  return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(short2 v) {
+  return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(short2 v) {
+  return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(short2 v) {
+  return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(short2 v) {
+  return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(short2 v) {
+  return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(short2 v) {
+  return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(short2 v) {
+  return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(short2 v) {
+  return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(short2 v) {
+  return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(short2 v) {
+  return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(short2 v) {
+  return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(short2 v) {
+  return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(short2 v) {
+  return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(short2 v) {
+  return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(short2 v) {
+  return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(short2 v) {
+  return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(short2 v) {
+  return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(short2 v) {
+  return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(short2 v) {
+  return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(short2 v) {
+  return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(short2 v) {
+  return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(short2 v) {
+  return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(short2 v) {
+  return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(short2 v) {
+  return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(short2 v) {
+  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(short2 v) {
+  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(short2 v) {
+  return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(short2 v) {
+  return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(ushort2 v) {
+  return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(ushort2 v) {
+  return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(ushort2 v) {
+  return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(ushort2 v) {
+  return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(ushort2 v) {
+  return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(ushort2 v) {
+  return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(ushort2 v) {
+  return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(ushort2 v) {
+  return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(ushort2 v) {
+  return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(ushort2 v) {
+  return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(ushort2 v) {
+  return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(ushort2 v) {
+  return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(ushort2 v) {
+  return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(ushort2 v) {
+  return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(ushort2 v) {
+  return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(ushort2 v) {
+  return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(ushort2 v) {
+  return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(ushort2 v) {
+  return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(ushort2 v) {
+  return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(ushort2 v) {
+  return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(ushort2 v) {
+  return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(ushort2 v) {
+  return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(ushort2 v) {
+  return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(ushort2 v) {
+  return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(ushort2 v) {
+  return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(ushort2 v) {
+  return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(ushort2 v) {
+  return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(ushort2 v) {
+  return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(ushort2 v) {
+  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(ushort2 v) {
+  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(ushort2 v) {
+  return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(ushort2 v) {
+  return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(char2 v) {
+  return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(char2 v) {
+  return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(char2 v) {
+  return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(char2 v) {
+  return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(char2 v) {
+  return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(char2 v) {
+  return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(char2 v) {
+  return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(char2 v) {
+  return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(char2 v) {
+  return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(char2 v) {
+  return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(char2 v) {
+  return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(char2 v) {
+  return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(char2 v) {
+  return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(char2 v) {
+  return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(char2 v) {
+  return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(char2 v) {
+  return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(char2 v) {
+  return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(char2 v) {
+  return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(char2 v) {
+  return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(char2 v) {
+  return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(char2 v) {
+  return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(char2 v) {
+  return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(char2 v) {
+  return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(char2 v) {
+  return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(char2 v) {
+  return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(char2 v) {
+  return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(char2 v) {
+  return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(char2 v) {
+  return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(char2 v) {
+  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(char2 v) {
+  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(char2 v) {
+  return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(char2 v) {
+  return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(uchar2 v) {
+  return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(uchar2 v) {
+  return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(uchar2 v) {
+  return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(uchar2 v) {
+  return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(uchar2 v) {
+  return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(uchar2 v) {
+  return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(uchar2 v) {
+  return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(uchar2 v) {
+  return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(uchar2 v) {
+  return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(uchar2 v) {
+  return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(uchar2 v) {
+  return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(uchar2 v) {
+  return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(uchar2 v) {
+  return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(uchar2 v) {
+  return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(uchar2 v) {
+  return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(uchar2 v) {
+  return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(uchar2 v) {
+  return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(uchar2 v) {
+  return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(uchar2 v) {
+  return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(uchar2 v) {
+  return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(uchar2 v) {
+  return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(uchar2 v) {
+  return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(uchar2 v) {
+  return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(uchar2 v) {
+  return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(uchar2 v) {
+  return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(uchar2 v) {
+  return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(uchar2 v) {
+  return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(uchar2 v) {
+  return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(uchar2 v) {
+  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(uchar2 v) {
+  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(uchar2 v) {
+  return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(uchar2 v) {
+  return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rte(float2 v) {
+  return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtz(float2 v) {
+  return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtp(float2 v) {
+  return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE long2 convert_long2_sat_rtn(float2 v) {
+  return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(float2 v) {
+  return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(float2 v) {
+  return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(float2 v) {
+  return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(float2 v) {
+  return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rte(float2 v) {
+  return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtz(float2 v) {
+  return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtp(float2 v) {
+  return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE int2 convert_int2_sat_rtn(float2 v) {
+  return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(float2 v) {
+  return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(float2 v) {
+  return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(float2 v) {
+  return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(float2 v) {
+  return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rte(float2 v) {
+  return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtz(float2 v) {
+  return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtp(float2 v) {
+  return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE short2 convert_short2_sat_rtn(float2 v) {
+  return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(float2 v) {
+  return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(float2 v) {
+  return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(float2 v) {
+  return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(float2 v) {
+  return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rte(float2 v) {
+  return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtz(float2 v) {
+  return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtp(float2 v) {
+  return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE char2 convert_char2_sat_rtn(float2 v) {
+  return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(float2 v) {
+  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(float2 v) {
+  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(float2 v) {
+  return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
+}
+
+INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(float2 v) {
+  return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
+}
+
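+/* Three-element vector variants follow the same per-component pattern. */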
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(long3 v) {
+  return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(long3 v) {
+  return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(long3 v) {
+  return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(long3 v) {
+  return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(long3 v) {
+  return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(long3 v) {
+  return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(long3 v) {
+  return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(long3 v) {
+  return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(long3 v) {
+  return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(long3 v) {
+  return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(long3 v) {
+  return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(long3 v) {
+  return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(long3 v) {
+  return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(long3 v) {
+  return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(long3 v) {
+  return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(long3 v) {
+  return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(long3 v) {
+  return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(long3 v) {
+  return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(long3 v) {
+  return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(long3 v) {
+  return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(long3 v) {
+  return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(long3 v) {
+  return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(long3 v) {
+  return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(long3 v) {
+  return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(long3 v) {
+  return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(long3 v) {
+  return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(long3 v) {
+  return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(long3 v) {
+  return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(long3 v) {
+  return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(long3 v) {
+  return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(long3 v) {
+  return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(long3 v) {
+  return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(ulong3 v) {
+  return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(ulong3 v) {
+  return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(ulong3 v) {
+  return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(ulong3 v) {
+  return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(ulong3 v) {
+  return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(ulong3 v) {
+  return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(ulong3 v) {
+  return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(ulong3 v) {
+  return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(ulong3 v) {
+  return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(ulong3 v) {
+  return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(ulong3 v) {
+  return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(ulong3 v) {
+  return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(ulong3 v) {
+  return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(ulong3 v) {
+  return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(ulong3 v) {
+  return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(ulong3 v) {
+  return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(ulong3 v) {
+  return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(ulong3 v) {
+  return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(ulong3 v) {
+  return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(ulong3 v) {
+  return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(ulong3 v) {
+  return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(ulong3 v) {
+  return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(ulong3 v) {
+  return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(ulong3 v) {
+  return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(ulong3 v) {
+  return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(ulong3 v) {
+  return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(ulong3 v) {
+  return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(ulong3 v) {
+  return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(ulong3 v) {
+  return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(ulong3 v) {
+  return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(ulong3 v) {
+  return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(ulong3 v) {
+  return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(int3 v) {
+  return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(int3 v) {
+  return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(int3 v) {
+  return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(int3 v) {
+  return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(int3 v) {
+  return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(int3 v) {
+  return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(int3 v) {
+  return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(int3 v) {
+  return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(int3 v) {
+  return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(int3 v) {
+  return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(int3 v) {
+  return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(int3 v) {
+  return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(int3 v) {
+  return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(int3 v) {
+  return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(int3 v) {
+  return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(int3 v) {
+  return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(int3 v) {
+  return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(int3 v) {
+  return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(int3 v) {
+  return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(int3 v) {
+  return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(int3 v) {
+  return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(int3 v) {
+  return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(int3 v) {
+  return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(int3 v) {
+  return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(int3 v) {
+  return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(int3 v) {
+  return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(int3 v) {
+  return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(int3 v) {
+  return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(int3 v) {
+  return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(int3 v) {
+  return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(int3 v) {
+  return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(int3 v) {
+  return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(uint3 v) {
+  return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(uint3 v) {
+  return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(uint3 v) {
+  return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(uint3 v) {
+  return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(uint3 v) {
+  return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(uint3 v) {
+  return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(uint3 v) {
+  return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(uint3 v) {
+  return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(uint3 v) {
+  return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(uint3 v) {
+  return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(uint3 v) {
+  return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(uint3 v) {
+  return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(uint3 v) {
+  return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(uint3 v) {
+  return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(uint3 v) {
+  return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(uint3 v) {
+  return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(uint3 v) {
+  return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(uint3 v) {
+  return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(uint3 v) {
+  return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(uint3 v) {
+  return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(uint3 v) {
+  return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(uint3 v) {
+  return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(uint3 v) {
+  return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(uint3 v) {
+  return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(uint3 v) {
+  return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(uint3 v) {
+  return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(uint3 v) {
+  return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(uint3 v) {
+  return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(uint3 v) {
+  return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(uint3 v) {
+  return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(uint3 v) {
+  return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(uint3 v) {
+  return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(short3 v) {
+  return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(short3 v) {
+  return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(short3 v) {
+  return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(short3 v) {
+  return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(short3 v) {
+  return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(short3 v) {
+  return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(short3 v) {
+  return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(short3 v) {
+  return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(short3 v) {
+  return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(short3 v) {
+  return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(short3 v) {
+  return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(short3 v) {
+  return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(short3 v) {
+  return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(short3 v) {
+  return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(short3 v) {
+  return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(short3 v) {
+  return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(short3 v) {
+  return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(short3 v) {
+  return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(short3 v) {
+  return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(short3 v) {
+  return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(short3 v) {
+  return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(short3 v) {
+  return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(short3 v) {
+  return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(short3 v) {
+  return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(short3 v) {
+  return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(short3 v) {
+  return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(short3 v) {
+  return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(short3 v) {
+  return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(short3 v) {
+  return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(short3 v) {
+  return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(short3 v) {
+  return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(short3 v) {
+  return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(ushort3 v) {
+  return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(ushort3 v) {
+  return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(ushort3 v) {
+  return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(ushort3 v) {
+  return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(ushort3 v) {
+  return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(ushort3 v) {
+  return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(ushort3 v) {
+  return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(ushort3 v) {
+  return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(ushort3 v) {
+  return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(ushort3 v) {
+  return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(ushort3 v) {
+  return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(ushort3 v) {
+  return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(ushort3 v) {
+  return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(ushort3 v) {
+  return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(ushort3 v) {
+  return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(ushort3 v) {
+  return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(ushort3 v) {
+  return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(ushort3 v) {
+  return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(ushort3 v) {
+  return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(ushort3 v) {
+  return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(ushort3 v) {
+  return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(ushort3 v) {
+  return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(ushort3 v) {
+  return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(ushort3 v) {
+  return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(ushort3 v) {
+  return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(ushort3 v) {
+  return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(ushort3 v) {
+  return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(ushort3 v) {
+  return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(ushort3 v) {
+  return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(ushort3 v) {
+  return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(ushort3 v) {
+  return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(ushort3 v) {
+  return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(char3 v) {
+  return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(char3 v) {
+  return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(char3 v) {
+  return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(char3 v) {
+  return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(char3 v) {
+  return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(char3 v) {
+  return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(char3 v) {
+  return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(char3 v) {
+  return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(char3 v) {
+  return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(char3 v) {
+  return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(char3 v) {
+  return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(char3 v) {
+  return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(char3 v) {
+  return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(char3 v) {
+  return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(char3 v) {
+  return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(char3 v) {
+  return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(char3 v) {
+  return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(char3 v) {
+  return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(char3 v) {
+  return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(char3 v) {
+  return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(char3 v) {
+  return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(char3 v) {
+  return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(char3 v) {
+  return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(char3 v) {
+  return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(char3 v) {
+  return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(char3 v) {
+  return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(char3 v) {
+  return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(char3 v) {
+  return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(char3 v) {
+  return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(char3 v) {
+  return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(char3 v) {
+  return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(char3 v) {
+  return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(uchar3 v) {
+  return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(uchar3 v) {
+  return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(uchar3 v) {
+  return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(uchar3 v) {
+  return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(uchar3 v) {
+  return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(uchar3 v) {
+  return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(uchar3 v) {
+  return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(uchar3 v) {
+  return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(uchar3 v) {
+  return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(uchar3 v) {
+  return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(uchar3 v) {
+  return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(uchar3 v) {
+  return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(uchar3 v) {
+  return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(uchar3 v) {
+  return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(uchar3 v) {
+  return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(uchar3 v) {
+  return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(uchar3 v) {
+  return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(uchar3 v) {
+  return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(uchar3 v) {
+  return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(uchar3 v) {
+  return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(uchar3 v) {
+  return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(uchar3 v) {
+  return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(uchar3 v) {
+  return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(uchar3 v) {
+  return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(uchar3 v) {
+  return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(uchar3 v) {
+  return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(uchar3 v) {
+  return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(uchar3 v) {
+  return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(uchar3 v) {
+  return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(uchar3 v) {
+  return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(uchar3 v) {
+  return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(uchar3 v) {
+  return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rte(float3 v) {
+  return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtz(float3 v) {
+  return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtp(float3 v) {
+  return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE long3 convert_long3_sat_rtn(float3 v) {
+  return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(float3 v) {
+  return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(float3 v) {
+  return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(float3 v) {
+  return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(float3 v) {
+  return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rte(float3 v) {
+  return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtz(float3 v) {
+  return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtp(float3 v) {
+  return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE int3 convert_int3_sat_rtn(float3 v) {
+  return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(float3 v) {
+  return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(float3 v) {
+  return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(float3 v) {
+  return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(float3 v) {
+  return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rte(float3 v) {
+  return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtz(float3 v) {
+  return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtp(float3 v) {
+  return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE short3 convert_short3_sat_rtn(float3 v) {
+  return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(float3 v) {
+  return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(float3 v) {
+  return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(float3 v) {
+  return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(float3 v) {
+  return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rte(float3 v) {
+  return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtz(float3 v) {
+  return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtp(float3 v) {
+  return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE char3 convert_char3_sat_rtn(float3 v) {
+  return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(float3 v) {
+  return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(float3 v) {
+  return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(float3 v) {
+  return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
+}
+
+INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(float3 v) {
+  return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(long4 v) {
+  return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(long4 v) {
+  return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(long4 v) {
+  return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(long4 v) {
+  return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(long4 v) {
+  return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(long4 v) {
+  return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(long4 v) {
+  return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(long4 v) {
+  return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(long4 v) {
+  return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(long4 v) {
+  return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(long4 v) {
+  return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(long4 v) {
+  return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(long4 v) {
+  return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(long4 v) {
+  return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(long4 v) {
+  return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(long4 v) {
+  return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(long4 v) {
+  return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(long4 v) {
+  return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(long4 v) {
+  return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(long4 v) {
+  return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(long4 v) {
+  return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(long4 v) {
+  return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(long4 v) {
+  return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(long4 v) {
+  return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(long4 v) {
+  return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(long4 v) {
+  return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(long4 v) {
+  return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(long4 v) {
+  return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(long4 v) {
+  return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(long4 v) {
+  return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(long4 v) {
+  return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(long4 v) {
+  return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(ulong4 v) {
+  return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(ulong4 v) {
+  return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(ulong4 v) {
+  return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(ulong4 v) {
+  return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(ulong4 v) {
+  return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(ulong4 v) {
+  return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(ulong4 v) {
+  return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(ulong4 v) {
+  return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(ulong4 v) {
+  return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(ulong4 v) {
+  return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(ulong4 v) {
+  return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(ulong4 v) {
+  return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(ulong4 v) {
+  return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(ulong4 v) {
+  return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(ulong4 v) {
+  return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(ulong4 v) {
+  return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(ulong4 v) {
+  return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(ulong4 v) {
+  return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(ulong4 v) {
+  return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(ulong4 v) {
+  return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(ulong4 v) {
+  return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(ulong4 v) {
+  return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(ulong4 v) {
+  return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(ulong4 v) {
+  return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(ulong4 v) {
+  return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(ulong4 v) {
+  return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(ulong4 v) {
+  return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(ulong4 v) {
+  return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(ulong4 v) {
+  return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(ulong4 v) {
+  return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(ulong4 v) {
+  return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(ulong4 v) {
+  return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(int4 v) {
+  return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(int4 v) {
+  return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(int4 v) {
+  return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(int4 v) {
+  return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(int4 v) {
+  return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(int4 v) {
+  return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(int4 v) {
+  return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(int4 v) {
+  return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(int4 v) {
+  return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(int4 v) {
+  return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(int4 v) {
+  return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(int4 v) {
+  return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(int4 v) {
+  return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(int4 v) {
+  return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(int4 v) {
+  return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(int4 v) {
+  return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(int4 v) {
+  return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(int4 v) {
+  return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(int4 v) {
+  return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(int4 v) {
+  return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(int4 v) {
+  return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(int4 v) {
+  return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(int4 v) {
+  return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(int4 v) {
+  return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(int4 v) {
+  return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(int4 v) {
+  return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(int4 v) {
+  return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(int4 v) {
+  return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(int4 v) {
+  return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(int4 v) {
+  return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(int4 v) {
+  return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(int4 v) {
+  return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(uint4 v) {
+  return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(uint4 v) {
+  return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(uint4 v) {
+  return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(uint4 v) {
+  return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(uint4 v) {
+  return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(uint4 v) {
+  return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(uint4 v) {
+  return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(uint4 v) {
+  return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(uint4 v) {
+  return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(uint4 v) {
+  return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(uint4 v) {
+  return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(uint4 v) {
+  return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(uint4 v) {
+  return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(uint4 v) {
+  return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(uint4 v) {
+  return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(uint4 v) {
+  return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(uint4 v) {
+  return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(uint4 v) {
+  return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(uint4 v) {
+  return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(uint4 v) {
+  return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(uint4 v) {
+  return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(uint4 v) {
+  return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(uint4 v) {
+  return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(uint4 v) {
+  return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(uint4 v) {
+  return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(uint4 v) {
+  return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(uint4 v) {
+  return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(uint4 v) {
+  return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(uint4 v) {
+  return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(uint4 v) {
+  return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(uint4 v) {
+  return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(uint4 v) {
+  return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(short4 v) {
+  return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(short4 v) {
+  return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(short4 v) {
+  return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(short4 v) {
+  return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(short4 v) {
+  return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(short4 v) {
+  return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(short4 v) {
+  return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(short4 v) {
+  return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(short4 v) {
+  return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(short4 v) {
+  return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(short4 v) {
+  return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(short4 v) {
+  return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(short4 v) {
+  return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(short4 v) {
+  return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(short4 v) {
+  return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(short4 v) {
+  return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(short4 v) {
+  return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(short4 v) {
+  return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(short4 v) {
+  return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(short4 v) {
+  return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(short4 v) {
+  return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(short4 v) {
+  return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(short4 v) {
+  return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(short4 v) {
+  return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(short4 v) {
+  return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(short4 v) {
+  return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(short4 v) {
+  return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(short4 v) {
+  return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(short4 v) {
+  return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(short4 v) {
+  return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(short4 v) {
+  return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(short4 v) {
+  return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(ushort4 v) {
+  return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(ushort4 v) {
+  return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(ushort4 v) {
+  return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(ushort4 v) {
+  return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(ushort4 v) {
+  return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(ushort4 v) {
+  return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(ushort4 v) {
+  return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(ushort4 v) {
+  return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(ushort4 v) {
+  return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(ushort4 v) {
+  return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(ushort4 v) {
+  return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(ushort4 v) {
+  return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(ushort4 v) {
+  return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(ushort4 v) {
+  return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(ushort4 v) {
+  return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(ushort4 v) {
+  return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(ushort4 v) {
+  return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(ushort4 v) {
+  return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(ushort4 v) {
+  return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(ushort4 v) {
+  return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(ushort4 v) {
+  return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(ushort4 v) {
+  return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(ushort4 v) {
+  return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(ushort4 v) {
+  return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(ushort4 v) {
+  return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(ushort4 v) {
+  return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(ushort4 v) {
+  return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(ushort4 v) {
+  return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(ushort4 v) {
+  return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(ushort4 v) {
+  return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(ushort4 v) {
+  return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(ushort4 v) {
+  return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(char4 v) {
+  return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(char4 v) {
+  return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(char4 v) {
+  return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(char4 v) {
+  return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(char4 v) {
+  return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(char4 v) {
+  return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(char4 v) {
+  return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(char4 v) {
+  return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(char4 v) {
+  return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(char4 v) {
+  return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(char4 v) {
+  return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(char4 v) {
+  return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(char4 v) {
+  return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(char4 v) {
+  return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(char4 v) {
+  return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(char4 v) {
+  return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(char4 v) {
+  return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(char4 v) {
+  return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(char4 v) {
+  return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(char4 v) {
+  return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(char4 v) {
+  return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(char4 v) {
+  return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(char4 v) {
+  return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(char4 v) {
+  return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(char4 v) {
+  return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(char4 v) {
+  return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(char4 v) {
+  return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(char4 v) {
+  return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(char4 v) {
+  return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(char4 v) {
+  return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(char4 v) {
+  return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(char4 v) {
+  return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(uchar4 v) {
+  return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(uchar4 v) {
+  return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(uchar4 v) {
+  return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(uchar4 v) {
+  return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(uchar4 v) {
+  return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(uchar4 v) {
+  return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(uchar4 v) {
+  return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(uchar4 v) {
+  return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(uchar4 v) {
+  return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(uchar4 v) {
+  return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(uchar4 v) {
+  return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(uchar4 v) {
+  return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(uchar4 v) {
+  return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(uchar4 v) {
+  return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(uchar4 v) {
+  return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(uchar4 v) {
+  return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(uchar4 v) {
+  return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(uchar4 v) {
+  return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(uchar4 v) {
+  return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(uchar4 v) {
+  return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(uchar4 v) {
+  return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(uchar4 v) {
+  return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(uchar4 v) {
+  return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(uchar4 v) {
+  return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(uchar4 v) {
+  return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(uchar4 v) {
+  return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(uchar4 v) {
+  return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(uchar4 v) {
+  return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(uchar4 v) {
+  return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(uchar4 v) {
+  return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(uchar4 v) {
+  return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(uchar4 v) {
+  return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rte(float4 v) {
+  return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtz(float4 v) {
+  return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtp(float4 v) {
+  return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE long4 convert_long4_sat_rtn(float4 v) {
+  return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(float4 v) {
+  return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(float4 v) {
+  return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(float4 v) {
+  return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(float4 v) {
+  return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rte(float4 v) {
+  return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtz(float4 v) {
+  return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtp(float4 v) {
+  return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE int4 convert_int4_sat_rtn(float4 v) {
+  return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(float4 v) {
+  return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(float4 v) {
+  return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(float4 v) {
+  return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(float4 v) {
+  return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rte(float4 v) {
+  return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtz(float4 v) {
+  return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtp(float4 v) {
+  return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE short4 convert_short4_sat_rtn(float4 v) {
+  return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(float4 v) {
+  return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(float4 v) {
+  return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(float4 v) {
+  return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(float4 v) {
+  return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rte(float4 v) {
+  return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtz(float4 v) {
+  return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtp(float4 v) {
+  return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE char4 convert_char4_sat_rtn(float4 v) {
+  return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(float4 v) {
+  return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(float4 v) {
+  return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(float4 v) {
+  return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
+}
+
+INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(float4 v) {
+  return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(long8 v) {
+  return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(long8 v) {
+  return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(long8 v) {
+  return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(long8 v) {
+  return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(long8 v) {
+  return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(long8 v) {
+  return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(long8 v) {
+  return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(long8 v) {
+  return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(long8 v) {
+  return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(long8 v) {
+  return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(long8 v) {
+  return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(long8 v) {
+  return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(long8 v) {
+  return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(long8 v) {
+  return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(long8 v) {
+  return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(long8 v) {
+  return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(long8 v) {
+  return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(long8 v) {
+  return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(long8 v) {
+  return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(long8 v) {
+  return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(long8 v) {
+  return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(long8 v) {
+  return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(long8 v) {
+  return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(long8 v) {
+  return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(long8 v) {
+  return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(long8 v) {
+  return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(long8 v) {
+  return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(long8 v) {
+  return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(long8 v) {
+  return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(long8 v) {
+  return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(long8 v) {
+  return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(long8 v) {
+  return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(ulong8 v) {
+  return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(ulong8 v) {
+  return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(ulong8 v) {
+  return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(ulong8 v) {
+  return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(ulong8 v) {
+  return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(ulong8 v) {
+  return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(ulong8 v) {
+  return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(ulong8 v) {
+  return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(ulong8 v) {
+  return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(ulong8 v) {
+  return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(ulong8 v) {
+  return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(ulong8 v) {
+  return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(ulong8 v) {
+  return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(ulong8 v) {
+  return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(ulong8 v) {
+  return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(ulong8 v) {
+  return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(ulong8 v) {
+  return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(ulong8 v) {
+  return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(ulong8 v) {
+  return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(ulong8 v) {
+  return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(ulong8 v) {
+  return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(ulong8 v) {
+  return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(ulong8 v) {
+  return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(ulong8 v) {
+  return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(ulong8 v) {
+  return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(ulong8 v) {
+  return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(ulong8 v) {
+  return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(ulong8 v) {
+  return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(ulong8 v) {
+  return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(ulong8 v) {
+  return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(ulong8 v) {
+  return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(ulong8 v) {
+  return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(int8 v) {
+  return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(int8 v) {
+  return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(int8 v) {
+  return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(int8 v) {
+  return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(int8 v) {
+  return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(int8 v) {
+  return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(int8 v) {
+  return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(int8 v) {
+  return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(int8 v) {
+  return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(int8 v) {
+  return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(int8 v) {
+  return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(int8 v) {
+  return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(int8 v) {
+  return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(int8 v) {
+  return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(int8 v) {
+  return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(int8 v) {
+  return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(int8 v) {
+  return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(int8 v) {
+  return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(int8 v) {
+  return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(int8 v) {
+  return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(int8 v) {
+  return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(int8 v) {
+  return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(int8 v) {
+  return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(int8 v) {
+  return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(int8 v) {
+  return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(int8 v) {
+  return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(int8 v) {
+  return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(int8 v) {
+  return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(int8 v) {
+  return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(int8 v) {
+  return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(int8 v) {
+  return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(int8 v) {
+  return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(uint8 v) {
+  return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(uint8 v) {
+  return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(uint8 v) {
+  return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(uint8 v) {
+  return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(uint8 v) {
+  return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(uint8 v) {
+  return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(uint8 v) {
+  return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(uint8 v) {
+  return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(uint8 v) {
+  return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(uint8 v) {
+  return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(uint8 v) {
+  return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(uint8 v) {
+  return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(uint8 v) {
+  return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(uint8 v) {
+  return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(uint8 v) {
+  return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(uint8 v) {
+  return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(uint8 v) {
+  return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(uint8 v) {
+  return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(uint8 v) {
+  return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(uint8 v) {
+  return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(uint8 v) {
+  return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(uint8 v) {
+  return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(uint8 v) {
+  return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(uint8 v) {
+  return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(uint8 v) {
+  return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(uint8 v) {
+  return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(uint8 v) {
+  return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(uint8 v) {
+  return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(uint8 v) {
+  return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(uint8 v) {
+  return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(uint8 v) {
+  return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(uint8 v) {
+  return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(short8 v) {
+  return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(short8 v) {
+  return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(short8 v) {
+  return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(short8 v) {
+  return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(short8 v) {
+  return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(short8 v) {
+  return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(short8 v) {
+  return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(short8 v) {
+  return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(short8 v) {
+  return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(short8 v) {
+  return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(short8 v) {
+  return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(short8 v) {
+  return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(short8 v) {
+  return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(short8 v) {
+  return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(short8 v) {
+  return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(short8 v) {
+  return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(short8 v) {
+  return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(short8 v) {
+  return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(short8 v) {
+  return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(short8 v) {
+  return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(short8 v) {
+  return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(short8 v) {
+  return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(short8 v) {
+  return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(short8 v) {
+  return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(short8 v) {
+  return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(short8 v) {
+  return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(short8 v) {
+  return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(short8 v) {
+  return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(short8 v) {
+  return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(short8 v) {
+  return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(short8 v) {
+  return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(short8 v) {
+  return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(ushort8 v) {
+  return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(ushort8 v) {
+  return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(ushort8 v) {
+  return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(ushort8 v) {
+  return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(ushort8 v) {
+  return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(ushort8 v) {
+  return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(ushort8 v) {
+  return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(ushort8 v) {
+  return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(ushort8 v) {
+  return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(ushort8 v) {
+  return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(ushort8 v) {
+  return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(ushort8 v) {
+  return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(ushort8 v) {
+  return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(ushort8 v) {
+  return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(ushort8 v) {
+  return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(ushort8 v) {
+  return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(ushort8 v) {
+  return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(ushort8 v) {
+  return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(ushort8 v) {
+  return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(ushort8 v) {
+  return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(ushort8 v) {
+  return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(ushort8 v) {
+  return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(ushort8 v) {
+  return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(ushort8 v) {
+  return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(ushort8 v) {
+  return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(ushort8 v) {
+  return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(ushort8 v) {
+  return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(ushort8 v) {
+  return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(ushort8 v) {
+  return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(ushort8 v) {
+  return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(ushort8 v) {
+  return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(ushort8 v) {
+  return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(char8 v) {
+  return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(char8 v) {
+  return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(char8 v) {
+  return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(char8 v) {
+  return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(char8 v) {
+  return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(char8 v) {
+  return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(char8 v) {
+  return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(char8 v) {
+  return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(char8 v) {
+  return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(char8 v) {
+  return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(char8 v) {
+  return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(char8 v) {
+  return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(char8 v) {
+  return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(char8 v) {
+  return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(char8 v) {
+  return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(char8 v) {
+  return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(char8 v) {
+  return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(char8 v) {
+  return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(char8 v) {
+  return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(char8 v) {
+  return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(char8 v) {
+  return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(char8 v) {
+  return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(char8 v) {
+  return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(char8 v) {
+  return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(char8 v) {
+  return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(char8 v) {
+  return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(char8 v) {
+  return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(char8 v) {
+  return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(char8 v) {
+  return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(char8 v) {
+  return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(char8 v) {
+  return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(char8 v) {
+  return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(uchar8 v) {
+  return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(uchar8 v) {
+  return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(uchar8 v) {
+  return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(uchar8 v) {
+  return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(uchar8 v) {
+  return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(uchar8 v) {
+  return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(uchar8 v) {
+  return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(uchar8 v) {
+  return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(uchar8 v) {
+  return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(uchar8 v) {
+  return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(uchar8 v) {
+  return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(uchar8 v) {
+  return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(uchar8 v) {
+  return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(uchar8 v) {
+  return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(uchar8 v) {
+  return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(uchar8 v) {
+  return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(uchar8 v) {
+  return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(uchar8 v) {
+  return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(uchar8 v) {
+  return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(uchar8 v) {
+  return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(uchar8 v) {
+  return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(uchar8 v) {
+  return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(uchar8 v) {
+  return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(uchar8 v) {
+  return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(uchar8 v) {
+  return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(uchar8 v) {
+  return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(uchar8 v) {
+  return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(uchar8 v) {
+  return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(uchar8 v) {
+  return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(uchar8 v) {
+  return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(uchar8 v) {
+  return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(uchar8 v) {
+  return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rte(float8 v) {
+  return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtz(float8 v) {
+  return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtp(float8 v) {
+  return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE long8 convert_long8_sat_rtn(float8 v) {
+  return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(float8 v) {
+  return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(float8 v) {
+  return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(float8 v) {
+  return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(float8 v) {
+  return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rte(float8 v) {
+  return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtz(float8 v) {
+  return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtp(float8 v) {
+  return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE int8 convert_int8_sat_rtn(float8 v) {
+  return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(float8 v) {
+  return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(float8 v) {
+  return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(float8 v) {
+  return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(float8 v) {
+  return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rte(float8 v) {
+  return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtz(float8 v) {
+  return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtp(float8 v) {
+  return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE short8 convert_short8_sat_rtn(float8 v) {
+  return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(float8 v) {
+  return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(float8 v) {
+  return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(float8 v) {
+  return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(float8 v) {
+  return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rte(float8 v) {
+  return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtz(float8 v) {
+  return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtp(float8 v) {
+  return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE char8 convert_char8_sat_rtn(float8 v) {
+  return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(float8 v) {
+  return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(float8 v) {
+  return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(float8 v) {
+  return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
+}
+
+INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(float8 v) {
+  return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(long16 v) {
+  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(long16 v) {
+  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(long16 v) {
+  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(long16 v) {
+  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(long16 v) {
+  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(long16 v) {
+  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(long16 v) {
+  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(long16 v) {
+  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(long16 v) {
+  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(long16 v) {
+  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(long16 v) {
+  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(long16 v) {
+  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(long16 v) {
+  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(long16 v) {
+  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(long16 v) {
+  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(long16 v) {
+  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(long16 v) {
+  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(long16 v) {
+  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(long16 v) {
+  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(long16 v) {
+  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(long16 v) {
+  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(long16 v) {
+  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(long16 v) {
+  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(long16 v) {
+  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(long16 v) {
+  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(long16 v) {
+  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(long16 v) {
+  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(long16 v) {
+  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(long16 v) {
+  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(long16 v) {
+  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(long16 v) {
+  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(long16 v) {
+  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(ulong16 v) {
+  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(ulong16 v) {
+  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(ulong16 v) {
+  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(ulong16 v) {
+  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(ulong16 v) {
+  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(ulong16 v) {
+  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(ulong16 v) {
+  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(ulong16 v) {
+  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(ulong16 v) {
+  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(ulong16 v) {
+  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(ulong16 v) {
+  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(ulong16 v) {
+  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(ulong16 v) {
+  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(ulong16 v) {
+  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(ulong16 v) {
+  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(ulong16 v) {
+  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(ulong16 v) {
+  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(ulong16 v) {
+  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(ulong16 v) {
+  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(ulong16 v) {
+  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(ulong16 v) {
+  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(ulong16 v) {
+  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(ulong16 v) {
+  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(ulong16 v) {
+  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(ulong16 v) {
+  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(ulong16 v) {
+  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(ulong16 v) {
+  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(ulong16 v) {
+  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(ulong16 v) {
+  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(ulong16 v) {
+  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(ulong16 v) {
+  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(ulong16 v) {
+  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(int16 v) {
+  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(int16 v) {
+  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(int16 v) {
+  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(int16 v) {
+  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(int16 v) {
+  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(int16 v) {
+  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(int16 v) {
+  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(int16 v) {
+  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(int16 v) {
+  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(int16 v) {
+  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(int16 v) {
+  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(int16 v) {
+  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(int16 v) {
+  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(int16 v) {
+  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(int16 v) {
+  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(int16 v) {
+  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(int16 v) {
+  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(int16 v) {
+  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(int16 v) {
+  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(int16 v) {
+  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(int16 v) {
+  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(int16 v) {
+  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(int16 v) {
+  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(int16 v) {
+  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(int16 v) {
+  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(int16 v) {
+  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(int16 v) {
+  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(int16 v) {
+  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(int16 v) {
+  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(int16 v) {
+  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(int16 v) {
+  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(int16 v) {
+  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
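+/* The convert_<dstN>_sat_<mode>() overloads above apply the corresponding
+ * scalar saturating conversion element-wise; for integer sources the
+ * rounding-mode suffix (_rte, _rtz, _rtp, _rtn) does not change the result,
+ * only saturation to the destination type's range does.  A minimal usage
+ * sketch (hypothetical kernel and buffer names, for illustration only):
+ *
+ *   __kernel void clamp16(__global const int16 *in, __global uchar16 *out) {
+ *     size_t i = get_global_id(0);
+ *     out[i] = convert_uchar16_sat_rte(in[i]);  // each lane clamped to 0..255
+ *   }
+ */
+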
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(uint16 v) {
+  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(uint16 v) {
+  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(uint16 v) {
+  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(uint16 v) {
+  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(uint16 v) {
+  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(uint16 v) {
+  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(uint16 v) {
+  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(uint16 v) {
+  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(uint16 v) {
+  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(uint16 v) {
+  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(uint16 v) {
+  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(uint16 v) {
+  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(uint16 v) {
+  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(uint16 v) {
+  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(uint16 v) {
+  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(uint16 v) {
+  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(uint16 v) {
+  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(uint16 v) {
+  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(uint16 v) {
+  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(uint16 v) {
+  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(uint16 v) {
+  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(uint16 v) {
+  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(uint16 v) {
+  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(uint16 v) {
+  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(uint16 v) {
+  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(uint16 v) {
+  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(uint16 v) {
+  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(uint16 v) {
+  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(uint16 v) {
+  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(uint16 v) {
+  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(uint16 v) {
+  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(uint16 v) {
+  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(short16 v) {
+  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(short16 v) {
+  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(short16 v) {
+  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(short16 v) {
+  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(short16 v) {
+  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(short16 v) {
+  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(short16 v) {
+  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(short16 v) {
+  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(short16 v) {
+  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(short16 v) {
+  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(short16 v) {
+  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(short16 v) {
+  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(short16 v) {
+  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(short16 v) {
+  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(short16 v) {
+  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(short16 v) {
+  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(short16 v) {
+  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(short16 v) {
+  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(short16 v) {
+  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(short16 v) {
+  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(short16 v) {
+  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(short16 v) {
+  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(short16 v) {
+  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(short16 v) {
+  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(short16 v) {
+  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(short16 v) {
+  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(short16 v) {
+  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(short16 v) {
+  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(short16 v) {
+  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(short16 v) {
+  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(short16 v) {
+  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(short16 v) {
+  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(ushort16 v) {
+  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(ushort16 v) {
+  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(ushort16 v) {
+  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(ushort16 v) {
+  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(ushort16 v) {
+  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(ushort16 v) {
+  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(ushort16 v) {
+  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(ushort16 v) {
+  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(ushort16 v) {
+  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(ushort16 v) {
+  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(ushort16 v) {
+  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(ushort16 v) {
+  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(ushort16 v) {
+  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(ushort16 v) {
+  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(ushort16 v) {
+  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(ushort16 v) {
+  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(ushort16 v) {
+  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(ushort16 v) {
+  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(ushort16 v) {
+  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(ushort16 v) {
+  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(ushort16 v) {
+  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(ushort16 v) {
+  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(ushort16 v) {
+  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(ushort16 v) {
+  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(ushort16 v) {
+  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(ushort16 v) {
+  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(ushort16 v) {
+  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(ushort16 v) {
+  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(ushort16 v) {
+  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(ushort16 v) {
+  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(ushort16 v) {
+  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(ushort16 v) {
+  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(char16 v) {
+  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(char16 v) {
+  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(char16 v) {
+  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(char16 v) {
+  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(char16 v) {
+  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(char16 v) {
+  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(char16 v) {
+  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(char16 v) {
+  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(char16 v) {
+  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(char16 v) {
+  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(char16 v) {
+  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(char16 v) {
+  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(char16 v) {
+  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(char16 v) {
+  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(char16 v) {
+  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(char16 v) {
+  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(char16 v) {
+  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(char16 v) {
+  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(char16 v) {
+  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(char16 v) {
+  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(char16 v) {
+  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(char16 v) {
+  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(char16 v) {
+  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(char16 v) {
+  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(char16 v) {
+  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(char16 v) {
+  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(char16 v) {
+  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(char16 v) {
+  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(char16 v) {
+  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(char16 v) {
+  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(char16 v) {
+  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(char16 v) {
+  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(uchar16 v) {
+  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(uchar16 v) {
+  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(uchar16 v) {
+  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(uchar16 v) {
+  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(uchar16 v) {
+  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(uchar16 v) {
+  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(uchar16 v) {
+  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(uchar16 v) {
+  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(uchar16 v) {
+  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(uchar16 v) {
+  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(uchar16 v) {
+  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(uchar16 v) {
+  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(uchar16 v) {
+  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(uchar16 v) {
+  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(uchar16 v) {
+  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(uchar16 v) {
+  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(uchar16 v) {
+  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(uchar16 v) {
+  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(uchar16 v) {
+  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(uchar16 v) {
+  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(uchar16 v) {
+  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(uchar16 v) {
+  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(uchar16 v) {
+  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(uchar16 v) {
+  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(uchar16 v) {
+  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(uchar16 v) {
+  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(uchar16 v) {
+  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(uchar16 v) {
+  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(uchar16 v) {
+  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(uchar16 v) {
+  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(uchar16 v) {
+  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(uchar16 v) {
+  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rte(float16 v) {
+  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtz(float16 v) {
+  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtp(float16 v) {
+  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE long16 convert_long16_sat_rtn(float16 v) {
+  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(float16 v) {
+  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(float16 v) {
+  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(float16 v) {
+  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(float16 v) {
+  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rte(float16 v) {
+  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtz(float16 v) {
+  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtp(float16 v) {
+  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE int16 convert_int16_sat_rtn(float16 v) {
+  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(float16 v) {
+  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(float16 v) {
+  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(float16 v) {
+  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(float16 v) {
+  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rte(float16 v) {
+  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtz(float16 v) {
+  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtp(float16 v) {
+  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE short16 convert_short16_sat_rtn(float16 v) {
+  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(float16 v) {
+  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(float16 v) {
+  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(float16 v) {
+  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(float16 v) {
+  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rte(float16 v) {
+  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtz(float16 v) {
+  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtp(float16 v) {
+  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE char16 convert_char16_sat_rtn(float16 v) {
+  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(float16 v) {
+  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(float16 v) {
+  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(float16 v) {
+  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
+}
+
+INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(float16 v) {
+  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
+}
+
diff --git a/backend/src/ocl_memcpy.ll b/backend/src/ocl_memcpy.ll
new file mode 100644
index 0000000..476033e
--- /dev/null
+++ b/backend/src/ocl_memcpy.ll
@@ -0,0 +1,336 @@
+; The memcpy source code (C reference for the IR routines below):
+; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, size_t size) {
+;   size_t index = 0;
+;   while((index + 4) >= size) {
+;     *((uint *)(dst + index)) = *((uint *)(src + index));
+;     index += 4;
+;   }
+;   while(index < size) {
+;     dst[index] = src[index];
+;     index++;
+;   }
+; }
+
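+; Each variant below is named after its destination and source address spaces,
+; as given by the pointer types in its signature: g = global (addrspace 1),
+; p = private (addrspace 0), l = local (addrspace 3). __gen_memcpy_gp, for
+; example, copies from a private pointer into a global one.
+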
+define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+  %1 = load i32 addrspace(1)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+  store i32 %1, i32 addrspace(1)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
+  %3 = load i8 addrspace(1)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
+  %1 = load i32 addrspace(0)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+  store i32 %1, i32 addrspace(1)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
+  %3 = load i8 addrspace(0)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+  %1 = load i32 addrspace(3)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+  store i32 %1, i32 addrspace(1)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
+  %3 = load i8 addrspace(3)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+  %1 = load i32 addrspace(1)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
+  store i32 %1, i32 addrspace(0)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
+  %3 = load i8 addrspace(1)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
+  %1 = load i32 addrspace(0)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
+  store i32 %1, i32 addrspace(0)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
+  %3 = load i8 addrspace(0)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+  %1 = load i32 addrspace(3)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
+  store i32 %1, i32 addrspace(0)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
+  %3 = load i8 addrspace(3)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+  %1 = load i32 addrspace(1)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
+  store i32 %1, i32 addrspace(3)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
+  %3 = load i8 addrspace(1)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
+  %1 = load i32 addrspace(0)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
+  store i32 %1, i32 addrspace(3)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
+  %3 = load i8 addrspace(0)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+  %1 = load i32 addrspace(3)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
+  store i32 %1, i32 addrspace(3)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
+  %3 = load i8 addrspace(3)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
diff --git a/backend/src/ocl_memset.ll b/backend/src/ocl_memset.ll
new file mode 100644
index 0000000..addf9f5
--- /dev/null
+++ b/backend/src/ocl_memset.ll
@@ -0,0 +1,127 @@
+; The memset source code (C reference for the IR routines below):
+; INLINE_OVERLOADABLE void __gen_memset(uchar* dst, uchar val, size_t size) {
+;   size_t index = 0;
+;   uint v = (val << 24) | (val << 16) | (val << 8) | val;
+;   while((index + 4) >= size) {
+;     *((uint *)(dst + index)) = v;
+;     index += 4;
+;   }
+;   while(index < size) {
+;     dst[index] = val;
+;     index++;
+;   }
+; }
+
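+; The suffix names the destination address space taken from the signature:
+; _p = private (addrspace 0), _g = global (addrspace 1), _l = local (addrspace 3).
+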
+define void @__gen_memset_p(i8* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+entry:
+  %conv = zext i8 %val to i32
+  %shl = shl nuw i32 %conv, 24
+  %shl2 = shl nuw nsw i32 %conv, 16
+  %or = or i32 %shl, %shl2
+  %shl4 = shl nuw nsw i32 %conv, 8
+  %or5 = or i32 %or, %shl4
+  %or7 = or i32 %or5, %conv
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond10, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8* %dst, i32 %index.0
+  %0 = bitcast i8* %add.ptr to i32*
+  store i32 %or7, i32* %0, align 4
+  br label %while.cond
+
+while.cond10:                                     ; preds = %while.cond, %while.body13
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
+  %cmp11 = icmp ult i32 %index.1, %size
+  br i1 %cmp11, label %while.body13, label %while.end14
+
+while.body13:                                     ; preds = %while.cond10
+  %arrayidx = getelementptr inbounds i8* %dst, i32 %index.1
+  store i8 %val, i8* %arrayidx, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond10
+
+while.end14:                                      ; preds = %while.cond10
+  ret void
+}
+
+define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+entry:
+  %conv = zext i8 %val to i32
+  %shl = shl nuw i32 %conv, 24
+  %shl2 = shl nuw nsw i32 %conv, 16
+  %or = or i32 %shl, %shl2
+  %shl4 = shl nuw nsw i32 %conv, 8
+  %or5 = or i32 %or, %shl4
+  %or7 = or i32 %or5, %conv
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond10, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+  store i32 %or7, i32 addrspace(1)* %0, align 4
+  br label %while.cond
+
+while.cond10:                                     ; preds = %while.cond, %while.body13
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
+  %cmp11 = icmp ult i32 %index.1, %size
+  br i1 %cmp11, label %while.body13, label %while.end14
+
+while.body13:                                     ; preds = %while.cond10
+  %arrayidx = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+  store i8 %val, i8 addrspace(1)* %arrayidx, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond10
+
+while.end14:                                      ; preds = %while.cond10
+  ret void
+}
+
+define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+entry:
+  %conv = zext i8 %val to i32
+  %shl = shl nuw i32 %conv, 24
+  %shl2 = shl nuw nsw i32 %conv, 16
+  %or = or i32 %shl, %shl2
+  %shl4 = shl nuw nsw i32 %conv, 8
+  %or5 = or i32 %or, %shl4
+  %or7 = or i32 %or5, %conv
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond10, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+  store i32 %or7, i32 addrspace(3)* %0, align 4
+  br label %while.cond
+
+while.cond10:                                     ; preds = %while.cond, %while.body13
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
+  %cmp11 = icmp ult i32 %index.1, %size
+  br i1 %cmp11, label %while.body13, label %while.end14
+
+while.body13:                                     ; preds = %while.cond10
+  %arrayidx = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+  store i8 %val, i8 addrspace(3)* %arrayidx, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond10
+
+while.end14:                                      ; preds = %while.cond10
+  ret void
+}
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
new file mode 100755
index 0000000..f648a8c
--- /dev/null
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -0,0 +1,5160 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GEN_OCL_STDLIB_H__
+#define __GEN_OCL_STDLIB_H__
+
+#define INLINE inline __attribute__((always_inline))
+#define OVERLOADABLE __attribute__((overloadable))
+#define PURE __attribute__((pure))
+#define CONST __attribute__((const))
+#define INLINE_OVERLOADABLE inline __attribute__((overloadable,always_inline))
+// FIXME: clang's OpenCL frontend doesn't support the static keyword yet, so it is defined away here.
+#define static
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL built-in scalar data types
+/////////////////////////////////////////////////////////////////////////////
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+typedef unsigned long ulong;
+typedef __typeof__(sizeof(int)) size_t;
+typedef __typeof__((int *)0-(int *)0) ptrdiff_t;
+typedef signed int intptr_t;
+typedef unsigned int uintptr_t;
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL address space
+/////////////////////////////////////////////////////////////////////////////
+// These are built-ins from LLVM 3.3 on, so only define them for older clang.
+#if 100*__clang_major__ + __clang_minor__ <= 302
+#define __private __attribute__((address_space(0)))
+#define __global __attribute__((address_space(1)))
+#define __constant __attribute__((address_space(2)))
+#define __local __attribute__((address_space(3)))
+#define global __global
+#define local __local
+#define constant __constant
+#define private __private
+#endif
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL built-in vector data types
+/////////////////////////////////////////////////////////////////////////////
+#define DEF(type) typedef type type##2 __attribute__((ext_vector_type(2)));\
+                  typedef type type##3 __attribute__((ext_vector_type(3)));\
+                  typedef type type##4 __attribute__((ext_vector_type(4)));\
+                  typedef type type##8 __attribute__((ext_vector_type(8)));\
+                  typedef type type##16 __attribute__((ext_vector_type(16)));
+DEF(char);
+DEF(uchar);
+DEF(short);
+DEF(ushort);
+DEF(int);
+DEF(uint);
+DEF(long);
+DEF(ulong);
+DEF(float);
+DEF(double);
+#undef DEF
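+// Illustrative example only: with the typedefs above a kernel can use the
+// usual OpenCL vector syntax, e.g.
+//   float4 v = (float4)(1.0f, 2.0f, 3.0f, 4.0f);
+//   float  s = v.s0 + v.w;   // component access via .s0-.sF or .xyzw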
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL other built-in data types
+/////////////////////////////////////////////////////////////////////////////
+// FIXME:
+// This is a transitional hack to bypass the LLVM 3.3 built-in types.
+// See the Khronos SPIR specification for handling of these types.
+#define __texture __attribute__((address_space(4)))
+struct _image1d_t;
+typedef __texture struct _image1d_t* __image1d_t;
+struct _image1d_buffer_t;
+typedef __texture struct _image1d_buffer_t* __image1d_buffer_t;
+struct _image1d_array_t;
+typedef __texture struct _image1d_array_t* __image1d_array_t;
+struct _image2d_t;
+typedef __texture struct _image2d_t* __image2d_t;
+struct _image2d_array_t;
+typedef __texture struct _image2d_array_t* __image2d_array_t;
+struct _image3d_t;
+typedef __texture struct _image3d_t* __image3d_t;
+typedef const ushort __sampler_t;
+typedef size_t __event_t;
+#define image1d_t __image1d_t
+#define image1d_buffer_t __image1d_buffer_t
+#define image1d_array_t __image1d_array_t
+#define image2d_t __image2d_t
+#define image2d_array_t __image2d_array_t
+#define image3d_t __image3d_t
+#define sampler_t __sampler_t
+#define event_t __event_t
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL conversions & type casting
+/////////////////////////////////////////////////////////////////////////////
+
+// ##BEGIN_AS##
+
+// ##END_AS##
+
+// ##BEGIN_CONVERT##
+
+// ##END_CONVERT##
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL preprocessor directives & macros
+/////////////////////////////////////////////////////////////////////////////
+#define __OPENCL_VERSION__ 120
+#define __CL_VERSION_1_0__ 100
+#define __CL_VERSION_1_1__ 110
+#define __CL_VERSION_1_2__ 120
+
+#define __ENDIAN_LITTLE__ 1
+#define __IMAGE_SUPPORT__ 1
+#define __kernel_exec(X, TYPE) __kernel __attribute__((work_group_size_hint(X,1,1))) \
+                                        __attribute__((vec_type_hint(TYPE)))
+#define kernel_exec(X, TYPE) __kernel_exec(X, TYPE)
+#define cl_khr_global_int32_base_atomics
+#define cl_khr_global_int32_extended_atomics
+#define cl_khr_local_int32_base_atomics
+#define cl_khr_local_int32_extended_atomics
+#define cl_khr_byte_addressable_store
+#define cl_khr_icd
+#define cl_khr_gl_sharing
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL floating-point macros and pragmas
+/////////////////////////////////////////////////////////////////////////////
+#define FLT_DIG 6
+#define FLT_MANT_DIG 24
+#define FLT_MAX_10_EXP +38
+#define FLT_MAX_EXP +128
+#define FLT_MIN_10_EXP -37
+#define FLT_MIN_EXP -125
+#define FLT_RADIX 2
+#define FLT_ONE 1.0000000000e+00         /* 0x3F800000 */
+#define FLT_MAX 0x1.fffffep127f
+#define FLT_MIN 0x1.0p-126f
+#define FLT_EPSILON 0x1.0p-23f
+
+#define MAXFLOAT     3.40282347e38F
+INLINE_OVERLOADABLE float __ocl_inff(void) {
+  union { uint u; float f; } u;
+  u.u = 0x7F800000;
+  return u.f;
+}
+INLINE_OVERLOADABLE float __ocl_nanf(void) {
+  union { uint u; float f; } u;
+  u.u = 0x7F800001;
+  return u.f;
+}
+typedef union
+{
+  float value;
+  uint  word;
+} float_shape_type;
+
+/* Get a 32 bit int from a float.  */
+#ifndef GEN_OCL_GET_FLOAT_WORD
+# define GEN_OCL_GET_FLOAT_WORD(i,d)  \
+do {                                  \
+  float_shape_type gf_u;              \
+  gf_u.value = (d);                   \
+  (i) = gf_u.word;                    \
+} while (0)
+#endif
+/* Set a float from a 32 bit int.  */
+#ifndef GEN_OCL_SET_FLOAT_WORD
+# define GEN_OCL_SET_FLOAT_WORD(d,i)  \
+do {                                  \
+  float_shape_type sf_u;              \
+  sf_u.word = (i);                    \
+  (d) = sf_u.value;                   \
+} while (0)
+#endif
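+/* For example, GEN_OCL_GET_FLOAT_WORD(i, 1.0f) leaves i == 0x3f800000, and
+   GEN_OCL_SET_FLOAT_WORD(f, 0x3f800000u) sets f to 1.0f. */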
+
+INLINE_OVERLOADABLE int __ocl_finitef (float x){
+  unsigned ix;
+  GEN_OCL_GET_FLOAT_WORD (ix, x);
+  return (ix & 0x7fffffff) < 0x7f800000;
+}
+
+#define HUGE_VALF    (__ocl_inff())
+#define INFINITY     (__ocl_inff())
+#define NAN          (__ocl_nanf())
+#define M_E_F        2.718281828459045F
+#define M_LOG2E_F    1.4426950408889634F
+#define M_LOG10E_F   0.43429448190325176F
+#define M_LN2_F      0.6931471805599453F
+#define M_LN10_F     2.302585092994046F
+#define M_PI_F       3.141592653589793F
+#define M_PI_2_F     1.5707963267948966F
+#define M_PI_4_F     0.7853981633974483F
+#define M_1_PI_F     0.3183098861837907F
+#define M_2_PI_F     0.6366197723675814F
+#define M_2_SQRTPI_F 1.1283791670955126F
+#define M_SQRT2_F    1.4142135623730951F
+#define M_SQRT1_2_F  0.7071067811865476F
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL integer built-in macros
+/////////////////////////////////////////////////////////////////////////////
+#define CHAR_BIT    8
+#define CHAR_MAX    SCHAR_MAX
+#define CHAR_MIN    SCHAR_MIN
+#define INT_MAX     2147483647
+#define INT_MIN     (-2147483647 - 1)
+#define LONG_MAX    0x7fffffffffffffffL
+#define LONG_MIN    (-0x7fffffffffffffffL - 1)
+#define SCHAR_MAX   127
+#define SCHAR_MIN   (-127 - 1)
+#define SHRT_MAX    32767
+#define SHRT_MIN    (-32767 - 1)
+#define UCHAR_MAX   255
+#define USHRT_MAX   65535
+#define UINT_MAX    0xffffffff
+#define ULONG_MAX   0xffffffffffffffffUL
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL relational built-in functions
+/////////////////////////////////////////////////////////////////////////////
+
+int INLINE_OVERLOADABLE isequal(float x, float y) { return x == y; }
+int INLINE_OVERLOADABLE isnotequal(float x, float y) { return x != y; }
+int INLINE_OVERLOADABLE isgreater(float x, float y) { return x > y; }
+int INLINE_OVERLOADABLE isgreaterequal(float x, float y) { return x >= y; }
+int INLINE_OVERLOADABLE isless(float x, float y) { return x < y; }
+int INLINE_OVERLOADABLE islessequal(float x, float y) { return x <= y; }
+int INLINE_OVERLOADABLE islessgreater(float x, float y) { return (x < y) || (x > y); }
+
+#define SDEF(TYPE)                                                              \
+OVERLOADABLE TYPE ocl_sadd_sat(TYPE x, TYPE y);                          \
+OVERLOADABLE TYPE ocl_ssub_sat(TYPE x, TYPE y);                          \
+INLINE_OVERLOADABLE TYPE add_sat(TYPE x, TYPE y) { return ocl_sadd_sat(x, y); } \
+INLINE_OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y) { return ocl_ssub_sat(x, y); }
+SDEF(char);
+SDEF(short);
+#undef SDEF
+OVERLOADABLE int ocl_sadd_sat(int x, int y);
+INLINE_OVERLOADABLE int add_sat(int x, int y) { return ocl_sadd_sat(x, y); }
+OVERLOADABLE int ocl_ssub_sat(int x, int y);
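+// sub_sat(x, INT_MIN) would need -INT_MIN, which is not representable, so that
+// case is computed as a saturating x + INT_MAX + 1 instead.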
+INLINE_OVERLOADABLE int sub_sat(int x, int y) {
+  return (y == 0x80000000u) ? (ocl_sadd_sat(ocl_sadd_sat(0x7fffffff, x), 1)) : ocl_ssub_sat(x, y);
+}
+OVERLOADABLE long ocl_sadd_sat(long x, long y);
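+// When the sign bits of x and y differ their sum cannot overflow, so the
+// saturating helper is only needed when both operands have the same sign.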
+INLINE_OVERLOADABLE long add_sat(long x, long y) {
+  union {long l; uint i[2];} ux, uy;
+  ux.l = x;
+  uy.l = y;
+  if((ux.i[1] ^ uy.i[1]) & 0x80000000u)
+    return x + y;
+  return ocl_sadd_sat(x, y);
+}
+OVERLOADABLE long ocl_ssub_sat(long x, long y);
+INLINE_OVERLOADABLE long sub_sat(long x, long y) {
+  union {long l; uint i[2];} ux, uy;
+  ux.l = x;
+  uy.l = y;
+  if((ux.i[1] ^ uy.i[1]) & 0x80000000u)
+    return ocl_ssub_sat(x, y);
+  return x - y;
+}
+#define UDEF(TYPE)                                                              \
+OVERLOADABLE TYPE ocl_uadd_sat(TYPE x, TYPE y);                          \
+OVERLOADABLE TYPE ocl_usub_sat(TYPE x, TYPE y);                          \
+INLINE_OVERLOADABLE TYPE add_sat(TYPE x, TYPE y) { return ocl_uadd_sat(x, y); } \
+INLINE_OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y) { return ocl_usub_sat(x, y); }
+UDEF(uchar);
+UDEF(ushort);
+UDEF(uint);
+UDEF(ulong);
+#undef UDEF
+
+INLINE_OVERLOADABLE int isfinite(float x) {
+  union { uint u; float f; } u;
+  u.f = x;
+  return (u.u & 0x7FFFFFFF) < 0x7F800000;
+}
+INLINE_OVERLOADABLE int isinf(float x) {
+  union { uint u; float f; } u;
+  u.f = x;
+  return (u.u & 0x7FFFFFFF) == 0x7F800000;
+}
+INLINE_OVERLOADABLE int isnan(float x) {
+  return x != x;
+}
+INLINE_OVERLOADABLE int isnormal(float x) {
+  union { uint u; float f; } u;
+  u.f = x;
+  u.u &= 0x7FFFFFFF;
+  return (u.u < 0x7F800000) && (u.u >= 0x800000);
+}
+INLINE_OVERLOADABLE int isordered(float x, float y) { return isequal(x, x) && isequal(y, y); }
+INLINE_OVERLOADABLE int isunordered(float x, float y) { return isnan(x) || isnan(y); }
+INLINE_OVERLOADABLE int signbit(float x) {
+  union { uint u; float f; } u;
+  u.f = x;
+  return u.u >> 31;
+}
+
+#define DEC1(type) INLINE_OVERLOADABLE int any(type a) { return a<0; }
+#define DEC2(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0; }
+#define DEC3(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0; }
+#define DEC4(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0 || a.s3<0; }
+#define DEC8(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0 || a.s3<0 || a.s4<0 || a.s5<0 || a.s6<0 || a.s7<0; }
+#define DEC16(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0 || a.s3<0 || a.s4<0 || a.s5<0 || a.s6<0 || a.s7<0 || a.s8<0 || a.s9<0 || a.sA<0 || a.sB<0 || a.sC<0 || a.sD<0 || a.sE<0 || a.sF<0; }
+DEC1(char);
+DEC1(short);
+DEC1(int);
+DEC1(long);
+#define DEC(n) DEC##n(char##n); DEC##n(short##n); DEC##n(int##n); DEC##n(long##n);
+DEC(2);
+DEC(3);
+DEC(4);
+DEC(8);
+DEC(16);
+#undef DEC
+#undef DEC1
+#undef DEC2
+#undef DEC3
+#undef DEC4
+#undef DEC8
+#undef DEC16
+#define DEC1(type) INLINE_OVERLOADABLE int all(type a) { return a<0; }
+#define DEC2(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0; }
+#define DEC3(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0; }
+#define DEC4(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0 && a.s3<0; }
+#define DEC8(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0 && a.s3<0 && a.s4<0 && a.s5<0 && a.s6<0 && a.s7<0; }
+#define DEC16(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0 && a.s3<0 && a.s4<0 && a.s5<0 && a.s6<0 && a.s7<0 && a.s8<0 && a.s9<0 && a.sA<0 && a.sB<0 && a.sC<0 && a.sD<0 && a.sE<0 && a.sF<0; }
+DEC1(char);
+DEC1(short);
+DEC1(int);
+DEC1(long);
+#define DEC(n) DEC##n(char##n); DEC##n(short##n); DEC##n(int##n); DEC##n(long##n);
+DEC(2);
+DEC(3);
+DEC(4);
+DEC(8);
+DEC(16);
+#undef DEC
+#undef DEC1
+#undef DEC2
+#undef DEC3
+#undef DEC4
+#undef DEC8
+#undef DEC16
+
+#define DEF(type) INLINE_OVERLOADABLE type bitselect(type a, type b, type c) { return (a & ~c) | (b & c); }
+DEF(char); DEF(uchar); DEF(short); DEF(ushort); DEF(int); DEF(uint)
+DEF(long); DEF(ulong)
+#undef DEF
+INLINE_OVERLOADABLE float bitselect(float a, float b, float c) {
+  return as_float(bitselect(as_int(a), as_int(b), as_int(c)));
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Integer built-in functions
+/////////////////////////////////////////////////////////////////////////////
+PURE CONST uint __gen_ocl_fbh(uint);
+PURE CONST uint __gen_ocl_fbl(uint);
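+// Judging from the clz overloads below, __gen_ocl_fbh(x) returns the number of
+// leading zero bits of its 32-bit argument; the x == 0 case is handled
+// explicitly by each caller.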
+
+INLINE_OVERLOADABLE char clz(char x) {
+  if (x < 0)
+    return 0;
+  if (x == 0)
+    return 8;
+  return __gen_ocl_fbh(x) - 24;
+}
+
+INLINE_OVERLOADABLE uchar clz(uchar x) {
+  if (x == 0)
+    return 8;
+  return __gen_ocl_fbh(x) - 24;
+}
+
+INLINE_OVERLOADABLE short clz(short x) {
+  if (x < 0)
+    return 0;
+  if (x == 0)
+    return 16;
+  return __gen_ocl_fbh(x) - 16;
+}
+
+INLINE_OVERLOADABLE ushort clz(ushort x) {
+  if (x == 0)
+    return 16;
+  return __gen_ocl_fbh(x) - 16;
+}
+
+INLINE_OVERLOADABLE int clz(int x) {
+  if (x < 0)
+    return 0;
+  if (x == 0)
+    return 32;
+  return __gen_ocl_fbh(x);
+}
+
+INLINE_OVERLOADABLE uint clz(uint x) {
+  if (x == 0)
+    return 32;
+  return __gen_ocl_fbh(x);
+}
+
+INLINE_OVERLOADABLE long clz(long x) {
+  union { int i[2]; long x; } u;
+  u.x = x;
+  if (u.i[1] & 0x80000000u)
+    return 0;
+  if (u.i[1] == 0 && u.i[0] == 0)
+    return 64;
+  uint v = clz(u.i[1]);
+  if(v == 32)
+    v += clz(u.i[0]);
+  return v;
+}
+
+INLINE_OVERLOADABLE ulong clz(ulong x) {
+  if (x == 0)
+    return 64;
+  union { uint i[2]; ulong x; } u;
+  u.x = x;
+  uint v = clz(u.i[1]);
+  if(v == 32)
+    v += clz(u.i[0]);
+  return v;
+}
+
+OVERLOADABLE int __gen_ocl_mul_hi(int x, int y);
+OVERLOADABLE uint __gen_ocl_mul_hi(uint x, uint y);
+OVERLOADABLE long __gen_ocl_mul_hi(long x, long y);
+OVERLOADABLE ulong __gen_ocl_mul_hi(ulong x, ulong y);
+INLINE_OVERLOADABLE char mul_hi(char x, char y) { return (x * y) >> 8; }
+INLINE_OVERLOADABLE uchar mul_hi(uchar x, uchar y) { return (x * y) >> 8; }
+INLINE_OVERLOADABLE short mul_hi(short x, short y) { return (x * y) >> 16; }
+INLINE_OVERLOADABLE ushort mul_hi(ushort x, ushort y) { return (x * y) >> 16; }
+INLINE_OVERLOADABLE int mul_hi(int x, int y) { return __gen_ocl_mul_hi(x, y); }
+INLINE_OVERLOADABLE uint mul_hi(uint x, uint y) { return __gen_ocl_mul_hi(x, y); }
+INLINE_OVERLOADABLE long mul_hi(long x, long y) {
+  return __gen_ocl_mul_hi(x, y);
+}
+INLINE_OVERLOADABLE ulong mul_hi(ulong x, ulong y) {
+  return __gen_ocl_mul_hi(x, y);
+}
+
+#define DEF(type) INLINE_OVERLOADABLE type mad_hi(type a, type b, type c) { return mul_hi(a, b) + c; }
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(long)
+DEF(ulong)
+#undef DEF
+
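+// Only the low 24 bits of each mul24 operand are significant: the signed
+// variant sign-extends them with the shift pair, the unsigned variant masks.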
+INLINE_OVERLOADABLE int mul24(int a, int b) { return ((a << 8) >> 8) * ((b << 8) >> 8); }
+INLINE_OVERLOADABLE uint mul24(uint a, uint b) { return (a & 0xFFFFFF) * (b & 0xFFFFFF); }
+
+INLINE_OVERLOADABLE int mad24(int a, int b, int c) { return mul24(a, b) + c; }
+INLINE_OVERLOADABLE uint mad24(uint a, uint b, uint c) { return mul24(a, b) + c; }
+
+INLINE_OVERLOADABLE char mad_sat(char a, char b, char c) {
+  int x = (int)a * (int)b + (int)c;
+  if (x > 127)
+    x = 127;
+  if (x < -128)
+    x = -128;
+  return x;
+}
+
+INLINE_OVERLOADABLE uchar mad_sat(uchar a, uchar b, uchar c) {
+  uint x = (uint)a * (uint)b + (uint)c;
+  if (x > 255)
+    x = 255;
+  return x;
+}
+
+INLINE_OVERLOADABLE short mad_sat(short a, short b, short c) {
+  int x = (int)a * (int)b + (int)c;
+  if (x > 32767)
+    x = 32767;
+  if (x < -32768)
+    x = -32768;
+  return x;
+}
+
+INLINE_OVERLOADABLE ushort mad_sat(ushort a, ushort b, ushort c) {
+  uint x = (uint)a * (uint)b + (uint)c;
+  if (x > 65535)
+    x = 65535;
+  return x;
+}
+
+INLINE_OVERLOADABLE int mad_sat(int a, int b, int c) {
+  long x = (long)a * (long)b + (long)c;
+  if (x > 0x7FFFFFFF)
+    x = 0x7FFFFFFF;
+  else if (x < -0x7FFFFFFF-1)
+    x = -0x7FFFFFFF-1;
+  return (int)x;
+}
+
+INLINE_OVERLOADABLE uint mad_sat(uint a, uint b, uint c) {
+  ulong x = (ulong)a * (ulong)b + (ulong)c;
+  if (x > 0xFFFFFFFFu)
+    x = 0xFFFFFFFFu;
+  return (uint)x;
+}
+
+OVERLOADABLE long __gen_ocl_mad_sat(long a, long b, long c);
+OVERLOADABLE ulong __gen_ocl_mad_sat(ulong a, ulong b, ulong c);
+
+INLINE_OVERLOADABLE long mad_sat(long a, long b, long c) {
+  return __gen_ocl_mad_sat(a, b, c);
+}
+
+INLINE_OVERLOADABLE ulong mad_sat(ulong a, ulong b, ulong c) {
+  return __gen_ocl_mad_sat(a, b, c);
+}
+
+INLINE_OVERLOADABLE uchar __rotate_left(uchar x, uchar y) { return (x << y) | (x >> (8 - y)); }
+INLINE_OVERLOADABLE char __rotate_left(char x, char y) { return __rotate_left((uchar)x, (uchar)y); }
+INLINE_OVERLOADABLE ushort __rotate_left(ushort x, ushort y) { return (x << y) | (x >> (16 - y)); }
+INLINE_OVERLOADABLE short __rotate_left(short x, short y) { return __rotate_left((ushort)x, (ushort)y); }
+INLINE_OVERLOADABLE uint __rotate_left(uint x, uint y) { return (x << y) | (x >> (32 - y)); }
+INLINE_OVERLOADABLE int __rotate_left(int x, int y) { return __rotate_left((uint)x, (uint)y); }
+INLINE_OVERLOADABLE ulong __rotate_left(ulong x, ulong y) { return (x << y) | (x >> (64 - y)); }
+INLINE_OVERLOADABLE long __rotate_left(long x, long y) { return __rotate_left((ulong)x, (ulong)y); }
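+// rotate() masks the rotate count to the bit width of the type before handing
+// it to __rotate_left.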
+#define DEF(type, m) INLINE_OVERLOADABLE type rotate(type x, type y) { return __rotate_left(x, (type)(y & m)); }
+DEF(char, 7)
+DEF(uchar, 7)
+DEF(short, 15)
+DEF(ushort, 15)
+DEF(int, 31)
+DEF(uint, 31)
+DEF(long, 63)
+DEF(ulong, 63)
+#undef DEF
+
+OVERLOADABLE short __gen_ocl_upsample(short hi, short lo);
+OVERLOADABLE int __gen_ocl_upsample(int hi, int lo);
+OVERLOADABLE long __gen_ocl_upsample(long hi, long lo);
+INLINE_OVERLOADABLE short upsample(char hi, uchar lo) { return __gen_ocl_upsample((short)hi, (short)lo); }
+INLINE_OVERLOADABLE ushort upsample(uchar hi, uchar lo) { return __gen_ocl_upsample((short)hi, (short)lo); }
+INLINE_OVERLOADABLE int upsample(short hi, ushort lo) { return __gen_ocl_upsample((int)hi, (int)lo); }
+INLINE_OVERLOADABLE uint upsample(ushort hi, ushort lo) { return __gen_ocl_upsample((int)hi, (int)lo); }
+INLINE_OVERLOADABLE long upsample(int hi, uint lo) {
+  return __gen_ocl_upsample((long)hi, (long)lo);
+}
+INLINE_OVERLOADABLE ulong upsample(uint hi, uint lo) {
+  return __gen_ocl_upsample((long)hi, (long)lo);
+}
+
+OVERLOADABLE uint __gen_ocl_hadd(uint x, uint y);
+OVERLOADABLE uint __gen_ocl_rhadd(uint x, uint y);
+#define DEC DEF(char); DEF(uchar); DEF(short); DEF(ushort)
+#define DEF(type) INLINE_OVERLOADABLE type hadd(type x, type y) { return (x + y) >> 1; }
+DEC
+#undef DEF
+#define DEF(type) INLINE_OVERLOADABLE type rhadd(type x, type y) { return (x + y + 1) >> 1; }
+DEC
+#undef DEF
+#undef DEC
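+// For int and long the shifted sum is used only when the operands have
+// opposite signs (the sum cannot overflow then); otherwise the unsigned
+// __gen_ocl_(r)hadd builtin is applied to the same bit pattern.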
+INLINE_OVERLOADABLE int hadd(int x, int y) {
+  return (x < 0 && y > 0) || (x > 0 && y < 0) ?
+         ((x + y) >> 1) :
+         __gen_ocl_hadd((uint)x, (uint)y);
+}
+INLINE_OVERLOADABLE uint hadd(uint x, uint y) { return __gen_ocl_hadd(x, y); }
+INLINE_OVERLOADABLE int rhadd(int x, int y) {
+  return (x < 0 && y > 0) || (x > 0 && y < 0) ?
+         ((x + y + 1) >> 1) :
+         __gen_ocl_rhadd((uint)x, (uint)y);
+}
+INLINE_OVERLOADABLE uint rhadd(uint x, uint y) { return __gen_ocl_rhadd(x, y); }
+OVERLOADABLE ulong __gen_ocl_hadd(ulong x, ulong y);
+OVERLOADABLE ulong __gen_ocl_rhadd(ulong x, ulong y);
+INLINE_OVERLOADABLE long hadd(long x, long y) {
+  return (x < 0 && y > 0) || (x > 0 && y < 0) ?
+         ((x + y) >> 1) :
+         __gen_ocl_hadd((ulong)x, (ulong)y);
+}
+INLINE_OVERLOADABLE ulong hadd(ulong x, ulong y) {
+  return __gen_ocl_hadd(x, y);
+}
+INLINE_OVERLOADABLE long rhadd(long x, long y) {
+  return (x < 0 && y > 0) || (x > 0 && y < 0) ?
+         ((x + y + 1) >> 1) :
+         __gen_ocl_rhadd((ulong)x, (ulong)y);
+}
+INLINE_OVERLOADABLE ulong rhadd(ulong x, ulong y) {
+  return __gen_ocl_rhadd(x, y);
+}
+
+int __gen_ocl_abs(int x);
+#define DEC(TYPE) INLINE_OVERLOADABLE u##TYPE abs(TYPE x) { return (u##TYPE) __gen_ocl_abs(x); }
+DEC(int)
+DEC(short)
+DEC(char)
+#undef DEC
+INLINE_OVERLOADABLE ulong abs(long x) { return x < 0 ? -x : x; }
+/* For unsigned types, do nothing. */
+#define DEC(TYPE) INLINE_OVERLOADABLE TYPE abs(TYPE x) { return x; }
+DEC(uint)
+DEC(ushort)
+DEC(uchar)
+DEC(ulong)
+#undef DEC
+
+/* abs_diff for the char and short types: */
+/* promote to int first so the subtraction cannot overflow. */
+#define DEC(TYPE, UTYPE) INLINE_OVERLOADABLE UTYPE abs_diff(TYPE x, TYPE y) \
+                         { return (UTYPE) (abs((int)x - (int)y)); }
+DEC(char, uchar)
+DEC(uchar, uchar)
+DEC(short, ushort)
+DEC(ushort, ushort)
+#undef DEC
+
+INLINE_OVERLOADABLE uint abs_diff (uint x, uint y) {
+    /* Subtracting the smaller operand from the larger one cannot overflow. */
+    return y > x ? (y - x) : (x - y);
+}
+
+INLINE_OVERLOADABLE uint abs_diff (int x, int y) {
+    /* When x and y have the same sign, x - y cannot overflow. */
+    if ((x >= 0 && y >= 0) || (x <= 0 && y <= 0))
+        return abs(x - y);
+
+    return (abs(x) + abs(y));
+}
+
+INLINE_OVERLOADABLE ulong abs_diff (long x, long y) {
+  if ((x >= 0 && y >= 0) || (x <= 0 && y <= 0))
+    return abs(x - y);
+  return abs(x) + abs(y);
+}
+INLINE_OVERLOADABLE ulong abs_diff (ulong x, ulong y) {
+  return y > x ? (y - x) : (x - y);
+}
+
+
+/////////////////////////////////////////////////////////////////////////////
+// SIMD level function
+/////////////////////////////////////////////////////////////////////////////
+short __gen_ocl_simd_any(short);
+short __gen_ocl_simd_all(short);
+
+
+/////////////////////////////////////////////////////////////////////////////
+// Work Items functions (see 6.11.1 of OCL 1.1 spec)
+/////////////////////////////////////////////////////////////////////////////
+
+PURE CONST uint __gen_ocl_get_work_dim(void);
+INLINE uint get_work_dim(void) {
+  return __gen_ocl_get_work_dim();
+}
+
+#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \
+PURE CONST unsigned int __gen_ocl_##NAME##0(void); \
+PURE CONST unsigned int __gen_ocl_##NAME##1(void); \
+PURE CONST unsigned int __gen_ocl_##NAME##2(void);
+DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
+DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
+DECL_INTERNAL_WORK_ITEM_FN(get_local_size)
+DECL_INTERNAL_WORK_ITEM_FN(get_global_size)
+DECL_INTERNAL_WORK_ITEM_FN(get_global_offset)
+DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
+#undef DECL_INTERNAL_WORK_ITEM_FN
+
+#define DECL_PUBLIC_WORK_ITEM_FN(NAME, OTHER_RET)    \
+INLINE unsigned NAME(unsigned int dim) {             \
+  if (dim == 0) return __gen_ocl_##NAME##0();        \
+  else if (dim == 1) return __gen_ocl_##NAME##1();   \
+  else if (dim == 2) return __gen_ocl_##NAME##2();   \
+  else return OTHER_RET;                             \
+}
+
+DECL_PUBLIC_WORK_ITEM_FN(get_group_id, 0)
+DECL_PUBLIC_WORK_ITEM_FN(get_local_id, 0)
+DECL_PUBLIC_WORK_ITEM_FN(get_local_size, 1)
+DECL_PUBLIC_WORK_ITEM_FN(get_global_size, 1)
+DECL_PUBLIC_WORK_ITEM_FN(get_global_offset, 0)
+DECL_PUBLIC_WORK_ITEM_FN(get_num_groups, 1)
+#undef DECL_PUBLIC_WORK_ITEM_FN
+
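+// A work-item's global id is its local id offset by its group's origin, e.g.
+// with global_offset 0, local_size 16, group_id 2 and local_id 5 the global id
+// is 2 * 16 + 5 = 37.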
+INLINE uint get_global_id(uint dim) {
+  return get_local_id(dim) + get_local_size(dim) * get_group_id(dim) + get_global_offset(dim);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Math Functions (see 6.11.2 of OCL 1.1 spec)
+/////////////////////////////////////////////////////////////////////////////
+PURE CONST float __gen_ocl_fabs(float x);
+PURE CONST float __gen_ocl_sin(float x);
+PURE CONST float __gen_ocl_cos(float x);
+PURE CONST float __gen_ocl_sqrt(float x);
+PURE CONST float __gen_ocl_rsqrt(float x);
+PURE CONST float __gen_ocl_log(float x);
+PURE CONST float __gen_ocl_exp(float x);
+PURE CONST float __gen_ocl_pow(float x, float y);
+PURE CONST float __gen_ocl_rcp(float x);
+PURE CONST float __gen_ocl_rndz(float x);
+PURE CONST float __gen_ocl_rnde(float x);
+PURE CONST float __gen_ocl_rndu(float x);
+PURE CONST float __gen_ocl_rndd(float x);
+INLINE_OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
+  union { unsigned u; float f; } ux, uy;
+  ux.f = x;
+  uy.f = y;
+  ux.u = (ux.u & 0x7fffffff) | (uy.u & 0x80000000u);
+  return ux.f;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_log(float x) {
+/*
+ *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  union { unsigned int i; float f; } u;
+  const float
+  ln2_hi =   6.9313812256e-01,  /* 0x3f317180 */
+  ln2_lo =   9.0580006145e-06,  /* 0x3717f7d1 */
+  two25 =    3.355443200e+07, /* 0x4c000000 */
+  Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
+  Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
+  Lg3 = 2.8571429849e-01, /* 3E924925 */
+  Lg4 = 2.2222198546e-01, /* 3E638E29 */
+  Lg5 = 1.8183572590e-01, /* 3E3A3325 */
+  Lg6 = 1.5313838422e-01, /* 3E1CD04F */
+  Lg7 = 1.4798198640e-01; /* 3E178897 */
+
+  const float zero   =  0.0;
+  float hfsq,f,s,z,R,w,t1,t2,dk;
+  int k,ix,i,j;
+
+  u.f = x;  ix = u.i;
+  k=0;
+  if (ix < 0x00800000) {      /* x < 2**-126  */
+      if ((ix&0x7fffffff)==0)
+    return -two25/zero;   /* log(+-0)=-inf */
+      if (ix<0) return (x-x)/zero;  /* log(-#) = NaN */
+      return -INFINITY;  /* Gen does not support subnormal numbers yet */
+      //k -= 25; x *= two25; /* subnormal number, scale up x */
+      //u.f = x;  ix = u.i;
+  }
+  if (ix >= 0x7f800000) return x+x;
+  k += (ix>>23)-127;
+  ix &= 0x007fffff;
+  i = (ix+(0x95f64<<3))&0x800000;
+  u.i = ix|(i^0x3f800000); x = u.f;
+  k += (i>>23);
+  f = x-(float)1.0;
+  if((0x007fffff&(15+ix))<16) { /* |f| < 2**-20 */
+      if(f==zero) {
+        if(k==0) return zero;
+        else {
+          dk=(float)k; return dk*ln2_hi+dk*ln2_lo;
+        }
+      }
+      R = f*f*((float)0.5-(float)0.33333333333333333*f);
+      if(k==0)
+        return f-R;
+      else {
+        dk=(float)k;  return dk*ln2_hi-((R-dk*ln2_lo)-f);
+      }
+  }
+  s = f/((float)2.0+f);
+  dk = (float)k;
+  z = s*s;
+  i = ix-(0x6147a<<3);
+  w = z*z;
+  j = (0x6b851<<3)-ix;
+  t1= w*(Lg2+w*(Lg4+w*Lg6));
+  t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
+  i |= j;
+  R = t2+t1;
+  if(i>0) {
+      hfsq=(float)0.5*f*f;
+      if(k==0) return f-(hfsq-s*(hfsq+R)); else
+         return dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f);
+  } else {
+      if(k==0) return f-s*(f-R); else
+         return dk*ln2_hi-((s*(f-R)-dk*ln2_lo)-f);
+  }
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_log10(float x) {
+/*
+ *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  union {float f; unsigned i; }u;
+  const float
+  zero       = 0.0,
+  two25      =  3.3554432000e+07, /* 0x4c000000 */
+  ivln10     =  4.3429449201e-01, /* 0x3ede5bd9 */
+  log10_2hi  =  3.0102920532e-01, /* 0x3e9a2080 */
+  log10_2lo  =  7.9034151668e-07; /* 0x355427db */
+
+  float y,z;
+  int i,k,hx;
+
+  u.f = x; hx = u.i;
+  k=0;
+  if (hx < 0x00800000) {                  /* x < 2**-126  */
+    if ((hx&0x7fffffff)==0)
+      return -two25/zero;             /* log(+-0)=-inf */
+    if (hx<0) return NAN;        /* log(-#) = NaN */
+    return -INFINITY;      /* Gen does not support subnormal numbers yet */
+    //k -= 25; x *= two25; /* subnormal number, scale up x */
+    //u.f = x; hx = u.i;
+  }
+  if (hx >= 0x7f800000) return x+x;
+  k += (hx>>23)-127;
+  i  = ((unsigned)k&0x80000000)>>31;
+  hx = (hx&0x007fffff)|((0x7f-i)<<23);
+  y  = (float)(k+i);
+  u.i = hx; x = u.f;
+  z  = y*log10_2lo + ivln10*__gen_ocl_internal_log(x);
+  return  z+y*log10_2hi;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_log2(float x) {
+/*
+ *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ *  adapted for log2 by Ulrich Drepper <drepper at cygnus.com>
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  const float zero   =  0.0,
+  ln2 = 0.69314718055994530942,
+  two25 =    3.355443200e+07, /** 0x4c000000 */
+  Lg1 = 6.6666668653e-01, /** 3F2AAAAB */
+  Lg2 = 4.0000000596e-01, /** 3ECCCCCD */
+  Lg3 = 2.8571429849e-01, /** 3E924925 */
+  Lg4 = 2.2222198546e-01, /** 3E638E29 */
+  Lg5 = 1.8183572590e-01, /** 3E3A3325 */
+  Lg6 = 1.5313838422e-01, /** 3E1CD04F */
+  Lg7 = 1.4798198640e-01; /** 3E178897 */
+
+  float hfsq,f,s,z,R,w,t1,t2,dk;
+  int k,ix,i,j;
+
+  union {float f; int i; }u;//GET_FLOAT_WORD(ix,x);
+  u.f = x; ix = u.i;
+
+  k=0;
+  if (ix < 0x00800000) {           /** x < 2**-126  */
+      if ((ix&0x7fffffff)==0)
+      return -two25/(x-x);        /** log(+-0)=-inf */
+
+      if (ix<0) return (x-x)/(x-x);    /** log(-#) = NaN */
+      return -INFINITY;  /** Gen does not support subnormal number now */
+      //k -= 25; x *= two25; /** subnormal number, scale up x */
+      //u.f = x; ix = u.i; //GET_FLOAT_WORD(ix,x);
+  }
+
+  if (ix >= 0x7f800000) return x+x;
+
+  k += (ix>>23)-127;
+  ix &= 0x007fffff;
+  i = (ix+(0x95f64<<3))&0x800000;
+
+  u.i = ix|(i^0x3f800000); x = u.f;//SET_FLOAT_WORD(x,ix|(i^0x3f800000));    /** normalize x or x/2 */
+  k += (i>>23);
+  dk = (float)k;
+  f = x-(float)1.0;
+
+  if((0x007fffff&(15+ix))<16) {    /** |f| < 2**-20 */
+      if(f==zero) return dk;
+
+      R = f*f*((float)0.5-(float)0.33333333333333333*f);
+      return dk-(R-f)/ln2;
+  }
+
+  s = f/((float)2.0+f);
+  z = s*s;
+  i = ix-(0x6147a<<3);
+  w = z*z;
+  j = (0x6b851<<3)-ix;
+  t1= w*(Lg2+w*(Lg4+w*Lg6));
+  t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
+  i |= j;
+  R = t2+t1;
+
+  if(i>0) {
+      hfsq=(float)0.5*f*f;
+      return dk-((hfsq-(s*(hfsq+R)))-f)/ln2;
+  } else {
+      return dk-((s*(f-R))-f)/ln2;
+  }
+}
+
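+/* scalbnf(x, n) = x * 2^n, done by adjusting the exponent field directly;
+ * subnormal inputs are pre-scaled by 2^25 and subnormal results come from a
+ * final multiply by 2^-25. */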
+INLINE float __gen_ocl_scalbnf (float x, int n){
+  /* copy from fdlibm */
+  float two25 = 3.355443200e+07,	/* 0x4c000000 */
+  twom25 = 2.9802322388e-08,	        /* 0x33000000 */
+  huge = 1.0e+30,
+  tiny = 1.0e-30;
+  int k,ix;
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  k = (ix&0x7f800000)>>23; /* extract exponent */
+  if (k==0) {	/* 0 or subnormal x */
+    if ((ix&0x7fffffff)==0) return x; /* +-0 */
+    x *= two25;
+    GEN_OCL_GET_FLOAT_WORD(ix,x);
+    k = ((ix&0x7f800000)>>23) - 25;
+  }
+  if (k==0xff) return x+x;	/* NaN or Inf */
+  if (n< -50000)
+    return tiny*__gen_ocl_internal_copysign(tiny,x);	/*underflow*/
+  if (n> 50000 || k+n > 0xfe)
+    return huge*__gen_ocl_internal_copysign(huge,x); /* overflow  */
+  /* Now that k and n are bounded, we know that k = k+n does not overflow. */
+  k = k+n;
+  if (k > 0) { /* normal result */
+    GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
+    return x;
+  }
+  if (k <= -25)
+    return tiny*__gen_ocl_internal_copysign(tiny,x);	/*underflow*/
+  k += 25;				/* subnormal result */
+  GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
+  return x*twom25;
+}
+
+
+
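+/* pi/2 split into float pieces of decreasing magnitude with short mantissas;
+ * __kernel_rem_pio2f multiplies them against the quotient chunks to rebuild
+ * the reduced argument. */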
+__constant const float PIo2[] = {
+  1.5703125000e+00, /* 0x3fc90000 */
+  4.5776367188e-04, /* 0x39f00000 */
+  2.5987625122e-05, /* 0x37da0000 */
+  7.5437128544e-08, /* 0x33a20000 */
+  6.0026650317e-11, /* 0x2e840000 */
+  7.3896444519e-13, /* 0x2b500000 */
+  5.3845816694e-15, /* 0x27c20000 */
+  5.6378512969e-18, /* 0x22d00000 */
+  8.3009228831e-20, /* 0x1fc40000 */
+  3.2756352257e-22, /* 0x1bc60000 */
+  6.3331015649e-25, /* 0x17440000 */
+};
+
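+/* Payne-Hanek style reduction from fdlibm: given the input broken into the
+ * float chunks x[0..nx-1] with exponent e0 and the bits of 2/pi in ipio2[],
+ * return n mod 8 where n is the nearest integer to x*2/pi, and store the
+ * remainder x - n*pi/2 in y[] (one to three floats depending on prec). */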
+INLINE int __kernel_rem_pio2f(float *x, float *y, int e0, int nx, int prec, const __constant int *ipio2)
+{
+  /* copied from fdlibm */
+const float
+zero   = 0.0,
+one    = 1.0,
+two8   =  2.5600000000e+02, /* 0x43800000 */
+twon8  =  3.9062500000e-03; /* 0x3b800000 */
+
+  int init_jk[3]; /* initial value for jk */
+  int jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
+  float z,fw,f[20],fq[20],q[20];
+  init_jk[0] = 4; init_jk[1] = 7; init_jk[2] = 9;
+    /* initialize jk*/
+  jk = init_jk[prec];
+  jp = jk;
+
+    /* determine jx,jv,q0, note that 3>q0 */
+  jx =  nx-1;
+  jv = (e0-3)/8; if(jv<0) jv=0;
+  q0 =  e0-8*(jv+1);
+
+    /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
+  j = jv-jx; m = jx+jk;
+  for(i=0;i<=m;i++,j++) f[i] = (j<0)? zero : (float) ipio2[j];
+
+    /* compute q[0],q[1],...q[jk] */
+  for (i=0;i<=jk;i++) {
+      for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
+  }
+
+  jz = jk;
+recompute:
+    /* distill q[] into iq[] in reverse order */
+  for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
+      fw    =  (float)((int)(twon8* z));
+      iq[i] =  (int)(z-two8*fw);
+      z     =  q[j-1]+fw;
+  }
+
+    /* compute n */
+  z  = __gen_ocl_scalbnf(z,q0);   /* actual value of z */
+  z -= (float)8.0*__gen_ocl_internal_floor(z*(float)0.125); /* trim off integer >= 8 */
+  n  = (int) z;
+  z -= (float)n;
+  ih = 0;
+  if(q0>0) {  /* need iq[jz-1] to determine n */
+      i  = (iq[jz-1]>>(8-q0)); n += i;
+      iq[jz-1] -= i<<(8-q0);
+      ih = iq[jz-1]>>(7-q0);
+  }
+  else if(q0==0) ih = iq[jz-1]>>8;
+  else if(z>=(float)0.5) ih=2;
+
+  if(ih>0) {  /* q > 0.5 */
+      n += 1; carry = 0;
+      for(i=0;i<jz ;i++) {  /* compute 1-q */
+    j = iq[i];
+    if(carry==0) {
+        if(j!=0) {
+      carry = 1; iq[i] = 0x100- j;
+        }
+    } else  iq[i] = 0xff - j;
+      }
+      if(q0>0) {    /* rare case: chance is 1 in 12 */
+          switch(q0) {
+          case 1:
+           iq[jz-1] &= 0x7f; break;
+        case 2:
+           iq[jz-1] &= 0x3f; break;
+          }
+      }
+      if(ih==2) {
+    z = one - z;
+    if(carry!=0) z -= __gen_ocl_scalbnf(one,q0);
+      }
+  }
+
+    /* check if recomputation is needed */
+  if(z==zero) {
+      j = 0;
+      for (i=jz-1;i>=jk;i--) j |= iq[i];
+      if(j==0) { /* need recomputation */
+    for(k=1;iq[jk-k]==0;k++);   /* k = no. of terms needed */
+
+    for(i=jz+1;i<=jz+k;i++) {   /* add q[jz+1] to q[jz+k] */
+        f[jx+i] = (float) ipio2[jv+i];
+        for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
+        q[i] = fw;
+    }
+    jz += k;
+    goto recompute;
+      }
+  }
+
+    /* chop off zero terms */
+  if(z==(float)0.0) {
+      jz -= 1; q0 -= 8;
+      while(iq[jz]==0) { jz--; q0-=8;}
+  } else { /* break z into 8-bit if necessary */
+      z = __gen_ocl_scalbnf(z,-q0);
+      if(z>=two8) {
+    fw = (float)((int)(twon8*z));
+    iq[jz] = (int)(z-two8*fw);
+    jz += 1; q0 += 8;
+    iq[jz] = (int) fw;
+      } else iq[jz] = (int) z ;
+  }
+
+    /* convert integer "bit" chunk to floating-point value */
+  fw = __gen_ocl_scalbnf(one,q0);
+  for(i=jz;i>=0;i--) {
+      q[i] = fw*(float)iq[i]; fw*=twon8;
+  }
+
+    /* compute PIo2[0,...,jp]*q[jz,...,0] */
+  for(i=jz;i>=0;i--) {
+      for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
+      fq[jz-i] = fw;
+  }
+
+    /* compress fq[] into y[] */
+  switch(prec) {
+      case 0:
+    fw = 0.0;
+    for (i=jz;i>=0;i--) fw += fq[i];
+    y[0] = (ih==0)? fw: -fw;
+    break;
+      case 1:
+      case 2:
+    fw = 0.0;
+    for (i=jz;i>=0;i--) fw += fq[i];
+    y[0] = (ih==0)? fw: -fw;
+    fw = fq[0]-fw;
+    for (i=1;i<=jz;i++) fw += fq[i];
+    y[1] = (ih==0)? fw: -fw;
+    break;
+      case 3: /* painful */
+    for (i=jz;i>0;i--) {
+        fw      = fq[i-1]+fq[i];
+        fq[i]  += fq[i-1]-fw;
+        fq[i-1] = fw;
+    }
+    for (i=jz;i>1;i--) {
+        fw      = fq[i-1]+fq[i];
+        fq[i]  += fq[i-1]-fw;
+        fq[i-1] = fw;
+    }
+    for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
+    if(ih==0) {
+        y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
+    } else {
+        y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
+    }
+  }
+  return n&7;
+
+}
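+/* High-order bits of n*pi/2 for n = 1..32, low byte cleared; compared against
+ * ix & 0xffffff00 in __ieee754_rem_pio2f as a quick cancellation check. */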
+__constant const int npio2_hw[32] = {
+0x3fc90f00, 0x40490f00, 0x4096cb00, 0x40c90f00, 0x40fb5300, 0x4116cb00,
+0x412fed00, 0x41490f00, 0x41623100, 0x417b5300, 0x418a3a00, 0x4196cb00,
+0x41a35c00, 0x41afed00, 0x41bc7e00, 0x41c90f00, 0x41d5a000, 0x41e23100,
+0x41eec200, 0x41fb5300, 0x4203f200, 0x420a3a00, 0x42108300, 0x4216cb00,
+0x421d1400, 0x42235c00, 0x4229a500, 0x422fed00, 0x42363600, 0x423c7e00,
+0x4242c700, 0x42490f00
+};
+
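+/* Leading bits of 2/pi (hex 0.A2F9836E...) stored one byte per element and
+ * consumed 8 bits at a time by __kernel_rem_pio2f. */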
+__constant const int two_over_pi[22*9] = {
+0xA2, 0xF9, 0x83, 0x6E, 0x4E, 0x44, 0x15, 0x29, 0xFC,
+0x27, 0x57, 0xD1, 0xF5, 0x34, 0xDD, 0xC0, 0xDB, 0x62,
+0x95, 0x99, 0x3C, 0x43, 0x90, 0x41, 0xFE, 0x51, 0x63,
+0xAB, 0xDE, 0xBB, 0xC5, 0x61, 0xB7, 0x24, 0x6E, 0x3A,
+0x42, 0x4D, 0xD2, 0xE0, 0x06, 0x49, 0x2E, 0xEA, 0x09,
+0xD1, 0x92, 0x1C, 0xFE, 0x1D, 0xEB, 0x1C, 0xB1, 0x29,
+0xA7, 0x3E, 0xE8, 0x82, 0x35, 0xF5, 0x2E, 0xBB, 0x44,
+0x84, 0xE9, 0x9C, 0x70, 0x26, 0xB4, 0x5F, 0x7E, 0x41,
+0x39, 0x91, 0xD6, 0x39, 0x83, 0x53, 0x39, 0xF4, 0x9C,
+0x84, 0x5F, 0x8B, 0xBD, 0xF9, 0x28, 0x3B, 0x1F, 0xF8,
+0x97, 0xFF, 0xDE, 0x05, 0x98, 0x0F, 0xEF, 0x2F, 0x11,
+0x8B, 0x5A, 0x0A, 0x6D, 0x1F, 0x6D, 0x36, 0x7E, 0xCF,
+0x27, 0xCB, 0x09, 0xB7, 0x4F, 0x46, 0x3F, 0x66, 0x9E,
+0x5F, 0xEA, 0x2D, 0x75, 0x27, 0xBA, 0xC7, 0xEB, 0xE5,
+0xF1, 0x7B, 0x3D, 0x07, 0x39, 0xF7, 0x8A, 0x52, 0x92,
+0xEA, 0x6B, 0xFB, 0x5F, 0xB1, 0x1F, 0x8D, 0x5D, 0x08,
+0x56, 0x03, 0x30, 0x46, 0xFC, 0x7B, 0x6B, 0xAB, 0xF0,
+0xCF, 0xBC, 0x20, 0x9A, 0xF4, 0x36, 0x1D, 0xA9, 0xE3,
+0x91, 0x61, 0x5E, 0xE6, 0x1B, 0x08, 0x65, 0x99, 0x85,
+0x5F, 0x14, 0xA0, 0x68, 0x40, 0x8D, 0xFF, 0xD8, 0x80,
+0x4D, 0x73, 0x27, 0x31, 0x06, 0x06, 0x15, 0x56, 0xCA,
+0x73, 0xA8, 0xC9, 0x60, 0xE2, 0x7B, 0xC0, 0x8C, 0x6B,
+};
+
+
+
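+/* Reduce x to y[0] + y[1] with |y[0]| <= pi/4 and return the signed multiple
+ * n with x ~= n*pi/2 + y[0] + y[1]: small arguments pass through, medium ones
+ * use the cached pio2_1/2/3 constants, huge ones fall back to
+ * __kernel_rem_pio2f. */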
+INLINE int __ieee754_rem_pio2f(float x, float *y) {
+  /* copied from fdlibm */
+  float z,w,t,r,fn;
+  float tx[3];
+
+const float half_value = 5.0000000e-1;
+const float zero =  0.0000000000;
+const float two8 =  2.5600000000e+02;
+const float invpio2 =  6.3661980629e-01;
+const float pio2_1  =  1.5707855225e+00;
+const float pio2_1t =  1.0804334124e-05;
+const float pio2_2  =  1.0804273188e-05;
+const float pio2_2t =  6.0770999344e-11;
+const float pio2_3  =  6.0770943833e-11;
+const float pio2_3t =  6.1232342629e-17;
+  int e0,i,j,nx,n,ix,hx;
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff;
+  if(ix<=0x3f490fd8)   /* |x| ~<= pi/4 , no need for reduction */
+      {y[0] = x; y[1] = 0; return 0;}
+  if(ix<0x4016cbe4) {  /* |x| < 3pi/4, special case with n=+-1 */
+      if(hx>0) {
+    z = x - pio2_1;
+    if((ix&0xfffffff0)!=0x3fc90fd0) { /* 24+24 bit pi OK */
+        y[0] = z - pio2_1t;
+        y[1] = (z-y[0])-pio2_1t;
+    } else {    /* near pi/2, use 24+24+24 bit pi */
+        z -= pio2_2;
+        y[0] = z - pio2_2t;
+        y[1] = (z-y[0])-pio2_2t;
+    }
+    return 1;
+      } else {  /* negative x */
+    z = x + pio2_1;
+    if((ix&0xfffffff0)!=0x3fc90fd0) { /* 24+24 bit pi OK */
+        y[0] = z + pio2_1t;
+        y[1] = (z-y[0])+pio2_1t;
+    } else {    /* near pi/2, use 24+24+24 bit pi */
+        z += pio2_2;
+        y[0] = z + pio2_2t;
+        y[1] = (z-y[0])+pio2_2t;
+    }
+    return -1;
+      }
+  }
+  if(ix<=0x43490f80) { /* |x| ~<= 2^7*(pi/2), medium size */
+      t  = __gen_ocl_fabs(x);
+      n  = (int) (t*invpio2+half_value);
+      fn = (float)n;
+      r  = t-fn*pio2_1;
+      w  = fn*pio2_1t;  /* 1st round good to 40 bit */
+      if(n<32&&(ix&0xffffff00)!=npio2_hw[n-1]) {
+    y[0] = r-w; /* quick check no cancellation */
+      } else {
+          uint high;
+          j  = ix>>23;
+          y[0] = r-w;
+    GEN_OCL_GET_FLOAT_WORD(high,y[0]);
+          i = j-((high>>23)&0xff);
+          if(i>8) {  /* 2nd iteration needed, good to 57 */
+        t  = r;
+        w  = fn*pio2_2;
+        r  = t-w;
+        w  = fn*pio2_2t-((t-r)-w);
+        y[0] = r-w;
+        GEN_OCL_GET_FLOAT_WORD(high,y[0]);
+        i = j-((high>>23)&0xff);
+        if(i>25)  { /* 3rd iteration needed, 74 bits acc */
+          t  = r; /* will cover all possible cases */
+          w  = fn*pio2_3;
+          r  = t-w;
+          w  = fn*pio2_3t-((t-r)-w);
+          y[0] = r-w;
+        }
+    }
+      }
+      y[1] = (r-y[0])-w;
+      if(hx<0)  {y[0] = -y[0]; y[1] = -y[1]; return -n;}
+      else   return n;
+  }
+    /*
+     * all other (large) arguments
+     */
+  if(ix>=0x7f800000) {    /* x is inf or NaN */
+      y[0]=y[1]=x-x; return 0;
+  }
+    /* set z = scalbn(|x|,ilogb(x)-7) */
+  e0  = (ix>>23)-134;   /* e0 = ilogb(z)-7; */
+  GEN_OCL_SET_FLOAT_WORD(z, ix - ((int)(e0<<23)));
+  for(i=0;i<2;i++) {
+    tx[i] = (float)((int)(z));
+    z     = (z-tx[i])*two8;
+  }
+  tx[2] = z;
+  nx = 3;
+  while(tx[nx-1]==zero) nx--; /* skip zero term */
+  n  =  __kernel_rem_pio2f(tx,y,e0,nx,2,two_over_pi);
+  if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
+  return n;
+}
+
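+/* Sine kernel on roughly [-pi/4, pi/4]: x + y is the double-float reduced
+ * argument (iy == 0 means y is exactly zero) and the result is an odd
+ * polynomial in x of degree 13. */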
+INLINE_OVERLOADABLE float __kernel_sinf(float x, float y, int iy)
+{
+  /* copied from fdlibm */
+const float
+half_value =  5.0000000000e-01,/* 0x3f000000 */
+S1  = -1.6666667163e-01, /* 0xbe2aaaab */
+S2  =  8.3333337680e-03, /* 0x3c088889 */
+S3  = -1.9841270114e-04, /* 0xb9500d01 */
+S4  =  2.7557314297e-06, /* 0x3638ef1b */
+S5  = -2.5050759689e-08, /* 0xb2d72f34 */
+S6  =  1.5896910177e-10; /* 0x2f2ec9d3 */
+  float z,r,v;
+  int ix;
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  ix &= 0x7fffffff;     /* high word of x */
+  if(ix<0x32000000)     /* |x| < 2**-27 */
+     {if((int)x==0) return x;}    /* generate inexact */
+  z =  x*x;
+  v =  z*x;
+  r =  S2+z*(S3+z*(S4+z*(S5+z*S6)));
+  if(iy==0) return x+v*(S1+z*r);
+  else      return x-((z*(half_value*y-v*r)-y)-v*S1);
+}
+
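+/* Cosine kernel: an even polynomial in x on roughly [-pi/4, pi/4]; arguments
+ * just above pi/4 are folded through cos(x) = -sin(x - pi/2). */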
+INLINE  float __kernel_cosf(float x, float y)
+{
+  /* copied from fdlibm */
+  const float
+  one =  1.0000000000e+00, /* 0x3f800000 */
+  C1  =  4.1666667908e-02, /* 0x3d2aaaab */
+  C2  = -1.3888889225e-03, /* 0xbab60b61 */
+  C3  =  2.4801587642e-05, /* 0x37d00d01 */
+  C4  = -2.7557314297e-07, /* 0xb493f27c */
+  C5  =  2.0875723372e-09, /* 0x310f74f6 */
+  C6  = -1.1359647598e-11; /* 0xad47d74e */
+  const float pio2_hi = 0x1.92p0, pio2_mid = 0x1.fb4p-12, pio2_low = 0x1.4442d2p-24;
+  float a,hz,z,r,qx;
+  int ix;
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  ix &= 0x7fffffff;     /* ix = |x|'s high word*/
+  if(ix<0x32000000) {     /* if |x| < 2**-27 */
+      if(((int)x)==0) return one;   /* generate inexact */
+  }
+
+  if(x < 0.0f) { x= -x; y = -y; }
+  if(ix > 0x3f490fdb) { /* |x|>pi/4*/
+    return -__kernel_sinf(x-pio2_hi-pio2_mid-pio2_low, y, 1);
+  }
+  z  = x*x;
+  r  = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
+  if(ix < 0x3e99999a)       /* if |x| < 0.3 */
+      return one - ((float)0.5*z - (z*r - x*y));
+  else {
+      GEN_OCL_SET_FLOAT_WORD(qx,ix-0x01000000); /* x/4 */
+      hz = (float)0.5*z-qx;
+      a  = one-qx;
+      return a - (hz - (z*r-x*y));
+  }
+}
+
+INLINE_OVERLOADABLE  float sin(float x) {
+  /* copied from fdlibm */
+  float y[2],z=0.0;
+  int n, ix;
+
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+
+    /* |x| ~< pi/4 */
+  ix &= 0x7fffffff;
+  if(ix <= 0x3f490fd8) return __kernel_sinf(x,z,0);
+
+    /* sin(Inf or NaN) is NaN */
+  else if (ix>=0x7f800000) return x-x;
+
+    /* argument reduction needed */
+  else {
+      n = __ieee754_rem_pio2f(x,y);
+      switch(n&3) {
+    case 0: return  __kernel_sinf(y[0],y[1],1);
+    case 1: return  __kernel_cosf(y[0],y[1]);
+    case 2: return -__kernel_sinf(y[0],y[1],1);
+    default:
+      return -__kernel_cosf(y[0],y[1]);
+      }
+  }
+}
+INLINE_OVERLOADABLE  float cos(float x) {
+  /* copied from fdlibm */
+  float y[2],z=0.0;
+  int n, ix;
+
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+
+    /* |x| ~< pi/4 */
+  ix &= 0x7fffffff;
+  if(ix <= 0x3f490fd8) return __kernel_cosf(x,z);
+
+    /* cos(Inf or NaN) is NaN */
+  else if (ix>=0x7f800000) return x-x;
+
+    /* argument reduction needed */
+  else {
+      n = __ieee754_rem_pio2f(x,y);
+      switch(n&3) {
+    case 0: return  __kernel_cosf(y[0],y[1]);
+    case 1: return  -__kernel_sinf(y[0],y[1],1);
+    case 2: return -__kernel_cosf(y[0],y[1]);
+    default:
+      return __kernel_sinf(y[0],y[1],1);
+      }
+  }
+}
+
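+/* Tangent kernel on roughly [-pi/4, pi/4]: x + y is the reduced argument;
+ * iy == 1 returns tan(x + y), iy == -1 returns -1/tan(x + y). */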
+INLINE float __kernel_tanf(float x, float y, int iy)
+{
+  /* copied from fdlibm */
+        float z,r,v,w,s;
+        int ix,hx;
+        const float
+        one   =  1.0000000000e+00, /* 0x3f800000 */
+        pio4  =  7.8539812565e-01, /* 0x3f490fda */
+        pio4lo=  3.7748947079e-08; /* 0x33222168 */
+        float T[13]; /* polynomial coefficients for tan, assigned below */
+         T[0] = 3.3333334327e-01; /* 0x3eaaaaab */
+         T[1] = 1.3333334029e-01; /* 0x3e088889 */
+         T[2] = 5.3968254477e-02; /* 0x3d5d0dd1 */
+         T[3] = 2.1869488060e-02; /* 0x3cb327a4 */
+         T[4] = 8.8632395491e-03; /* 0x3c11371f */
+         T[5] = 3.5920790397e-03; /* 0x3b6b6916 */
+         T[6] = 1.4562094584e-03; /* 0x3abede48 */
+         T[7] = 5.8804126456e-04; /* 0x3a1a26c8 */
+         T[8] = 2.4646313977e-04; /* 0x398137b9 */
+         T[9] = 7.8179444245e-05; /* 0x38a3f445 */
+         T[10] = 7.1407252108e-05; /* 0x3895c07a */
+         T[11] = -1.8558637748e-05; /* 0xb79bae5f */
+         T[12] = 2.5907305826e-05; /* 0x37d95384 */
+
+
+        GEN_OCL_GET_FLOAT_WORD(hx,x);
+        ix = hx&0x7fffffff;     /* high word of |x| */
+        if(ix<0x31800000)                       /* x < 2**-28 */
+            {if((int)x==0) {                    /* generate inexact */
+                if((ix|(iy+1))==0) return one/__gen_ocl_fabs(x);
+                else return (iy==1)? x: -one/x;
+            }
+            }
+        if(ix>=0x3f2ca140) {                    /* |x|>=0.6744 */
+            if(hx<0) {x = -x; y = -y;}
+
+
+            z = pio4-x;
+            w = pio4lo-y;
+            x = z+w; y = 0.0;
+        }
+        z       =  x*x;
+        w       =  z*z;
+    /* Break x^5*(T[1]+x^2*T[2]+...) into
+     *    x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
+     *    x^5(x^2*(T[2]+x^4*T[4]+...+x^22*T[12]))
+     */
+        r = T[1]+w*(T[3]+w*(T[5]+w*(T[7]+w*(T[9]+w*T[11]))));
+        v = z*(T[2]+w*(T[4]+w*(T[6]+w*(T[8]+w*(T[10]+w*T[12])))));
+        s = z*x;
+        r = y + z*(s*(r+v)+y);
+        r += T[0]*s;
+        w = x+r;
+        if(ix>=0x3f2ca140) {
+            v = (float)iy;
+            return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r)));
+        }
+        if(iy==1) return w;
+        else {          /* if allow error up to 2 ulp
+                           simply return -1.0/(x+r) here */
+     /*  compute -1.0/(x+r) accurately */
+            float a,t;
+            int i;
+            z  = w;
+            GEN_OCL_GET_FLOAT_WORD(i,z);
+            GEN_OCL_SET_FLOAT_WORD(z,i&0xfffff000);
+            v  = r-(z - x);     /* z+v = r+x */
+            t = a  = -(float)1.0/w;     /* a = -1.0/w */
+            GEN_OCL_GET_FLOAT_WORD(i,t);
+            GEN_OCL_SET_FLOAT_WORD(t,i&0xfffff000);
+            s  = (float)1.0+t*z;
+            return t+a*(s+t*v);
+        }
+}
+
+INLINE_OVERLOADABLE float tan(float x)
+{
+  /* copied from fdlibm */
+        const float pio2_hi = 0x1.92p-0, pio2_mid = 0x1.fb4p-12, pio2_low = 0x1.4442d2p-24;
+        const float pio4  =  7.8539812565e-01;
+        float y[2],z=0.0;
+        int n, ix;
+
+        GEN_OCL_GET_FLOAT_WORD(ix,x);
+
+    /* |x| ~< pi/4 */
+        ix &= 0x7fffffff;
+        if(ix <= 0x3f490fda) return __kernel_tanf(x,z,1);
+
+    /* tan(Inf or NaN) is NaN */
+        else if (ix>=0x7f800000) return x-x;            /* NaN */
+
+    /* argument reduction needed */
+      else {
+        n = __ieee754_rem_pio2f(x,y);
+
+        x = y[0];
+        float m = y[1];
+        int iy = 1-((n&1)<<1);
+        GEN_OCL_GET_FLOAT_WORD(ix,x);
+        float sign = 1.0f;
+        if(ix < 0) {
+          x = -x; m = -m;
+          sign = -1.0f;
+        }
+
+        if(x > pio4) {/* reduce x to less than pi/4 through (pi/2-x) */
+          float t = __kernel_tanf(pio2_hi-x+pio2_mid+pio2_low, -m, 1);
+          if(iy == -1) return sign*(-t); else return sign*1/t;
+        } else
+            return __kernel_tanf(y[0],y[1],1-((n&1)<<1)); /*   1 -- n even
+                                                              -1 -- n odd */
+      }
+}
+
+INLINE_OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
+  int ix;
+  if(isinf(x) || isnan(x)) { return NAN; }
+  if(x < 0.0f) { x = -x; }
+  GEN_OCL_GET_FLOAT_WORD(ix, x);
+  if(x> 0x1.0p24) return 1.0f;
+  float m = __gen_ocl_internal_floor(x);
+  ix = (int)m;
+  m = x-m;
+  if((ix&0x1) != 0) m+=1.0f;
+    ix = __gen_ocl_internal_floor(m*4.0f);
+
+  switch(ix) {
+   case 0:
+    return __kernel_cosf(m*M_PI_F, 0.0f);
+   case 1:
+   case 2:
+    return __kernel_sinf((0.5f-m)*M_PI_F, 0.0f, 0);
+   case 3:
+   case 4:
+    return -__kernel_cosf((m-1.0f)*M_PI_F, 0.0f);
+   case 5:
+   case 6:
+    return __kernel_sinf((m-1.5f)*M_PI_F, 0.0f, 0);
+   default:
+    return __kernel_cosf((2.0f-m)*M_PI_F, 0.0f);
+   }
+}
+INLINE_OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
+  float sign = 1.0f;
+  int ix;
+  if(isinf(x)) return NAN;
+  if(x < 0.0f) { x = -x; sign = -1.0f; }
+  GEN_OCL_GET_FLOAT_WORD(ix, x);
+  if(x> 0x1.0p24) return 0.0f;
+  float m = __gen_ocl_internal_floor(x);
+  ix = (int)m;
+  m = x-m;
+  if((ix&0x1) != 0) m+=1.0f;
+    ix = __gen_ocl_internal_floor(m*4.0f);
+
+  switch(ix) {
+   case 0:
+    return sign*__kernel_sinf(m*M_PI_F, 0.0f, 0);
+   case 1:
+   case 2:
+    return sign*__kernel_cosf((m-0.5f)*M_PI_F, 0.0f);
+   case 3:
+   case 4:
+    return -sign*__kernel_sinf((m-1.0f)*M_PI_F, 0.0f, 0);
+   case 5:
+   case 6:
+    return -sign*__kernel_cosf((m-1.5f)*M_PI_F, 0.0f);
+   default:
+    return -sign*__kernel_sinf((2.0f-m)*M_PI_F, 0.0f, 0);
+   }
+
+}
+INLINE_OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
+INLINE_OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
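+/* __gen_ocl_log is the hardware base-2 logarithm: native_log rescales by
+ * ln(2) and native_log10 (further below) by log10(2). */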
+INLINE_OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
+INLINE_OVERLOADABLE float native_log(float x) {
+  return native_log2(x) * 0.6931472002f;
+}
+INLINE_OVERLOADABLE float tgamma(float x) {
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  float pi = 3.1415927410e+00,
+    a0 = 7.7215664089e-02,
+    a1 = 3.2246702909e-01,
+    a2 = 6.7352302372e-02,
+    a3 = 2.0580807701e-02,
+    a4 = 7.3855509982e-03,
+    a5 = 2.8905137442e-03,
+    a6 = 1.1927076848e-03,
+    a7 = 5.1006977446e-04,
+    a8 = 2.2086278477e-04,
+    a9 = 1.0801156895e-04,
+    a10 = 2.5214456400e-05,
+    a11 = 4.4864096708e-05,
+    tc = 1.4616321325e+00,
+    tf = -1.2148628384e-01,
+    tt = 6.6971006518e-09,
+    t0 = 4.8383611441e-01,
+    t1 = -1.4758771658e-01,
+    t2 = 6.4624942839e-02,
+    t3 = -3.2788541168e-02,
+    t4 = 1.7970675603e-02,
+    t5 = -1.0314224288e-02,
+    t6 = 6.1005386524e-03,
+    t7 = -3.6845202558e-03,
+    t8 = 2.2596477065e-03,
+    t9 = -1.4034647029e-03,
+    t10 = 8.8108185446e-04,
+    t11 = -5.3859531181e-04,
+    t12 = 3.1563205994e-04,
+    t13 = -3.1275415677e-04,
+    t14 = 3.3552918467e-04,
+    u0 = -7.7215664089e-02,
+    u1 = 6.3282704353e-01,
+    u2 = 1.4549225569e+00,
+    u3 = 9.7771751881e-01,
+    u4 = 2.2896373272e-01,
+    u5 = 1.3381091878e-02,
+    v1 = 2.4559779167e+00,
+    v2 = 2.1284897327e+00,
+    v3 = 7.6928514242e-01,
+    v4 = 1.0422264785e-01,
+    v5 = 3.2170924824e-03,
+    s0 = -7.7215664089e-02,
+    s1 = 2.1498242021e-01,
+    s2 = 3.2577878237e-01,
+    s3 = 1.4635047317e-01,
+    s4 = 2.6642270386e-02,
+    s5 = 1.8402845599e-03,
+    s6 = 3.1947532989e-05,
+    r1 = 1.3920053244e+00,
+    r2 = 7.2193557024e-01,
+    r3 = 1.7193385959e-01,
+    r4 = 1.8645919859e-02,
+    r5 = 7.7794247773e-04,
+    r6 = 7.3266842264e-06,
+    w0 = 4.1893854737e-01,
+    w1 = 8.3333335817e-02,
+    w2 = -2.7777778450e-03,
+    w3 = 7.9365057172e-04,
+    w4 = -5.9518753551e-04,
+    w5 = 8.3633989561e-04,
+    w6 = -1.6309292987e-03;
+  float t, y, z, nadj, p, p1, p2, p3, q, r, w;
+  int i, hx, ix;
+  nadj = 0;
+  hx = *(int *) (&x);
+  ix = hx & 0x7fffffff;
+  if (ix >= 0x7f800000)
+    return x * x;
+  if (ix == 0)
+    return INFINITY;
+  if (ix < 0x1c800000) {
+    if (hx < 0) {
+      return - native_log(-x);
+    } else
+      return - native_log(x);
+  }
+  if (hx < 0) {
+    if (ix >= 0x4b000000)
+      return INFINITY;
+    t = __gen_ocl_internal_sinpi(x);
+    if (__gen_ocl_fabs(t) < 1e-8f)
+      return INFINITY;
+    nadj = native_log(M_PI_F / __gen_ocl_fabs(t * x));
+    x = -x;
+  }
+
+  if (ix == 0x3f800000 || ix == 0x40000000)
+    r = 0;
+  else if (ix < 0x40000000) {
+    if (ix <= 0x3f666666) {
+      r = - native_log(x);
+      if (ix >= 0x3f3b4a20) {
+        y = 1 - x;
+        i = 0;
+      } else if (ix >= 0x3e6d3308) {
+        y = x - (tc - 1);
+        i = 1;
+      } else {
+        y = x;
+        i = 2;
+      }
+    } else {
+      r = 0;
+      if (ix >= 0x3fdda618) {
+        y = 2 - x;
+        i = 0;
+      } else if (ix >= 0x3F9da620) {
+        y = x - tc;
+        i = 1;
+      } else {
+        y = x - 1;
+        i = 2;
+      }
+    }
+    switch (i) {
+    case 0:
+      z = y * y;
+      p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
+      p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
+      p = y * p1 + p2;
+      r += (p - .5f * y);
+      break;
+    case 1:
+      z = y * y;
+      w = z * y;
+      p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12)));
+      p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
+      p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
+      p = z * p1 - (tt - w * (p2 + y * p3));
+      r += (tf + p);
+      break;
+    case 2:
+      p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
+      p2 = 1 + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
+      r += (-.5f * y + p1 / p2);
+    }
+  } else if (ix < 0x41000000) {
+    i = x;
+    t = 0;
+    y = x - i;
+    p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6))))));
+    q = 1 + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));
+    r = .5f * y + p / q;
+    z = 1;
+    switch (i) {
+    case 7:
+      z *= (y + 6.f);
+    case 6:
+      z *= (y + 5.f);
+    case 5:
+      z *= (y + 4.f);
+    case 4:
+      z *= (y + 3.f);
+    case 3:
+      z *= (y + 2.f);
+      r += native_log(z);
+      break;
+    }
+  } else if (ix < 0x5c800000) {
+    t = native_log(x);
+    z = 1 / x;
+    y = z * z;
+    w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
+    r = (x - .5f) * (t - 1) + w;
+  } else
+    r = x * (native_log(x) - 1);
+  if (hx < 0)
+    r = nadj - r;
+  return r;
+}
+
+INLINE_OVERLOADABLE float lgamma(float x) {
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+    const float
+        zero=  0.,
+        one =  1.0000000000e+00,
+        pi  =  3.1415927410e+00,
+        a0  =  7.7215664089e-02,
+        a1  =  3.2246702909e-01,
+        a2  =  6.7352302372e-02,
+        a3  =  2.0580807701e-02,
+        a4  =  7.3855509982e-03,
+        a5  =  2.8905137442e-03,
+        a6  =  1.1927076848e-03,
+        a7  =  5.1006977446e-04,
+        a8  =  2.2086278477e-04,
+        a9  =  1.0801156895e-04,
+        a10 =  2.5214456400e-05,
+        a11 =  4.4864096708e-05,
+        tc  =  1.4616321325e+00,
+        tf  = -1.2148628384e-01,
+        tt  =  6.6971006518e-09,
+        t0  =  4.8383611441e-01,
+        t1  = -1.4758771658e-01,
+        t2  =  6.4624942839e-02,
+        t3  = -3.2788541168e-02,
+        t4  =  1.7970675603e-02,
+        t5  = -1.0314224288e-02,
+        t6  =  6.1005386524e-03,
+        t7  = -3.6845202558e-03,
+        t8  =  2.2596477065e-03,
+        t9  = -1.4034647029e-03,
+        t10 =  8.8108185446e-04,
+        t11 = -5.3859531181e-04,
+        t12 =  3.1563205994e-04,
+        t13 = -3.1275415677e-04,
+        t14 =  3.3552918467e-04,
+        u0  = -7.7215664089e-02,
+        u1  =  6.3282704353e-01,
+        u2  =  1.4549225569e+00,
+        u3  =  9.7771751881e-01,
+        u4  =  2.2896373272e-01,
+        u5  =  1.3381091878e-02,
+        v1  =  2.4559779167e+00,
+        v2  =  2.1284897327e+00,
+        v3  =  7.6928514242e-01,
+        v4  =  1.0422264785e-01,
+        v5  =  3.2170924824e-03,
+        s0  = -7.7215664089e-02,
+        s1  =  2.1498242021e-01,
+        s2  =  3.2577878237e-01,
+        s3  =  1.4635047317e-01,
+        s4  =  2.6642270386e-02,
+        s5  =  1.8402845599e-03,
+        s6  =  3.1947532989e-05,
+        r1  =  1.3920053244e+00,
+        r2  =  7.2193557024e-01,
+        r3  =  1.7193385959e-01,
+        r4  =  1.8645919859e-02,
+        r5  =  7.7794247773e-04,
+        r6  =  7.3266842264e-06,
+        w0  =  4.1893854737e-01,
+        w1  =  8.3333335817e-02,
+        w2  = -2.7777778450e-03,
+        w3  =  7.9365057172e-04,
+        w4  = -5.9518753551e-04,
+        w5  =  8.3633989561e-04,
+        w6  = -1.6309292987e-03;
+	float t, y, z, nadj, p, p1, p2, p3, q, r, w;
+	int i, hx, ix;
+	nadj = 0;
+	hx = *(int *)&x;
+	ix = hx & 0x7fffffff;
+	if (ix >= 0x7f800000)
+		return x * x;
+	if (ix == 0)
+		return ((x + one) / zero);
+	if (ix < 0x1c800000) {
+		if (hx < 0) {
+			return -native_log(-x);
+		} else
+			return -native_log(x);
+	}
+	if (hx < 0) {
+		if (ix >= 0x4b000000)
+			return ((-x) / zero);
+		t = __gen_ocl_internal_sinpi(x);
+		if (t == zero)
+			return ((-x) / zero);
+		nadj = native_log(pi / __gen_ocl_fabs(t * x));
+		x = -x;
+	}
+	if (ix == 0x3f800000 || ix == 0x40000000)
+		r = 0;
+	else if (ix < 0x40000000) {
+		if (ix <= 0x3f666666) {
+			r = -native_log(x);
+			if (ix >= 0x3f3b4a20) {
+				y = one - x;
+				i = 0;
+			} else if (ix >= 0x3e6d3308) {
+				y = x - (tc - one);
+				i = 1;
+			} else {
+				y = x;
+				i = 2;
+			}
+		} else {
+			r = zero;
+			if (ix >= 0x3fdda618) {
+				y = (float) 2.0 - x;
+				i = 0;
+			}
+			else if (ix >= 0x3F9da620) {
+				y = x - tc;
+				i = 1;
+			}
+			else {
+				y = x - one;
+				i = 2;
+			}
+		}
+		switch (i) {
+		case 0:
+			z = y * y;
+			p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
+			p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
+			p = y * p1 + p2;
+			r += (p - (float) 0.5 * y);
+			break;
+		case 1:
+			z = y * y;
+			w = z * y;
+			p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12)));
+			p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
+			p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
+			p = z * p1 - (tt - w * (p2 + y * p3));
+			r += (tf + p);
+			break;
+		case 2:
+			p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
+			p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
+			r += (-(float) 0.5 * y + p1 / p2);
+		}
+	} else if (ix < 0x41000000) {
+		i = (int) x;
+		t = zero;
+		y = x - (float) i;
+		p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6))))));
+		q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));
+		r = .5f * y + p / q;
+		z = one;
+		switch (i) {
+		case 7:
+			z *= (y + (float) 6.0);
+		case 6:
+			z *= (y + (float) 5.0);
+		case 5:
+			z *= (y + (float) 4.0);
+		case 4:
+			z *= (y + (float) 3.0);
+		case 3:
+			z *= (y + (float) 2.0);
+			r += native_log(z);
+			break;
+		}
+
+	} else if (ix < 0x5c800000) {
+		t = native_log(x);
+		z = one / x;
+		y = z * z;
+		w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
+		r = (x - .5f) * (t - one) + w;
+	} else
+		r = x * (native_log(x) - one);
+	if (hx < 0)
+		r = nadj - r;
+	return r;
+}
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
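+/* lgamma_r: the same evaluation as lgamma above, with the sign of gamma(x)
+ * tracked through *signgamp; BODY is expanded once per address space below. */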
+#define BODY \
+    const float  \
+        zero=  0.,  \
+        one =  1.0000000000e+00,  \
+        pi  =  3.1415927410e+00,  \
+        a0  =  7.7215664089e-02,  \
+        a1  =  3.2246702909e-01,  \
+        a2  =  6.7352302372e-02,  \
+        a3  =  2.0580807701e-02,  \
+        a4  =  7.3855509982e-03,  \
+        a5  =  2.8905137442e-03,  \
+        a6  =  1.1927076848e-03,  \
+        a7  =  5.1006977446e-04,  \
+        a8  =  2.2086278477e-04,  \
+        a9  =  1.0801156895e-04,  \
+        a10 =  2.5214456400e-05,  \
+        a11 =  4.4864096708e-05,  \
+        tc  =  1.4616321325e+00,  \
+        tf  = -1.2148628384e-01,  \
+        tt  =  6.6971006518e-09,  \
+        t0  =  4.8383611441e-01,  \
+        t1  = -1.4758771658e-01,  \
+        t2  =  6.4624942839e-02,  \
+        t3  = -3.2788541168e-02,  \
+        t4  =  1.7970675603e-02,  \
+        t5  = -1.0314224288e-02,  \
+        t6  =  6.1005386524e-03,  \
+        t7  = -3.6845202558e-03,  \
+        t8  =  2.2596477065e-03,  \
+        t9  = -1.4034647029e-03,  \
+        t10 =  8.8108185446e-04,  \
+        t11 = -5.3859531181e-04,  \
+        t12 =  3.1563205994e-04,  \
+        t13 = -3.1275415677e-04,  \
+        t14 =  3.3552918467e-04,  \
+        u0  = -7.7215664089e-02,  \
+        u1  =  6.3282704353e-01,  \
+        u2  =  1.4549225569e+00,  \
+        u3  =  9.7771751881e-01,  \
+        u4  =  2.2896373272e-01,  \
+        u5  =  1.3381091878e-02,  \
+        v1  =  2.4559779167e+00,  \
+        v2  =  2.1284897327e+00,  \
+        v3  =  7.6928514242e-01,  \
+        v4  =  1.0422264785e-01,  \
+        v5  =  3.2170924824e-03,  \
+        s0  = -7.7215664089e-02,  \
+        s1  =  2.1498242021e-01,  \
+        s2  =  3.2577878237e-01,  \
+        s3  =  1.4635047317e-01,  \
+        s4  =  2.6642270386e-02,  \
+        s5  =  1.8402845599e-03,  \
+        s6  =  3.1947532989e-05,  \
+        r1  =  1.3920053244e+00,  \
+        r2  =  7.2193557024e-01,  \
+        r3  =  1.7193385959e-01,  \
+        r4  =  1.8645919859e-02,  \
+        r5  =  7.7794247773e-04,  \
+        r6  =  7.3266842264e-06,  \
+        w0  =  4.1893854737e-01,  \
+        w1  =  8.3333335817e-02,  \
+        w2  = -2.7777778450e-03,  \
+        w3  =  7.9365057172e-04,  \
+        w4  = -5.9518753551e-04,  \
+        w5  =  8.3633989561e-04,  \
+        w6  = -1.6309292987e-03;  \
+	float t, y, z, nadj, p, p1, p2, p3, q, r, w;  \
+	int i, hx, ix;  \
+	nadj = 0;  \
+	hx = *(int *)&x;  \
+	*signgamp = 1;  \
+	ix = hx & 0x7fffffff;  \
+	if (ix >= 0x7f800000)  \
+		return x * x;  \
+	if (ix == 0)  \
+		return ((x + one) / zero);  \
+	if (ix < 0x1c800000) {  \
+		if (hx < 0) {  \
+			*signgamp = -1;  \
+			return -native_log(-x);  \
+		} else  \
+			return -native_log(x);  \
+	}  \
+	if (hx < 0) {  \
+		if (ix >= 0x4b000000)  \
+			return ((-x) / zero);  \
+		t = __gen_ocl_internal_sinpi(x);  \
+		if (t == zero)  \
+			return ((-x) / zero);  \
+		nadj = native_log(pi / __gen_ocl_fabs(t * x));  \
+		if (t < zero)  \
+			*signgamp = -1;  \
+		x = -x;  \
+	}  \
+	if (ix == 0x3f800000 || ix == 0x40000000)  \
+		r = 0;  \
+	else if (ix < 0x40000000) {  \
+		if (ix <= 0x3f666666) {  \
+			r = -native_log(x);  \
+			if (ix >= 0x3f3b4a20) {  \
+				y = one - x;  \
+				i = 0;  \
+			} else if (ix >= 0x3e6d3308) {  \
+				y = x - (tc - one);  \
+				i = 1;  \
+			} else {  \
+				y = x;  \
+				i = 2;  \
+			}  \
+		} else {  \
+			r = zero;  \
+			if (ix >= 0x3fdda618) {  \
+				y = (float) 2.0 - x;  \
+				i = 0;  \
+			}  \
+			else if (ix >= 0x3F9da620) {  \
+				y = x - tc;  \
+				i = 1;  \
+			}  \
+			else {  \
+				y = x - one;  \
+				i = 2;  \
+			}  \
+		}  \
+		switch (i) {  \
+		case 0:  \
+			z = y * y;  \
+			p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));  \
+			p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));  \
+			p = y * p1 + p2;  \
+			r += (p - (float) 0.5 * y);  \
+			break;  \
+		case 1:  \
+			z = y * y;  \
+			w = z * y;  \
+			p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12)));  \
+			p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));  \
+			p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));  \
+			p = z * p1 - (tt - w * (p2 + y * p3));  \
+			r += (tf + p);  \
+			break;  \
+		case 2:  \
+			p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));  \
+			p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));  \
+			r += (-(float) 0.5 * y + p1 / p2);  \
+		}  \
+	} else if (ix < 0x41000000) {  \
+		i = (int) x;  \
+		t = zero;  \
+		y = x - (float) i;  \
+		p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6))))));  \
+		q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));  \
+		r = .5f * y + p / q;  \
+		z = one;  \
+		switch (i) {  \
+		case 7:  \
+			z *= (y + (float) 6.0);  \
+		case 6:  \
+			z *= (y + (float) 5.0);  \
+		case 5:  \
+			z *= (y + (float) 4.0);  \
+		case 4:  \
+			z *= (y + (float) 3.0);  \
+		case 3:  \
+			z *= (y + (float) 2.0);  \
+			r += native_log(z);  \
+			break;  \
+		}  \
+		  \
+	} else if (ix < 0x5c800000) {  \
+		t = native_log(x);  \
+		z = one / x;  \
+		y = z * z;  \
+		w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));  \
+		r = (x - .5f) * (t - one) + w;  \
+	} else  \
+		r = x * (native_log(x) - one);  \
+	if (hx < 0)  \
+		r = nadj - r;  \
+	return r;
+INLINE_OVERLOADABLE float lgamma_r(float x, global int *signgamp) { BODY; }
+INLINE_OVERLOADABLE float lgamma_r(float x, local int *signgamp) { BODY; }
+INLINE_OVERLOADABLE float lgamma_r(float x, private int *signgamp) { BODY; }
+#undef BODY
+
+INLINE_OVERLOADABLE float native_log10(float x) {
+  return native_log2(x) * 0.3010299956f;
+}
+INLINE_OVERLOADABLE float log1p(float x) {
+/*
+ *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  const float
+  ln2_hi =   6.9313812256e-01,  /* 0x3f317180 */
+  ln2_lo =   9.0580006145e-06,  /* 0x3717f7d1 */
+  two25 =    3.355443200e+07, /* 0x4c000000 */
+  Lp1 = 6.6666668653e-01, /* 3F2AAAAB */
+  Lp2 = 4.0000000596e-01, /* 3ECCCCCD */
+  Lp3 = 2.8571429849e-01, /* 3E924925 */
+  Lp4 = 2.2222198546e-01, /* 3E638E29 */
+  Lp5 = 1.8183572590e-01, /* 3E3A3325 */
+  Lp6 = 1.5313838422e-01, /* 3E1CD04F */
+  Lp7 = 1.4798198640e-01; /* 3E178897 */
+  const float zero = 0.0;
+  float hfsq,f,c,s,z,R,u;
+  int k,hx,hu,ax;
+  union {float f; unsigned i;} un;
+  un.f = x;  hx = un.i;
+  ax = hx&0x7fffffff;
+
+  k = 1;
+  if (hx < 0x3ed413d7) {      /* x < 0.41422  */
+      if(ax>=0x3f800000) {    /* x <= -1.0 */
+    if(x==(float)-1.0) return -two25/zero; /* log1p(-1)=-inf */
+    else return (x-x)/(x-x);  /* log1p(x<-1)=NaN */
+      }
+      if(ax<0x31000000) {     /* |x| < 2**-29 */
+    if(two25+x>zero     /* raise inexact */
+              &&ax<0x24800000)    /* |x| < 2**-54 */
+        return x;
+    else
+        return x - x*x*(float)0.5;
+      }
+      if(hx>0||hx<=((int)0xbe95f61f)) {
+    k=0;f=x;hu=1;}  /* -0.2929<x<0.41422 */
+  }
+  if (hx >= 0x7f800000) return x+x;
+  if(k!=0) {
+      if(hx<0x5a000000) {
+    u  = (float)1.0+x;
+
+    un.f = u; hu = un.i;
+          k  = (hu>>23)-127;
+    /* correction term */
+          c  = (k>0)? (float)1.0-(u-x):x-(u-(float)1.0);
+    c /= u;
+      } else {
+    u  = x;
+    un.f = u; hu = un.i;
+          k  = (hu>>23)-127;
+    c  = 0;
+      }
+      hu &= 0x007fffff;
+      if(hu<0x3504f7) {
+          un.i = hu|0x3f800000; u = un.f;/* normalize u */
+      } else {
+          k += 1;
+          un.i = hu|0x3f000000; u = un.f;  /* normalize u/2 */
+          hu = (0x00800000-hu)>>2;
+      }
+      f = u-(float)1.0;
+  }
+  hfsq=(float)0.5*f*f;
+  if(hu==0) { /* |f| < 2**-20 */
+      if(f==zero) { if(k==0) return zero;
+      else {c += k*ln2_lo; return k*ln2_hi+c;} }
+      R = hfsq*((float)1.0-(float)0.66666666666666666*f);
+      if(k==0) return f-R; else
+             return k*ln2_hi-((R-(k*ln2_lo+c))-f);
+  }
+  s = f/((float)2.0+f);
+  z = s*s;
+  R = z*(Lp1+z*(Lp2+z*(Lp3+z*(Lp4+z*(Lp5+z*(Lp6+z*Lp7))))));
+  if(k==0) return f-(hfsq-s*(hfsq+R)); else
+     return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f);
+
+}
+INLINE_OVERLOADABLE float logb(float x) {
+union {float f; unsigned i;} u;
+  u.f = x;
+  int e =  ((u.i & 0x7f800000) >> 23);
+  if(e == 0) {
+    /* subnormal or +/-0 */
+    return -INFINITY;
+  } else if(e == 0xff) {
+    /* inf & nan */
+    return x*x;
+  } else {
+    return (float)(e-127);
+  }
+}
+#define FP_ILOGB0 (-0x7FFFFFFF-1)
+#define FP_ILOGBNAN FP_ILOGB0
+INLINE_OVERLOADABLE int ilogb(float x) {
+  union { int i; float f; } u;
+  if (isnan(x))
+    return FP_ILOGBNAN;
+  if (isinf(x))
+    return 0x7FFFFFFF;
+  u.f = x;
+  u.i &= 0x7fffffff;
+  if (u.i == 0)
+    return FP_ILOGB0;
+  if (u.i >= 0x800000)
+    return (u.i >> 23) - 127;
+  int r = -126;
+  int a = u.i & 0x7FFFFF;
+  while(a < 0x800000) {
+    a <<= 1;
+    r --;
+  }
+  return r;
+}
+INLINE_OVERLOADABLE float nan(uint code) {
+  return NAN;
+}
+INLINE_OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
+INLINE_OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
+INLINE_OVERLOADABLE float native_tan(float x) {
+  return native_sin(x) / native_cos(x);
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
+  float sign = 1.0f;
+  int ix;
+  if(isinf(x)) return NAN;
+  if(x < 0.0f) { x = -x; sign = -1.0f; }
+  GEN_OCL_GET_FLOAT_WORD(ix, x);
+  if(x> 0x1.0p24) return 0.0f;
+  float m = __gen_ocl_internal_floor(x);
+  ix = (int)m;
+  m = x-m;
+  int n = __gen_ocl_internal_floor(m*4.0f);
+  if(m == 0.5f) {
+    return (ix&0x1) == 0 ? sign*INFINITY : sign*-INFINITY;
+  }
+  if(m == 0.0f) {
+    return (ix&0x1) == 0 ? 0.0f : -0.0f;
+  }
+
+  switch(n) {
+    case 0:
+      return sign * __kernel_tanf(m*M_PI_F, 0.0f, 1);
+    case 1:
+      return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
+    case 2:
+      return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
+    default:
+      return sign * -1.0f*__kernel_tanf((1.0f-m)*M_PI_F, 0.0f, 1);
+  }
+}
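+/* __gen_ocl_exp is the hardware base-2 exponential: native_exp rescales the
+ * argument by log2(e) and native_exp10 goes through __gen_ocl_pow. */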
+INLINE_OVERLOADABLE float native_exp2(float x) { return __gen_ocl_exp(x); }
+INLINE_OVERLOADABLE float native_exp(float x) { return __gen_ocl_exp(M_LOG2E_F*x); }
+INLINE_OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
+  /* copied from fdlibm */
+  const unsigned
+  B1 = 709958130, /* B1 = (84+2/3-0.03306235651)*2**23 */
+  B2 = 642849266; /* B2 = (76+2/3-0.03306235651)*2**23 */
+
+  const float
+  C =  5.4285717010e-01, /* 19/35     = 0x3f0af8b0 */
+  D = -7.0530611277e-01, /* -864/1225 = 0xbf348ef1 */
+  E =  1.4142856598e+00, /* 99/70     = 0x3fb50750 */
+  F =  1.6071428061e+00, /* 45/28     = 0x3fcdb6db */
+  G =  3.5714286566e-01; /* 5/14      = 0x3eb6db6e */
+
+  float r,s,t, w;
+  int hx;
+  uint sign;
+  uint high;
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  sign=hx&0x80000000;     /* sign= sign(x) */
+  hx  ^=sign;
+  if(hx>=0x7f800000) return(x+x); /* cbrt(NaN,INF) is itself */
+  if(hx==0)
+      return(x);    /* cbrt(0) is itself */
+
+  GEN_OCL_SET_FLOAT_WORD(x,hx); /* x <- |x| */
+    /* rough cbrt to 5 bits */
+  if(hx<0x00800000)     /* subnormal number */
+    {
+    //SET_FLOAT_WORD(t,0x4b800000); /* set t= 2**24 */
+     //t*=x; GET_FLOAT_WORD(high,t); SET_FLOAT_WORD(t,high/3+B2);
+      /* Gen flushes subnormals to zero; return a zero with the sign of x */
+      t = (sign == 0) ? 0.0f : -0.0f;
+      return t;
+    }
+  else
+    GEN_OCL_SET_FLOAT_WORD(t,hx/3+B1);
+
+
+    /* new cbrt to 23 bits */
+  r=t*t/x;
+  s=C+r*t;
+  t*=G+F/(s+E+D/s);
+    /* one step newton iteration to 53 bits with error less than 0.667 ulps */
+  s=t*t;    /* t*t is exact */
+  r=x/s;
+  w=t+t;
+  r=(r-t)/(w+r);  /* r-s is exact */
+  t=t+t*r;
+
+    /* restore the sign bit */
+  GEN_OCL_GET_FLOAT_WORD(high,t);
+  GEN_OCL_SET_FLOAT_WORD(t,high|sign);
+  return(t);
+}
+
+#define BODY \
+  *cosval = cos(x); \
+  return sin(x);
+INLINE_OVERLOADABLE float sincos(float x, global float *cosval) { BODY; }
+INLINE_OVERLOADABLE float sincos(float x, local float *cosval) { BODY; }
+INLINE_OVERLOADABLE float sincos(float x, private float *cosval) { BODY; }
+#undef BODY
+
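+/* Rational approximation R(x*x) with asin(x) ~= x + x*R(x*x), intended for
+ * |x| <= 0.5; the asin/acos wrappers below fold larger arguments onto this
+ * range with half-angle identities. */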
+INLINE float __gen_ocl_asin_util(float x) {
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  float
+  pS0 =  1.66666666666666657415e-01,
+  pS1 = -3.25565818622400915405e-01,
+  pS2 =  2.01212532134862925881e-01,
+  pS3 = -4.00555345006794114027e-02,
+  pS4 =  7.91534994289814532176e-04,
+  pS5 =  3.47933107596021167570e-05,
+  qS1 = -2.40339491173441421878e+00,
+  qS2 =  2.02094576023350569471e+00,
+  qS3 = -6.88283971605453293030e-01,
+  qS4 =  7.70381505559019352791e-02;
+
+  float t = x*x;
+  float p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*(pS4+t*pS5)))));
+  float q = 1.0+t*(qS1+t*(qS2+t*(qS3+t*qS4)));
+  float w = p / q;
+  return x + x*w;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_asin(float x) {
+  uint ix;
+  union { uint i; float f; } u;
+  u.f = x;
+  ix = u.i & 0x7fffffff;
+  if(ix == 0x3f800000) {
+    return x * M_PI_2_F;  /* asin(|1|)=+-pi/2 with inexact */
+  }
+  if(ix > 0x3f800000) {            /* |x|>= 1 */
+    return  NAN;          /* asin(|x|>1) is NaN */
+  }
+
+  if(ix < 0x32000000) {            /* if |x| < 2**-27 */
+    if(HUGE_VALF + x > FLT_ONE) return x;   /* return x with inexact if x!=0*/
+  }
+
+  if(x < -0.5) {
+    return 2 * __gen_ocl_asin_util(native_sqrt((1+x) / 2)) - M_PI_2_F;
+  } else if(x > 0.5) {
+    return M_PI_2_F - 2 * __gen_ocl_asin_util(native_sqrt((1-x) / 2));
+  } else {
+    return __gen_ocl_asin_util(x);
+  }
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
+  return __gen_ocl_internal_asin(x) / M_PI_F;
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_acos(float x) {
+  if(x > 0.5)
+    return 2 * __gen_ocl_asin_util(native_sqrt((1-x)/2));
+  else
+    return M_PI_2_F - __gen_ocl_internal_asin(x);
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
+  return __gen_ocl_internal_acos(x) / M_PI_F;
+}
+__constant float atanhi[4] = {
+  4.6364760399e-01, /* atan(0.5)hi 0x3eed6338 */
+  7.8539812565e-01, /* atan(1.0)hi 0x3f490fda */
+  9.8279368877e-01, /* atan(1.5)hi 0x3f7b985e */
+  1.5707962513e+00, /* atan(inf)hi 0x3fc90fda */
+};
+__constant float atanlo[4] = {
+  5.0121582440e-09, /* atan(0.5)lo 0x31ac3769 */
+  3.7748947079e-08, /* atan(1.0)lo 0x33222168 */
+  3.4473217170e-08, /* atan(1.5)lo 0x33140fb4 */
+  7.5497894159e-08, /* atan(inf)lo 0x33a22168 */
+};
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_atan(float x) {
+  /* copied from fdlibm */
+  float aT[11];
+  aT[0] = 3.3333334327e-01; /* 0x3eaaaaaa */
+  aT[1] =  -2.0000000298e-01; /* 0xbe4ccccd */
+  aT[2] =   1.4285714924e-01; /* 0x3e124925 */
+  aT[3] =  -1.1111110449e-01; /* 0xbde38e38 */
+  aT[4] =   9.0908870101e-02; /* 0x3dba2e6e */
+  aT[5] =  -7.6918758452e-02; /* 0xbd9d8795 */
+  aT[6] =   6.6610731184e-02; /* 0x3d886b35 */
+  aT[7] =  -5.8335702866e-02; /* 0xbd6ef16b */
+  aT[8] =   4.9768779427e-02; /* 0x3d4bda59 */
+  aT[9] =  -3.6531571299e-02; /* 0xbd15a221 */
+  aT[10] =   1.6285819933e-02; /* 0x3c8569d7 */
+  const float one = 1.0, huge = 1.0e30;
+
+  float w,s1,s2,z;
+  int ix,hx,id;
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff;
+  if(ix>=0x50800000) {  /* if |x| >= 2^34 */
+      if(ix>0x7f800000)
+    return x+x;   /* NaN */
+      if(hx>0) return  atanhi[3]+atanlo[3];
+      else     return -atanhi[3]-atanlo[3];
+  } if (ix < 0x3ee00000) {  /* |x| < 0.4375 */
+      if (ix < 0x31000000) {  /* |x| < 2^-29 */
+    if(huge+x>one) return x;  /* raise inexact */
+      }
+      id = -1;
+  } else {
+  x = __gen_ocl_fabs(x);
+  if (ix < 0x3f980000) {    /* |x| < 1.1875 */
+      if (ix < 0x3f300000) {  /* 7/16 <=|x|<11/16 */
+    id = 0; x = ((float)2.0*x-one)/((float)2.0+x);
+      } else {      /* 11/16<=|x|< 19/16 */
+    id = 1; x  = (x-one)/(x+one);
+      }
+  } else {
+      if (ix < 0x401c0000) {  /* |x| < 2.4375 */
+    id = 2; x  = (x-(float)1.5)/(one+(float)1.5*x);
+      } else {      /* 2.4375 <= |x| < 2^34 */
+    id = 3; x  = -(float)1.0/x;
+      }
+  }}
+    /* end of argument reduction */
+  z = x*x;
+  w = z*z;
+    /* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
+  s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*(aT[6]+w*(aT[8]+w*aT[10])))));
+  s2 = w*(aT[1]+w*(aT[3]+w*(aT[5]+w*(aT[7]+w*aT[9]))));
+  if (id<0) return x - x*(s1+s2);
+  else {
+      z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
+      return (hx<0)? -z:z;
+  }
+
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
+  return __gen_ocl_internal_atan(x) / M_PI_F;
+}
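+/* erf via a truncated Maclaurin series:
+ * erf(x) = 2/sqrt(pi) * (x - x^3/3 + x^5/10 - x^7/42 + x^9/216 - ...),
+ * keeping only the terms written out below. */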
+INLINE_OVERLOADABLE float __gen_ocl_internal_erf(float x) {
+  return M_2_SQRTPI_F * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 10 - __gen_ocl_pow(x, 7) / 42 + __gen_ocl_pow(x, 9) / 216);
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
+  return 1 - __gen_ocl_internal_erf(x);
+}
+
+// XXX work-around PTX profile
+#define sqrt native_sqrt
+INLINE_OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_atan2(float y, float x) {
+  /* copied from fdlibm */
+  float z;
+  int k,m,hx,hy,ix,iy;
+  const float
+  tiny  = 1.0e-30,
+  zero  = 0.0,
+  pi_o_4  = 7.8539818525e-01, /* 0x3f490fdb */
+  pi_o_2  = 1.5707963705e+00, /* 0x3fc90fdb */
+  pi      = 3.1415927410e+00, /* 0x40490fdb */
+  pi_lo   = -8.7422776573e-08; /* 0xb3bbbd2e */
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff;
+  GEN_OCL_GET_FLOAT_WORD(hy,y);
+  iy = hy&0x7fffffff;
+
+  if((ix>0x7f800000)||
+     (iy>0x7f800000)) /* x or y is NaN */
+     return x+y;
+  if(hx==0x3f800000) return z=__gen_ocl_internal_atan(y);   /* x=1.0 */
+  m = ((hy>>31)&1)|((hx>>30)&2);  /* 2*sign(x)+sign(y) */
+
+    /* when y = 0 */
+  if(iy==0) {
+      switch(m) {
+    case 0:
+    case 1: return y;   /* atan(+-0,+anything)=+-0 */
+    case 2: return  pi+tiny;/* atan(+0,-anything) = pi */
+    case 3: return -pi-tiny;/* atan(-0,-anything) =-pi */
+      }
+  }
+    /* when x = 0 */
+  if(ix==0) return (hy<0)?  -pi_o_2-tiny: pi_o_2+tiny;
+
+  /* both are denormals; Gen does not support denormals, so convert them to normal floats */
+  if(ix <= 0x7fffff && iy <= 0x7fffff) {
+    x = (float)(ix) * (1.0f - ((hx>>30) & 0x2));
+    y = (float)(iy) * (1.0f - ((hy>>30) & 0x2));
+  }
+
+    /* when x is INF */
+  if(ix==0x7f800000) {
+      if(iy==0x7f800000) {
+    switch(m) {
+        case 0: return  pi_o_4+tiny;/* atan(+INF,+INF) */
+        case 1: return -pi_o_4-tiny;/* atan(-INF,+INF) */
+        case 2: return  (float)3.0*pi_o_4+tiny;/*atan(+INF,-INF)*/
+        case 3: return (float)-3.0*pi_o_4-tiny;/*atan(-INF,-INF)*/
+    }
+      } else {
+    switch(m) {
+        case 0: return  zero  ; /* atan(+...,+INF) */
+        case 1: return -zero  ; /* atan(-...,+INF) */
+        case 2: return  pi+tiny  ;  /* atan(+...,-INF) */
+        case 3: return -pi-tiny  ;  /* atan(-...,-INF) */
+    }
+      }
+  }
+    /* when y is INF */
+  if(iy==0x7f800000) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
+
+    /* compute y/x */
+  k = (iy-ix)>>23;
+  if(k > 60) z=pi_o_2+(float)0.5*pi_lo;   /* |y/x| >  2**60 */
+  else if(hx<0&&k<-60) z=0.0;   /* |y|/x < -2**60 */
+  else z=__gen_ocl_internal_atan(__gen_ocl_fabs(y/x)); /* safe to do y/x */
+  switch (m) {
+      case 0: return       z  ; /* atan(+,+) */
+      case 1: {
+              uint zh;
+          GEN_OCL_GET_FLOAT_WORD(zh,z);
+          GEN_OCL_SET_FLOAT_WORD(z,zh ^ 0x80000000);
+        }
+        return       z  ; /* atan(-,+) */
+      case 2: return  pi-(z-pi_lo);/* atan(+,-) */
+      default: /* case 3 */
+            return  (z-pi_lo)-pi;/* atan(-,-) */
+  }
+}
+
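+/* atan2pi(y, x) = atan2(y, x) / pi; the early returns below spell out the
+ * signed-zero and infinity special cases before deferring to atan2. */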
+INLINE_OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) {
+  uint ix = as_uint(x), iy = as_uint(y),
+       pos_zero = 0, neg_zero = 0x80000000u,
+       pos_inf = 0x7f800000, neg_inf = 0xff800000u;
+  if(iy == pos_zero) {
+    if(ix == pos_zero)
+      return 0;
+    if(ix == neg_zero)
+      return 1;
+    if(x < 0)
+      return 1;
+    if(x > 0)
+      return 0;
+  }
+  if(iy == neg_zero) {
+    if(ix == pos_zero)
+      return -0.f;
+    if(ix == neg_zero)
+      return -1;
+    if(x < 0)
+      return -1;
+    if(x > 0)
+      return -0.f;
+  }
+  if((ix & 0x7fffffff) == 0) {
+    if(y < 0)
+      return -.5f;
+    if(y > 0)
+      return .5f;
+  }
+  if(ix == pos_inf) {
+    if(y > 0 && iy != pos_inf)
+      return 0;
+    if(y < 0 && iy != neg_inf)
+      return -0.f;
+  }
+  if(ix == neg_inf) {
+    if(y > 0 && iy != pos_inf)
+      return 1;
+    if(y < 0 && iy != neg_inf)
+      return -1;
+  }
+  if(iy == pos_inf) {
+    if(ix == pos_inf)
+      return 0.25f;
+    if(ix == neg_inf)
+      return 0.75f;
+    if(x >= 0 || x <= 0)
+      return 0.5f;
+  }
+  if(iy == neg_inf) {
+    if(ix == pos_inf)
+      return -0.25f;
+    if(ix == neg_inf)
+      return -0.75f;
+    if(x >= 0 || x <= 0)
+      return -0.5f;
+  }
+  return __gen_ocl_internal_atan2(y, x) / M_PI_F;
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_round(float x) {
+  float y = __gen_ocl_rndz(x);
+  if (__gen_ocl_fabs(x - y) >= 0.5f)
+    y += __gen_ocl_internal_copysign(1.f, x);
+  return y;
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }
+INLINE_OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_rint(float x) {
+  return __gen_ocl_rnde(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x) {
+  //use native instruction when it has enough precision
+  if (x > -0x1.6p1 && x < 0x1.6p1)
+  {
+    return native_exp(x);
+  }
+
+  float o_threshold = 8.8721679688e+01,  /* 0x42b17180 */
+  u_threshold = -1.0397208405e+02,  /* 0xc2cff1b5 */
+  twom100 = 7.8886090522e-31, 	 /* 2**-100=0x0d800000 */
+  ivln2	 =	1.4426950216e+00; /* 0x3fb8aa3b =1/ln2 */
+  float y,hi=0.0,lo=0.0,t;
+  int k=0,xsb;
+  unsigned hx;
+  float ln2HI_0 = 6.9313812256e-01;	/* 0x3f317180 */
+  float ln2HI_1 = -6.9313812256e-01;	/* 0xbf317180 */
+  float ln2LO_0 = 9.0580006145e-06;  	/* 0x3717f7d1 */
+  float ln2LO_1 = -9.0580006145e-06; /* 0xb717f7d1 */
+  float half_0 = 0.5;
+  float half_1 =	-0.5;
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  xsb = (hx>>31)&1;		/* sign bit of x */
+  hx &= 0x7fffffff;		/* high word of |x| */
+
+  /* filter out non-finite argument */
+  if(hx >= 0x42b17218) {			/* if |x|>=88.721... */
+    // native_exp already handled this
+    return native_exp(x);
+  }
+
+  /* argument reduction */
+  if(hx > 0x3eb17218) {		/* if  |x| > 0.5 ln2 */
+    if(hx < 0x3F851592) {	/* and |x| < 1.5 ln2 */
+      hi = x-(xsb ==1 ? ln2HI_1 : ln2HI_0);
+      lo= xsb == 1? ln2LO_1 : ln2LO_0;
+      k = 1-xsb-xsb;
+    } else {
+      float tmp = xsb == 1 ? half_1 : half_0;
+      k  = ivln2*x+tmp;
+      t  = k;
+      hi = x - t*ln2HI_0;	/* t*ln2HI is exact here */
+      lo = t*ln2LO_0;
+    }
+    x  = hi - lo;
+  }
+
+  y = native_exp(x);
+  if(k >= -125) {
+    unsigned hy;
+    GEN_OCL_GET_FLOAT_WORD(hy,y);
+    GEN_OCL_SET_FLOAT_WORD(y,hy+(k<<23));	/* add k to y's exponent */
+    return y;
+  } else {
+    unsigned hy;
+    GEN_OCL_GET_FLOAT_WORD(hy,y);
+    GEN_OCL_SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k to y's exponent */
+    return y*twom100;
+  }
+}
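+/* fdlibm-style fmod: align the mantissas of |x| and |y|, perform a fixed-point
+ * shift-and-subtract division, then renormalize the remainder. */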
+INLINE_OVERLOADABLE float __gen_ocl_internal_fmod (float x, float y) {
+  //return x-y*__gen_ocl_rndz(x/y);
+  float one = 1.0;
+  float Zero[2];
+  int n,hx,hy,hz,ix,iy,sx,i;
+  Zero[0] = 0.0;
+  Zero[1] = -0.0;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_GET_FLOAT_WORD(hy,y);
+  sx = hx&0x80000000;		/* sign of x */
+  hx ^=sx;		/* |x| */
+  hy &= 0x7fffffff;	/* |y| */
+  /* purge off exception values */
+  if(hy==0||(hx>=0x7f800000)||		/* y=0,or x not finite */
+  (hy>0x7f800000))			/* or y is NaN */
+    return (x*y)/(x*y);
+  if(hx<hy) return x;			/* |x|<|y| return x */
+  if(hx==hy)
+    return Zero[(unsigned)sx>>31];	/* |x|=|y| return x*0*/
+
+  /* determine ix = ilogb(x) */
+  if(hx<0x00800000) {	/* subnormal x */
+    for (ix = -126,i=(hx<<8); i>0; i<<=1) ix -=1;
+  } else ix = (hx>>23)-127;
+
+  /* determine iy = ilogb(y) */
+  if(hy<0x00800000) {	/* subnormal y */
+    for (iy = -126,i=(hy<<8); i>=0; i<<=1) iy -=1;
+  } else iy = (hy>>23)-127;
+
+  /* set up {hx,lx}, {hy,ly} and align y to x */
+  if(ix >= -126)
+    hx = 0x00800000|(0x007fffff&hx);
+  else {		/* subnormal x, shift x to normal */
+    n = -126-ix;
+    hx = hx<<n;
+  }
+  if(iy >= -126)
+    hy = 0x00800000|(0x007fffff&hy);
+  else {		/* subnormal y, shift y to normal */
+    n = -126-iy;
+    hy = hy<<n;
+  }
+  /* fix point fmod */
+  n = ix - iy;
+  while(n--) {
+    hz=hx-hy;
+    if(hz<0){hx = hx+hx;}
+    else {
+      if(hz==0)		/* return sign(x)*0 */
+        return Zero[(unsigned)sx>>31];
+      hx = hz+hz;
+    }
+  }
+  hz=hx-hy;
+  if(hz>=0) {hx=hz;}
+
+    /* convert back to floating value and restore the sign */
+  if(hx==0)			/* return sign(x)*0 */
+    return Zero[(unsigned)sx>>31];
+  while(hx<0x00800000) {		/* normalize x */
+    hx = hx+hx;
+    iy -= 1;
+  }
+  if(iy >= -126) {		/* normalize output */
+    hx = ((hx-0x00800000)|((iy+127)<<23));
+    GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
+  } else {		/* subnormal output */
+    n = -126 - iy;
+    hx >>= n;
+    GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
+    x *= one;		/* create necessary signal */
+  }
+  return x;		/* exact output */
+}
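+/* Illustrative behaviour of the function above: the result keeps the sign of x,
+ * e.g. fmod( 5.5f, 2.0f) == 1.5f and fmod(-5.5f, 2.0f) == -1.5f. */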
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
+  //return __gen_ocl_pow(M_E_F, x) - 1;
+  float	Q1 = -3.3333335072e-02, /* 0xbd088889 */
+  ln2_hi = 6.9313812256e-01,	/* 0x3f317180 */
+  ln2_lo = 9.0580006145e-06,	/* 0x3717f7d1 */
+  Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
+  Q3 = -7.9365076090e-05, /* 0xb8a670cd */
+  Q4 = 4.0082177293e-06, /* 0x36867e54 */
+  Q5 = -2.0109921195e-07, /* 0xb457edbb */
+  huge = 1.0e30,
+  tiny = 1.0e-30,
+  ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  one	=  1.0,
+  o_threshold=  8.8721679688e+01;  /* 0x42b17180 */
+  float y,hi,lo,c,t,e,hxs,hfx,r1;
+  int k,xsb;
+  int hx;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  xsb = hx&0x80000000;
+  /* sign bit of x */
+  //if(xsb==0)
+  //y=x;
+  //else
+  //y= -x; /* y = |x| */
+  y = __gen_ocl_internal_fabs(x);
+  hx &= 0x7fffffff;		/* high word of |x| */
+  /* filter out huge and non-finite argument */
+  if(hx >= 0x4195b844) {			/* if |x|>=27*ln2 */
+    if(hx >= 0x42b17218) {		/* if |x|>=88.721... */
+      if(hx>0x7f800000)
+        return x+x; 	 /* NaN */
+      if(hx==0x7f800000)
+        return (xsb==0)? x:-1.0;/* exp(+-inf)={inf,-1} */
+      if(x > o_threshold)
+        return huge*huge; /* overflow */
+    }
+    if(xsb!=0) { /* x < -27*ln2, return -1.0 with inexact */
+      if(x+tiny<(float)0.0)	/* raise inexact */
+        return tiny-one;	/* return -1 */
+    }
+  }
+  /* argument reduction */
+  if(hx > 0x3eb17218) {/* if  |x| > 0.5 ln2 */
+    if(hx < 0x3F851592) {/* and |x| < 1.5 ln2 */
+      if(xsb==0){
+        hi = x - ln2_hi; lo = ln2_lo;  k =  1;
+      }	else {
+        hi = x + ln2_hi; lo = -ln2_lo;  k = -1;
+      }
+    } else {
+      k  = ivln2*x+((xsb==0)?(float)0.5:(float)-0.5);
+      t  = k;
+      hi = x - t*ln2_hi;/* t*ln2_hi is exact here */
+      lo = t*ln2_lo;
+    }
+    x  = hi - lo;
+    c  = (hi-x)-lo;
+  } else if(hx < 0x33000000) {	/* when |x|<2**-25, return x */
+    //t = huge+x; /* return x with inexact flags when x!=0 */
+    //return x - (t-(huge+x));
+    return x;
+  } else k = 0;
+  /* x is now in primary range */
+  hfx = (float)0.5*x;
+  hxs = x*hfx;
+  r1 = one+hxs*(Q1+hxs*(Q2+hxs*(Q3+hxs*(Q4+hxs*Q5))));
+  t = (float)3.0-r1*hfx;
+  e = hxs*((r1-t)/((float)6.0 - x*t));
+  if(k==0)
+    return x - (x*e-hxs);		/* c is 0 */
+  else{
+    e = (x*(e-c)-c);
+    e -= hxs;
+    if(k== -1)return (float)0.5*(x-e)-(float)0.5;
+    if(k==1){
+      if(x < (float)-0.25)
+        return -(float)2.0*(e-(x+(float)0.5));
+      else
+        return  (one+(float)2.0*(x-e));
+    }
+    if (k <= -2 || k>56) {	 /* it suffices to return exp(x)-1 */
+      int i;
+      y = one-(e-x);
+      GEN_OCL_GET_FLOAT_WORD(i,y);
+      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
+      return y-one;
+    }
+    t = one;
+    if(k<23) {
+      int i;
+      GEN_OCL_SET_FLOAT_WORD(t,0x3f800000 - (0x1000000>>k)); /* t=1-2^-k */
+      y = t-(e-x);
+      GEN_OCL_GET_FLOAT_WORD(i,y);
+      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
+    } else {
+      int i;
+      GEN_OCL_SET_FLOAT_WORD(t,((0x7f-k)<<23));	/* 2^-k */
+      y = x-(e+t);
+      y += one;
+      GEN_OCL_GET_FLOAT_WORD(i,y);
+      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
+    }
+  }
+  return y;
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
+  //return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
+  float one	= 1.0,
+  ln2	= 6.9314718246e-01;/* 0x3f317218 */
+  float t;
+  int hx;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  if(hx<0x3f800000) {	/* x < 1 */
+    return (x-x)/(x-x);
+  } else if(hx >=0x4d800000) {	/* x > 2**28 */
+    if(hx >=0x7f800000) {/* x is inf or NaN */
+      return x+x;
+    } else
+      return __gen_ocl_internal_log(x)+ln2;/* acosh(huge)=log(2x) */
+  } else if (hx==0x3f800000) {
+    return 0.0;			/* acosh(1) = 0 */
+  } else if (hx > 0x40000000) {	/* 2**28 > x > 2 */
+    t=x*x;
+    return __gen_ocl_internal_log((float)2.0*x-one/(x+__gen_ocl_sqrt(t-one)));			
+  } else {			/* 1<x<2 */
+    t = x-one;
+    return log1p(t+__gen_ocl_sqrt((float)2.0*t+t*t));
+  }
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_asinh(float x){
+  //return native_log(x + native_sqrt(x * x + 1));
+  float one =  1.0000000000e+00, /* 0x3F800000 */
+  ln2 =  6.9314718246e-01, /* 0x3f317218 */
+  huge=  1.0000000000e+30;
+  float w;
+  int hx,ix;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff;
+  if(ix< 0x38000000) {	/* |x|<2**-15 */
+    if(huge+x>one) return x;	/* return x inexact except 0 */
+  }
+  if(ix>0x47000000) {/* |x| > 2**15 */
+    if(ix>=0x7f800000) return x+x;/* x is inf or NaN */
+    w = __gen_ocl_internal_log(__gen_ocl_internal_fabs(x))+ln2;
+  } else {
+    float xa = __gen_ocl_internal_fabs(x);
+    if (ix>0x40000000) {/* 2**15 > |x| > 2.0 */
+      w = __gen_ocl_internal_log(2.0f*xa+one/(__gen_ocl_sqrt(xa*xa+one)+xa));
+    } else {		/* 2.0 > |x| > 2**-15 */
+      float t = xa*xa;
+      w = log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
+    }
+  }
+  return __gen_ocl_internal_copysign(w, x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_sinh(float x){
+  //return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
+  float one = 1.0,
+  shuge = 1.0e37;
+  float t,w,h;
+  int ix,jx;
+  GEN_OCL_GET_FLOAT_WORD(jx,x);
+  ix = jx&0x7fffffff;
+  /* x is INF or NaN */
+  if(ix>=0x7f800000) return x+x;
+  h = 0.5;
+  if (jx<0) h = -h;
+  /* |x| in [0,22], return sign(x)*0.5*(E+E/(E+1)) */
+  if (ix < 0x41b00000) {		/* |x|<22 */
+    if (ix<0x31800000)	/* |x|<2**-28 */
+      if(shuge+x>one) return x;/* sinh(tiny) = tiny with inexact */
+    t = __gen_ocl_internal_expm1(__gen_ocl_internal_fabs(x));
+    if(ix<0x3f800000) return h*((float)2.0*t-t*t/(t+one));
+    return h*(t+t/(t+one));
+  }
+  /* |x| in [22, log(maxdouble)] return 0.5*exp(|x|) */
+  if (ix < 0x42b17180)  return h*__gen_ocl_internal_exp(__gen_ocl_internal_fabs(x));
+  /* |x| in [log(maxdouble), overflow threshold] */
+  if (ix<=0x42b2d4fc) {
+    w = __gen_ocl_internal_exp((float)0.5*__gen_ocl_internal_fabs(x));
+    t = h*w;
+    return t*w;
+  }
+  /* |x| > overflow threshold, sinh(x) overflow */
+  return x*shuge;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
+  //float y = native_exp(-2 * x);
+  //return (1 - y) / (1 + y);
+  float one=1.0, two=2.0, tiny = 1.0e-30;
+  float t,z;
+  int jx,ix;
+  GEN_OCL_GET_FLOAT_WORD(jx,x);
+  ix = jx&0x7fffffff;
+  /* x is INF or NaN */
+  if(ix>=0x7f800000) {
+    if (jx>=0)
+      return one/x+one; /* tanh(+-inf)=+-1 */
+    else
+      return one/x-one; /* tanh(NaN) = NaN */
+  }
+
+  if (ix < 0x41b00000) { /* |x|<22 */
+    if (ix == 0)
+      return x;		/* x == +-0 */
+    if (ix<0x24000000) 	/* |x|<2**-55 */
+      return x*(one+x);    	/* tanh(small) = small */
+    if (ix>=0x3f800000) {	/* |x|>=1  */
+      t = __gen_ocl_internal_expm1(two*__gen_ocl_internal_fabs(x));
+      z = one - two/(t+two);
+    } else {
+      t = __gen_ocl_internal_expm1(-two*__gen_ocl_internal_fabs(x));
+      z= -t/(t+two);
+    }
+  } else { /* |x| > 22, return +-1 */
+    z = one - tiny;		/* raised inexact flag */
+  }
+  return (jx>=0)? z: -z;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
+  //return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
+  float halF = 0.5,
+  huge = 1.0e+30,
+  tiny = 1.0e-30,
+  one = 1.0;
+  float t,w;
+  int ix;
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  ix &= 0x7fffffff;
+  /* |x| in [0,22] */
+  if (ix < 0x41b00000) {
+    /* |x| in [0,0.5*ln2], return 1+expm1(|x|)^2/(2*exp(|x|)) */
+    if(ix<0x3eb17218) {
+      t = __gen_ocl_internal_expm1(__gen_ocl_fabs(x));
+      w = one+t;
+      if (ix<0x24000000) return w;	/* cosh(tiny) = 1 */
+      return one+(t*t)/(w+w);
+    }
+    /* |x| in [0.5*ln2,22], return (exp(|x|)+1/exp(|x|))/2 */
+    t = __gen_ocl_internal_exp(__gen_ocl_fabs(x));
+    return halF*t+halF/t;
+  }
+  /* |x| in [22, log(maxdouble)] return half*exp(|x|) */
+  if (ix < 0x42b17180)  return halF*__gen_ocl_internal_exp(__gen_ocl_fabs(x));
+  /* |x| in [log(maxdouble), overflow threshold] */
+  if (ix<=0x42b2d4fc) {
+    w = __gen_ocl_internal_exp(halF*__gen_ocl_fabs(x));
+    t = halF*w;
+    return t*w;
+  }
+  /* x is INF or NaN */
+  if(ix>=0x7f800000) return x*x;
+  /* |x| > overflow threshold, cosh(x) overflow */
+  return huge*huge;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_remainder(float x, float p){
+  //return x-y*__gen_ocl_rnde(x/y);
+  float zero = 0.0;
+  int hx,hp;
+  unsigned sx;
+  float p_half;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_GET_FLOAT_WORD(hp,p);
+  sx = hx&0x80000000;
+  hp &= 0x7fffffff;
+  hx &= 0x7fffffff;
+  /* purge off exception values */
+  if(hp==0) return (x*p)/(x*p);	        /* p = 0 */
+  if((hx>=0x7f800000)||               /* x not finite */
+    ((hp>0x7f800000)))	               /* p is NaN */
+    return (x*p)/(x*p);
+  if (hp<=0x7effffff) x = __gen_ocl_internal_fmod(x,p+p); /* now x < 2p */
+  if ((hx-hp)==0) return zero*x;
+  x = __gen_ocl_fabs(x);
+  p = __gen_ocl_fabs(p);
+  if (hp<0x01000000) {
+    if(x+x>p) {
+      x-=p;
+      if(x+x>=p) x -= p;
+    }
+  } else {
+    p_half = (float)0.5*p;
+    if(x>p_half) {
+      x-=p;
+      if(x>=p_half) x -= p;
+    }
+  }
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_SET_FLOAT_WORD(x,hx^sx);
+  return x;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_ldexp(float x, int n) {
+  if(!__ocl_finitef(x)||x==(float)0.0) return x;
+  x = __gen_ocl_scalbnf(x,n);
+  return x;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
+  //return 0.5f * native_log((1 + x) / (1 - x));
+  float xa = __gen_ocl_fabs (x);
+  float t;
+  if (isless (xa, 0.5f)){
+    if (xa < 0x1.0p-28f) return x;
+    t = xa + xa;
+    t = 0.5f * log1p (t + t * xa / (1.0f - xa));
+  } else if (isless (xa, 1.0f)){
+    t = 0.5f * log1p ((xa + xa) / (1.0f - xa));
+  } else{
+    if (isgreater (xa, 1.0f)) return (x - x) / (x - x);
+    return x / 0.0f;
+  }
+  return __gen_ocl_internal_copysign(t, x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_exp10(float x){
+  float px, qx,ans;
+  short n;
+  int i;
+  float*p;
+  float MAXL10 = 38.230809449325611792;
+  float LOG210 = 3.32192809488736234787e0;
+  float LG102A = 3.00781250000000000000E-1;
+  float LG102B = 2.48745663981195213739E-4;
+  float P[6];
+  P[0] = 2.063216740311022E-001;
+  P[1] = 5.420251702225484E-001;
+  P[2] = 1.171292686296281E+000;
+  P[3] = 2.034649854009453E+000;
+  P[4] = 2.650948748208892E+000;
+  P[5] = 2.302585167056758E+000;
+  if( isinf(x))
+    return INFINITY;
+
+  if( x < -MAXL10 )return 0.0;
+  /* The following is necessary because range reduction blows up: */
+  if( x == 0 )return 1.0;
+
+  /* Express 10**x = 10**g 2**n
+    *	 = 10**g 10**( n log10(2) )
+    *	 = 10**( g + n log10(2) )
+    */
+  px = x * LOG210;
+  qx = __gen_ocl_internal_floor( px + 0.5 );
+  n = qx;
+  x -= qx * LG102A;
+  x -= qx * LG102B;
+
+  /* rational approximation for exponential
+    * of the fractional part:
+    * 10**x - 1  =  2x P(x**2)/( Q(x**2) - P(x**2) )
+    */
+  p = P;
+  ans = *p++;
+  i = 5;
+  do{
+    ans = ans * x  +  *p++;
+  }
+  while( --i );
+  px = 1.0 + x * ans;
+
+  /* multiply by power of 2 */
+  x = __gen_ocl_internal_ldexp( px, n );
+  return x;
+}
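+/* Sketch of the decomposition used above: 10^x = 2^n * 10^g with
+ * n = round(x*log2(10)) and g = x - n*log10(2). For x = 2: n = 7,
+ * g ~= -0.1072, and 2^7 * 10^-0.1072 ~= 128 * 0.78125 = 100. */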
+
+// TODO use llvm intrinsics definitions
+#define cospi __gen_ocl_internal_cospi
+#define cosh __gen_ocl_internal_cosh
+#define acos __gen_ocl_internal_acos
+#define acospi __gen_ocl_internal_acospi
+#define acosh __gen_ocl_internal_acosh
+#define sinpi __gen_ocl_internal_sinpi
+#define sinh __gen_ocl_internal_sinh
+#define asin __gen_ocl_internal_asin
+#define asinpi __gen_ocl_internal_asinpi
+#define asinh __gen_ocl_internal_asinh
+#define tanpi __gen_ocl_internal_tanpi
+#define tanh __gen_ocl_internal_tanh
+#define atan __gen_ocl_internal_atan
+#define atan2 __gen_ocl_internal_atan2
+#define atan2pi __gen_ocl_internal_atan2pi
+#define atanpi __gen_ocl_internal_atanpi
+#define atanh __gen_ocl_internal_atanh
+#define pow powr
+#define cbrt __gen_ocl_internal_cbrt
+#define rint __gen_ocl_internal_rint
+#define copysign __gen_ocl_internal_copysign
+#define erf __gen_ocl_internal_erf
+#define erfc __gen_ocl_internal_erfc
+#define fmod __gen_ocl_internal_fmod
+#define remainder __gen_ocl_internal_remainder
+#define ldexp __gen_ocl_internal_ldexp
+PURE CONST float __gen_ocl_mad(float a, float b, float c);
+PURE CONST float __gen_ocl_fmax(float a, float b);
+PURE CONST float __gen_ocl_fmin(float a, float b);
+INLINE_OVERLOADABLE float mad(float a, float b, float c) {
+  return __gen_ocl_mad(a, b, c);
+}
+
+#define DEF(TYPE1, TYPE2) \
+  INLINE_OVERLOADABLE TYPE1 select(TYPE1 src0, TYPE1 src1, TYPE2 cond) { \
+    return cond ? src1 : src0; \
+  }
+DEF(char, char)
+DEF(char, uchar)
+DEF(uchar, char)
+DEF(uchar, uchar)
+DEF(short, short)
+DEF(short, ushort)
+DEF(ushort, short)
+DEF(ushort, ushort)
+DEF(int, int)
+DEF(int, uint)
+DEF(uint, int)
+DEF(uint, uint)
+DEF(long, long)
+DEF(long, ulong)
+DEF(ulong, long)
+DEF(ulong, ulong)
+DEF(float, int)
+DEF(float, uint)
+#undef DEF
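+/* Illustrative use of the scalar select() generated above: the result is src1
+ * when cond is non-zero, src0 otherwise, e.g.
+ *   int r = select(10, 20, 1);   // r == 20
+ *   int s = select(10, 20, 0);   // s == 10
+ */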
+
+/////////////////////////////////////////////////////////////////////////////
+// Common Functions (see 6.11.4 of OCL 1.1 spec)
+/////////////////////////////////////////////////////////////////////////////
+INLINE_OVERLOADABLE float step(float edge, float x) {
+  return x < edge ? 0.0 : 1.0;
+}
+
+#define DECL_MIN_MAX_CLAMP(TYPE) \
+INLINE_OVERLOADABLE TYPE max(TYPE a, TYPE b) { \
+  return a > b ? a : b; \
+} \
+INLINE_OVERLOADABLE TYPE min(TYPE a, TYPE b) { \
+  return a < b ? a : b; \
+} \
+INLINE_OVERLOADABLE TYPE clamp(TYPE v, TYPE l, TYPE u) { \
+  return max(min(v, u), l); \
+}
+DECL_MIN_MAX_CLAMP(int)
+DECL_MIN_MAX_CLAMP(short)
+DECL_MIN_MAX_CLAMP(char)
+DECL_MIN_MAX_CLAMP(uint)
+DECL_MIN_MAX_CLAMP(unsigned short)
+DECL_MIN_MAX_CLAMP(unsigned char)
+DECL_MIN_MAX_CLAMP(long)
+DECL_MIN_MAX_CLAMP(ulong)
+#undef DECL_MIN_MAX_CLAMP
+INLINE_OVERLOADABLE float max(float a, float b) {
+  return __gen_ocl_fmax(a, b);
+}
+INLINE_OVERLOADABLE float min(float a, float b) {
+  return __gen_ocl_fmin(a, b);
+}
+INLINE_OVERLOADABLE float clamp(float v, float l, float u) {
+  return max(min(v, u), l);
+}
+
+#define BODY \
+  if (isnan(x) || isinf(x)) { \
+    *exp = 0; \
+    return x; \
+  } \
+  uint u = as_uint(x); \
+  uint a = u & 0x7FFFFFFFu; \
+  if (a == 0) { \
+    *exp = 0; \
+    return x; \
+  } \
+  if (a >= 0x800000) { \
+    *exp = (a >> 23) - 126; \
+    return as_float((u & (0x807FFFFFu)) | 0x3F000000); \
+  } \
+  int e = -126; \
+  while (a < 0x400000) { \
+    e --; \
+    a <<= 1; \
+  } \
+  a <<= 1; \
+  *exp = e; \
+  return as_float((a & (0x807FFFFFu)) | (u & 0x80000000u) | 0x3F000000);
+INLINE_OVERLOADABLE float frexp(float x, global int *exp) { BODY; }
+INLINE_OVERLOADABLE float frexp(float x, local int *exp) { BODY; }
+INLINE_OVERLOADABLE float frexp(float x, private int *exp) { BODY; }
+#undef BODY
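+/* Illustrative use of frexp() above: the mantissa is returned in [0.5, 1) and
+ * the exponent through the pointer, e.g. with a private int e,
+ *   frexp(8.0f, &e) returns 0.5f and sets e = 4   (8 = 0.5 * 2^4). */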
+
+INLINE_OVERLOADABLE float nextafter(float x, float y) {
+  int hx, hy, ix, iy;
+  hx = as_int(x);
+  hy = as_int(y);
+  ix = hx & 0x7fffffff;
+  iy = hy & 0x7fffffff;
+  if(ix>0x7f800000 || iy>0x7f800000)
+    return x+y;
+  if(hx == hy)
+    return y;
+  if(ix == 0) {
+    if(iy == 0)
+      return y;
+    else
+      return as_float((hy&0x80000000) | 1);
+  }
+  if(hx >= 0) {
+    if(hx > hy) {
+      hx -= 1;
+    } else {
+      hx += 1;
+    }
+  } else {
+    if(hy >= 0 || hx > hy){
+      hx -= 1;
+    } else {
+      hx += 1;
+    }
+  }
+  return as_float(hx);
+}
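+/* Illustrative: nextafter(1.0f, 2.0f) returns the next representable float
+ * above 1.0f, i.e. 0x1.000002p+0f (1 + 2^-23). */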
+
+#define BODY \
+  uint hx = as_uint(x), ix = hx & 0x7FFFFFFF; \
+  if (ix > 0x7F800000) { \
+    *i = nan(0u); \
+    return nan(0u); \
+  } \
+  if (ix == 0x7F800000) { \
+    *i = x; \
+    return as_float(hx & 0x80000000u); \
+  } \
+  *i = __gen_ocl_rndz(x); \
+  return x - *i;
+INLINE_OVERLOADABLE float modf(float x, global float *i) { BODY; }
+INLINE_OVERLOADABLE float modf(float x, local float *i) { BODY; }
+INLINE_OVERLOADABLE float modf(float x, private float *i) { BODY; }
+#undef BODY
+INLINE_OVERLOADABLE float degrees(float radians) { return (180 / M_PI_F) * radians; }
+INLINE_OVERLOADABLE float radians(float degrees) { return (M_PI_F / 180) * degrees; }
+
+INLINE_OVERLOADABLE float smoothstep(float e0, float e1, float x) {
+  x = clamp((x - e0) / (e1 - e0), 0.f, 1.f);
+  return x * x * (3 - 2 * x);
+}
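+/* Illustrative: smoothstep clamps t = (x-e0)/(e1-e0) to [0,1] and returns
+ * t*t*(3-2*t), e.g. smoothstep(0.f, 1.f, 0.25f) == 0.15625f. */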
+
+INLINE_OVERLOADABLE float sign(float x) {
+  if(x > 0)
+    return 1;
+  if(x < 0)
+    return -1;
+  if(x == -0.f)
+    return -0.f;
+  return 0.f;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); }
+INLINE_OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
+  float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
+  return a > b ? x : b > a ? y : max(x, y);
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
+  float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
+  return a < b ? x : b < a ? y : min(x, y);
+}
+INLINE_OVERLOADABLE float mix(float x, float y, float a) { return x + (y-x)*a;}
+INLINE_OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
+  if(isnan(x))
+    return x;
+  if(isnan(y))
+    return y;
+  return x > y ? (x - y) : +0.f;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
+  float z,ax,z_h,z_l,p_h,p_l;
+  float y1,t1,t2,r,s,sn,t,u,v,w;
+  int i,j,k,yisint,n;
+  int hx,hy,ix,iy,is;
+  float bp[2],dp_h[2],dp_l[2],
+  zero    =  0.0,
+  one	=  1.0,
+  two	=  2.0,
+  two24	=  16777216.0,	/* 0x4b800000 */
+  huge	=  1.0e30,
+  tiny    =  1.0e-30,
+  /* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
+  L1  =  6.0000002384e-01, /* 0x3f19999a */
+  L2  =  4.2857143283e-01, /* 0x3edb6db7 */
+  L3  =  3.3333334327e-01, /* 0x3eaaaaab */
+  L4  =  2.7272811532e-01, /* 0x3e8ba305 */
+  L5  =  2.3066075146e-01, /* 0x3e6c3255 */
+  L6  =  2.0697501302e-01, /* 0x3e53f142 */
+  P1   =  1.6666667163e-01, /* 0x3e2aaaab */
+  P2   = -2.7777778450e-03, /* 0xbb360b61 */
+  P3   =  6.6137559770e-05, /* 0x388ab355 */
+  P4   = -1.6533901999e-06, /* 0xb5ddea0e */
+  P5   =  4.1381369442e-08, /* 0x3331bb4c */
+  lg2  =  6.9314718246e-01, /* 0x3f317218 */
+  lg2_h  =  6.93145752e-01, /* 0x3f317200 */
+  lg2_l  =  1.42860654e-06, /* 0x35bfbe8c */
+  ovt =  4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
+  cp    =  9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
+  cp_h  =  9.6179199219e-01, /* 0x3f763800 =head of cp */
+  cp_l  =  4.7017383622e-06, /* 0x369dc3a0 =tail of cp_h */
+  ivln2    =  1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  ivln2_h  =  1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
+  ivln2_l  =  7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
+  bp[0] = 1.0,bp[1] = 1.5,
+  dp_h[0] = 0.0,dp_h[1] = 5.84960938e-01,
+  dp_l[0] = 0.0,dp_l[1] = 1.56322085e-06;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_GET_FLOAT_WORD(hy,y);
+  ix = hx&0x7fffffff;  iy = hy&0x7fffffff;
+  if (ix < 0x00800000) {	   /* x < 2**-126  */
+    ix = 0;/* Gen does not support subnormal number now */
+  }
+  if (iy < 0x00800000) {	  /* y < 2**-126  */
+    iy = 0;/* Gen does not support subnormal number now */
+  }
+   /* y==zero: x**0 = 1 */
+  if(iy==0) return one;
+  if(hx==0x3f800000) return one;
+  /* +-NaN return x+y */
+  if(ix > 0x7f800000 || iy > 0x7f800000)
+    return (x+0.0f)+y+(0.0f);
+  /* determine if y is an odd int when x < 0
+     * yisint = 0	... y is not an integer
+     * yisint = 1	... y is an odd int
+     * yisint = 2	... y is an even int
+     */
+  yisint  = 0;
+  if(hx<0) {
+    if(iy>=0x4b800000) yisint = 2; /* even integer y */
+    else if(iy>=0x3f800000) {
+      k = (iy>>23)-0x7f;	   /* exponent */
+      j = iy>>(23-k);
+      if((j<<(23-k))==iy) yisint = 2-(j&1);
+    }
+  }
+  /* special value of y */
+  if (iy==0x7f800000) {	/* y is +-inf */
+    if (ix==0x3f800000)
+      //return  y - y;	/* inf**+-1 is NaN */
+      return one;
+    else if (ix > 0x3f800000)/* (|x|>1)**+-inf = inf,0 */
+      return (hy>=0)? y: zero;
+    else			/* (|x|<1)**-,+inf = inf,0 */
+      return (hy<0)?-y: zero;
+  }
+  if(iy==0x3f800000) {	/* y is  +-1 */
+    if(hy<0) return one/x; else return x;
+  }
+  if(hy==0x40000000) return x*x; /* y is  2 */
+  if(hy==0x3f000000) {	/* y is  0.5 */
+    if(hx>=0)return __gen_ocl_sqrt(x);
+  }
+
+  ax   = __gen_ocl_fabs(x);
+    /* special value of x */
+  if(ix==0x7f800000||ix==0||ix==0x3f800000){
+    z = ax;			/*x is +-0,+-inf,+-1*/
+    if(hy<0) z = one/z;	/* z = (1/|x|) */
+    if(hx<0) {
+      if(((ix-0x3f800000)|yisint)==0) {
+        z = (z-z)/(z-z); /* (-1)**non-int is NaN */
+      } else if(yisint==1)
+        z = -z;		/* (x<0)**odd = -(|x|**odd) */
+    }
+    return z;
+  }
+  n = ((uint)hx>>31)-1;
+
+  /* (x<0)**(non-int) is NaN */
+  if((n|yisint)==0) return (x-x)/(x-x);
+
+  sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
+  if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */
+
+  /* |y| is huge */
+  if(iy>0x4d000000) { /* if |y| > 2**27 */
+    /* over/underflow if x is not close to one */
+    if(ix<0x3f7ffff8) return (hy<0)? sn*huge*huge:sn*tiny*tiny;
+    if(ix>0x3f800007) return (hy>0)? sn*huge*huge:sn*tiny*tiny;
+    /* now |1-x| is tiny <= 2**-20, it suffices to compute
+       log(x) by x-x^2/2+x^3/3-x^4/4 */
+    t = ax-1;		/* t has 20 trailing zeros */
+    w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f));
+    u = ivln2_h*t;	/* ivln2_h has 16 sig. bits */
+    v = t*ivln2_l-w*ivln2;
+    t1 = u+v;
+    GEN_OCL_GET_FLOAT_WORD(is,t1);
+    GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+    t2 = v-(t1-u);
+  } else {
+    float s2,s_h,s_l,t_h,t_l;
+    n = 0;
+    /* take care of subnormal numbers (handling disabled below) */
+    //if(ix<0x00800000)
+      //{ax *= two24; n -= 24; GEN_OCL_GET_FLOAT_WORD(ix,ax); }
+    n  += ((ix)>>23)-0x7f;
+    j  = ix&0x007fffff;
+    /* determine interval */
+    ix = j|0x3f800000;		/* normalize ix */
+    if(j<=0x1cc471) k=0;	/* |x|<sqrt(3/2) */
+    else if(j<0x5db3d7) k=1;	/* |x|<sqrt(3)   */
+    else {k=0;n+=1;ix -= 0x00800000;}
+    GEN_OCL_SET_FLOAT_WORD(ax,ix);
+
+    /* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
+    u = ax-bp[k];		/* bp[0]=1.0, bp[1]=1.5 */
+    v = one/(ax+bp[k]);
+    s = u*v;
+    s_h = s;
+    GEN_OCL_GET_FLOAT_WORD(is,s_h);
+    GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
+    /* t_h=ax+bp[k] High */
+    is = ((ix>>1)&0xfffff000)|0x20000000;
+    GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
+    t_l = ax - (t_h-bp[k]);
+    s_l = v*((u-s_h*t_h)-s_h*t_l);
+    /* compute log(ax) */
+    s2 = s*s;
+    r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
+    r += s_l*(s_h+s);
+    s2  = s_h*s_h;
+    t_h = 3.0f+s2+r;
+    GEN_OCL_GET_FLOAT_WORD(is,t_h);
+    GEN_OCL_SET_FLOAT_WORD(t_h,is&0xfffff000);
+    t_l = r-((t_h-3.0f)-s2);
+    /* u+v = s*(1+...) */
+    u = s_h*t_h;
+    v = s_l*t_h+t_l*s;
+    /* 2/(3log2)*(s+...) */
+    p_h = u+v;
+    GEN_OCL_GET_FLOAT_WORD(is,p_h);
+    GEN_OCL_SET_FLOAT_WORD(p_h,is&0xfffff000);
+    p_l = v-(p_h-u);
+    z_h = cp_h*p_h;		/* cp_h+cp_l = 2/(3*log2) */
+    z_l = cp_l*p_h+p_l*cp+dp_l[k];
+    /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
+    t = (float)n;
+    t1 = (((z_h+z_l)+dp_h[k])+t);
+    GEN_OCL_GET_FLOAT_WORD(is,t1);
+    GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
+  }
+
+  /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
+  GEN_OCL_GET_FLOAT_WORD(is,y);
+  GEN_OCL_SET_FLOAT_WORD(y1,is&0xfffff000);
+  p_l = (y-y1)*t1+y*t2;
+  p_h = y1*t1;
+  z = p_l+p_h;
+  GEN_OCL_GET_FLOAT_WORD(j,z);
+  if (j>0x43000000)				/* if z > 128 */
+    return sn*huge*huge;			/* overflow */
+  else if (j==0x43000000) {			/* if z == 128 */
+    if(p_l+ovt>z-p_h) return sn*huge*huge;	/* overflow */
+  }
+  else if ((j&0x7fffffff)>0x43160000)		/* z <= -150 */
+    return sn*tiny*tiny;			/* underflow */
+  else if (j==0xc3160000){			/* z == -150 */
+    if(p_l<=z-p_h) return sn*tiny*tiny;		/* underflow */
+  }
+
+  /*
+    * compute 2**(p_h+p_l)
+    */
+  i = j&0x7fffffff;
+  k = (i>>23)-0x7f;
+  n = 0;
+  if(i>0x3f000000) {		/* if |z| > 0.5, set n = [z+0.5] */
+    n = j+(0x00800000>>(k+1));
+    k = ((n&0x7fffffff)>>23)-0x7f;	/* new k for n */
+    GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
+    n = ((n&0x007fffff)|0x00800000)>>(23-k);
+    if(j<0) n = -n;
+    p_h -= t;
+  }
+  t = p_l+p_h;
+  GEN_OCL_GET_FLOAT_WORD(is,t);
+  GEN_OCL_SET_FLOAT_WORD(t,is&0xffff8000);
+  u = t*lg2_h;
+  v = (p_l-(t-p_h))*lg2+t*lg2_l;
+  z = u+v;
+  w = v-(z-u);
+  t  = z*z;
+  t1  = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+  r  = (z*t1)/(t1-two)-(w+z*w);
+  z  = one-(r-z);
+  GEN_OCL_GET_FLOAT_WORD(j,z);
+  j += (n<<23);
+  if((j>>23)<=0) z = __gen_ocl_scalbnf(z,n);	/* subnormal output */
+  else GEN_OCL_SET_FLOAT_WORD(z,j);
+  return sn*z;
+}
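+/* Illustrative corner cases handled above via yisint: for x < 0 the result is
+ * defined only when y is an integer, e.g.
+ *   __gen_ocl_internal_pow(-2.0f, 3.0f) == -8.0f   (odd integer exponent)
+ *   __gen_ocl_internal_pow(-2.0f, 0.5f) is NaN     (non-integer exponent) */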
+
+
+INLINE_OVERLOADABLE float hypot(float x, float y) {
+  //return __gen_ocl_sqrt(x*x + y*y);
+  float a,b,an,bn,cn;
+  int e;
+  if (isfinite (x) && isfinite (y)) {      /* Determine absolute values.  */
+    x = __gen_ocl_fabs (x);
+    y = __gen_ocl_fabs (y);
+    /* Find the bigger and the smaller one.  */
+    a = max(x,y);
+    b = min(x,y);
+    /* Now 0 <= b <= a.  */
+    /* Write a = an * 2^e, b = bn * 2^e with 0 <= bn <= an < 1.  */
+    an = frexp (a, &e);
+    bn = ldexp (b, - e);
+    /* Through the normalization, no unneeded overflow or underflow will occur here.  */
+    cn = __gen_ocl_sqrt (an * an + bn * bn);
+    return ldexp (cn, e);
+  } else {
+    if (isinf (x) || isinf (y))  /* x or y is infinite.  Return +Infinity.  */
+      return INFINITY;
+    else        /* x or y is NaN.  Return NaN.  */
+      return x + y;
+  }
+}
+
+#define BODY \
+  if (isnan(x)) { \
+    *p = x; \
+    return x; \
+  } \
+  *p = __gen_ocl_internal_floor(x); \
+  if (isinf(x)) { \
+    return x > 0 ? +0. : -0.; \
+  } \
+  return __gen_ocl_internal_fmin(x - *p, 0x1.FFFFFep-1F);
+INLINE_OVERLOADABLE float fract(float x, global float *p) { BODY; }
+INLINE_OVERLOADABLE float fract(float x, local float *p) { BODY; }
+INLINE_OVERLOADABLE float fract(float x, private float *p) { BODY; }
+#undef BODY
+
+#define BODY \
+  float Zero[2]; \
+  int n,hx,hy,hz,ix,iy,sx,i,sy; \
+  uint q,sxy; \
+  Zero[0] = 0.0;Zero[1] = -0.0; \
+  GEN_OCL_GET_FLOAT_WORD(hx,x);GEN_OCL_GET_FLOAT_WORD(hy,y); \
+  sxy = (hx ^ hy) & 0x80000000;sx = hx&0x80000000;sy = hy&0x80000000; \
+  hx ^=sx; hy &= 0x7fffffff; \
+  if (hx < 0x00800000)hx = 0;if (hy < 0x00800000)hy = 0; \
+  if(hy==0||hx>=0x7f800000||hy>0x7f800000){ \
+    *quo = 0;return NAN; \
+  } \
+  if( hy == 0x7F800000 || hx == 0 ) { \
+    *quo = 0;return x; \
+  } \
+  if( hx == hy ) { \
+    *quo = (x == y) ? 1 : -1; \
+    return sx ? -0.0 : 0.0; \
+  } \
+  if(hx<hy) { \
+    q = 0; \
+    goto fixup; \
+  } else if(hx==hy) { \
+    *quo = (sxy ? -1 : 1); \
+    return Zero[(uint)sx>>31]; \
+  } \
+  ix = (hx>>23)-127; \
+  iy = (hy>>23)-127; \
+  hx = 0x00800000|(0x007fffff&hx); \
+  hy = 0x00800000|(0x007fffff&hy); \
+  n = ix - iy; \
+  q = 0; \
+  while(n--) { \
+    hz=hx-hy; \
+    if(hz<0) hx = hx << 1; \
+    else {hx = hz << 1; q++;} \
+    q <<= 1; \
+  } \
+  hz=hx-hy; \
+  if(hz>=0) {hx=hz;q++;} \
+  if(hx==0) { \
+    q &= 0x0000007f; \
+    *quo = (sxy ? -q : q); \
+    return Zero[(uint)sx>>31]; \
+  } \
+  while(hx<0x00800000) { \
+    hx <<= 1;iy -= 1; \
+  } \
+  if(iy>= -126) { \
+    hx = ((hx-0x00800000)|((iy+127)<<23)); \
+  } else {\
+    n = -126 - iy; \
+    hx >>= n; \
+  } \
+fixup: \
+  GEN_OCL_SET_FLOAT_WORD(x,hx); \
+  if(hx<0x00800000){ \
+    GEN_OCL_GET_FLOAT_WORD(hy,y); \
+    hy &= 0x7fffffff; \
+    if(hx+hx > hy ||(hx+hx==hy && (q & 1)))q++; \
+    x = 0; \
+  }else{ \
+    y = __gen_ocl_fabs(y); \
+    if (y < 0x1p-125f) { \
+      if (x+x>y || (x+x==y && (q & 1))) { \
+        q++;x-=y; \
+      } \
+    }else if (x>0.5f*y || (x==0.5f*y && (q & 1))) { \
+      q++;x-=y; \
+    } \
+    GEN_OCL_GET_FLOAT_WORD(hx,x);GEN_OCL_SET_FLOAT_WORD(x,hx^sx); \
+  } \
+  int sign = sx==sy?0:1; \
+  q &= 0x0000007f; \
+  *quo = (sign ? -q : q); \
+  return x;
+
+INLINE_OVERLOADABLE float remquo(float x, float y, global int *quo) { BODY; }
+INLINE_OVERLOADABLE float remquo(float x, float y, local int *quo) { BODY; }
+INLINE_OVERLOADABLE float remquo(float x, float y, private int *quo) { BODY; }
+#undef BODY
+INLINE_OVERLOADABLE float native_divide(float x, float y) { return x/y; }
+INLINE_OVERLOADABLE float pown(float x, int n) {
+  if (x == 0 && n == 0)
+    return 1;
+  return powr(x, n);
+}
+
+INLINE_OVERLOADABLE float internal_rootn(float x, int n, const bool isFastpath)
+{
+  float ax,re;
+  int sign = 0;
+  if( n == 0 )return NAN;
+  //rootn ( x, n )  returns a NaN for x < 0 when n is even.
+  if( x < 0 && 0 == (n&1) )
+    return NAN;
+  if( x == 0.0 ){
+    switch( n & 0x80000001 ){
+      //rootn ( +-0,  n ) is +0 for even n > 0.
+      case 0:
+        return 0.0f;
+      //rootn ( +-0,  n ) is +-0 for odd n > 0.
+      case 1:
+        return x;
+      //rootn ( +-0,  n ) is +inf for even n < 0.
+      case 0x80000000:
+        return INFINITY;
+
+      //rootn ( +-0,  n ) is +-inf for odd n < 0.
+      case 0x80000001:
+        return __gen_ocl_internal_copysign(INFINITY, x);
+    }
+  }
+  ax = __gen_ocl_fabs(x);
+  if(x <0.0f && (n&1))
+    sign = 1;
+  if (isFastpath)
+    re = __gen_ocl_pow(ax,1.f/n);
+  else
+    re = __gen_ocl_internal_pow(ax,1.f/n);
+  if(sign)
+    re = -re;
+  return re;
+}
+
+INLINE_OVERLOADABLE float rootn(float x, int n) {
+  return internal_rootn(x, n, 0);
+}
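+/* Illustrative: rootn(-8.0f, 3) == -2.0f (odd n keeps the sign of x), while
+ * rootn(-8.0f, 2) is NaN since even roots of negative numbers are undefined. */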
+
+/////////////////////////////////////////////////////////////////////////////
+// Geometric functions (see 6.11.5 of OCL 1.1 spec)
+/////////////////////////////////////////////////////////////////////////////
+INLINE_OVERLOADABLE float dot(float p0, float p1) {
+  return p0 * p1;
+}
+INLINE_OVERLOADABLE float dot(float2 p0, float2 p1) {
+  return p0.x * p1.x + p0.y * p1.y;
+}
+INLINE_OVERLOADABLE float dot(float3 p0, float3 p1) {
+  return p0.x * p1.x + p0.y * p1.y + p0.z * p1.z;
+}
+INLINE_OVERLOADABLE float dot(float4 p0, float4 p1) {
+  return p0.x * p1.x + p0.y * p1.y + p0.z * p1.z + p0.w * p1.w;
+}
+INLINE_OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); }
+#define BODY \
+  if(m == 0) \
+    return 0; \
+  if(isinf(m)) \
+    return INFINITY; \
+  if(m < 1) \
+    m = 1; \
+  x /= m; \
+  return m * sqrt(dot(x,x));
+INLINE_OVERLOADABLE float length(float2 x) {
+  float m = max(__gen_ocl_fabs(x.s0), __gen_ocl_fabs(x.s1));
+  BODY;
+}
+INLINE_OVERLOADABLE float length(float3 x) {
+  float m = max(__gen_ocl_fabs(x.s0), max(__gen_ocl_fabs(x.s1), __gen_ocl_fabs(x.s2)));
+  BODY;
+}
+INLINE_OVERLOADABLE float length(float4 x) {
+  float m = max(__gen_ocl_fabs(x.s0), max(__gen_ocl_fabs(x.s1), max(__gen_ocl_fabs(x.s2), __gen_ocl_fabs(x.s3))));
+  BODY;
+}
+#undef BODY
+INLINE_OVERLOADABLE float distance(float x, float y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float2 x, float2 y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float3 x, float3 y) { return length(x-y); }
+INLINE_OVERLOADABLE float distance(float4 x, float4 y) { return length(x-y); }
+INLINE_OVERLOADABLE float normalize(float x) {
+  union { float f; unsigned u; } u;
+  u.f = x;
+  if(u.u == 0)
+    return 0.f;
+  if(isnan(x))
+    return NAN;
+  return u.u < 0x7fffffff ? 1.f : -1.f;
+}
+INLINE_OVERLOADABLE float2 normalize(float2 x) {
+  float m = length(x);
+  if(m == 0)
+    return 0;
+  return x / m;
+}
+INLINE_OVERLOADABLE float3 normalize(float3 x) {
+  float m = length(x);
+  if(m == 0)
+    return 0;
+  return x / m;
+}
+INLINE_OVERLOADABLE float4 normalize(float4 x) {
+  float m = length(x);
+  if(m == 0)
+    return 0;
+  return x / m;
+}
+
+INLINE_OVERLOADABLE float fast_length(float x) { return __gen_ocl_fabs(x); }
+INLINE_OVERLOADABLE float fast_length(float2 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_length(float3 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_length(float4 x) { return sqrt(dot(x,x)); }
+INLINE_OVERLOADABLE float fast_distance(float x, float y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float2 x, float2 y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float3 x, float3 y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_distance(float4 x, float4 y) { return length(x-y); }
+INLINE_OVERLOADABLE float fast_normalize(float x) { return x > 0 ? 1.f : (x < 0 ? -1.f : 0.f); }
+INLINE_OVERLOADABLE float2 fast_normalize(float2 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float3 fast_normalize(float3 x) { return x * rsqrt(dot(x, x)); }
+INLINE_OVERLOADABLE float4 fast_normalize(float4 x) { return x * rsqrt(dot(x, x)); }
+
+INLINE_OVERLOADABLE float3 cross(float3 v0, float3 v1) {
+   return v0.yzx*v1.zxy-v0.zxy*v1.yzx;
+}
+INLINE_OVERLOADABLE float4 cross(float4 v0, float4 v1) {
+   return (float4)(v0.yzx*v1.zxy-v0.zxy*v1.yzx, 0.f);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Vector loads and stores
+/////////////////////////////////////////////////////////////////////////////
+
+// These loads and stores use untyped reads and writes, so we can simply cast
+// to vector loads / stores. This is not C99 compliant because of the aliasing
+// rules, but that does not matter here: TBAA is not enabled in the compiler.
+// (See the usage sketch after these macro definitions.)
+#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \
+INLINE_OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
+  return *(SPACE TYPE##DIM *) (p + DIM * offset); \
+} \
+INLINE_OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
+  *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \
+}
+
+#define DECL_UNTYPED_RD_SPACE_N(TYPE, DIM, SPACE) \
+INLINE_OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
+  return *(SPACE TYPE##DIM *) (p + DIM * offset); \
+}
+
+#define DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
+INLINE_OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
+  *(p + 3 * offset) = v.s0; \
+  *(p + 3 * offset + 1) = v.s1; \
+  *(p + 3 * offset + 2) = v.s2; \
+} \
+INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##3)(*(p + 3 * offset), *(p+ 3 * offset + 1), *(p + 3 * offset + 2));\
+}
+
+#define DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
+INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##3)(*(p + 3 * offset), *(p+ 3 * offset + 1), *(p + 3 * offset + 2));\
+}
+
+#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
+  DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)
+
+#define DECL_UNTYPED_RD_ALL_SPACE(TYPE, SPACE) \
+  DECL_UNTYPED_RD_SPACE_N(TYPE, 2, SPACE) \
+  DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
+  DECL_UNTYPED_RD_SPACE_N(TYPE, 4, SPACE) \
+  DECL_UNTYPED_RD_SPACE_N(TYPE, 8, SPACE) \
+  DECL_UNTYPED_RD_SPACE_N(TYPE, 16, SPACE)
+
+#define DECL_UNTYPED_RW_ALL(TYPE) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \
+  DECL_UNTYPED_RD_ALL_SPACE(TYPE, __constant) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)
+
+#define DECL_BYTE_RD_SPACE(TYPE, SPACE) \
+INLINE_OVERLOADABLE TYPE##2 vload2(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##2)(*(p+2*offset), *(p+2*offset+1)); \
+} \
+INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##3)(*(p+3*offset), *(p+3*offset+1), *(p+3*offset+2)); \
+} \
+INLINE_OVERLOADABLE TYPE##4 vload4(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##4)(vload2(2*offset, p), vload2(2*offset, p+2)); \
+} \
+INLINE_OVERLOADABLE TYPE##8 vload8(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##8)(vload4(2*offset, p), vload4(2*offset, p+4)); \
+} \
+INLINE_OVERLOADABLE TYPE##16 vload16(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##16)(vload8(2*offset, p), vload8(2*offset, p+8)); \
+}
+
+#define DECL_BYTE_WR_SPACE(TYPE, SPACE) \
+INLINE_OVERLOADABLE void vstore2(TYPE##2 v, size_t offset, SPACE TYPE *p) {\
+  *(p + 2 * offset) = v.s0; \
+  *(p + 2 * offset + 1) = v.s1; \
+} \
+INLINE_OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
+  *(p + 3 * offset) = v.s0; \
+  *(p + 3 * offset + 1) = v.s1; \
+  *(p + 3 * offset + 2) = v.s2; \
+} \
+INLINE_OVERLOADABLE void vstore4(TYPE##4 v, size_t offset, SPACE TYPE *p) { \
+  vstore2(v.lo, 2*offset, p); \
+  vstore2(v.hi, 2*offset, p+2); \
+} \
+INLINE_OVERLOADABLE void vstore8(TYPE##8 v, size_t offset, SPACE TYPE *p) { \
+  vstore4(v.lo, 2*offset, p); \
+  vstore4(v.hi, 2*offset, p+4); \
+} \
+INLINE_OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p) { \
+  vstore8(v.lo, 2*offset, p); \
+  vstore8(v.hi, 2*offset, p+8); \
+}
+
+#define DECL_BYTE_RW_ALL(TYPE) \
+  DECL_BYTE_RD_SPACE(TYPE, __global) \
+  DECL_BYTE_RD_SPACE(TYPE, __local) \
+  DECL_BYTE_RD_SPACE(TYPE, __private) \
+  DECL_BYTE_RD_SPACE(TYPE, __constant) \
+  DECL_BYTE_WR_SPACE(TYPE, __global) \
+  DECL_BYTE_WR_SPACE(TYPE, __local) \
+  DECL_BYTE_WR_SPACE(TYPE, __private)
+
+DECL_BYTE_RW_ALL(char)
+DECL_BYTE_RW_ALL(uchar)
+DECL_BYTE_RW_ALL(short)
+DECL_BYTE_RW_ALL(ushort)
+DECL_UNTYPED_RW_ALL(int)
+DECL_UNTYPED_RW_ALL(uint)
+DECL_UNTYPED_RW_ALL(long)
+DECL_UNTYPED_RW_ALL(ulong)
+DECL_UNTYPED_RW_ALL(float)
+DECL_UNTYPED_RW_ALL(double)
+
+#undef DECL_UNTYPED_RW_ALL
+#undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_UNTYPED_RD_ALL_SPACE
+#undef DECL_UNTYPED_RW_SPACE_N
+#undef DECL_UNTYPED_RD_SPACE_N
+#undef DECL_UNTYPED_V3_SPACE
+#undef DECL_UNTYPED_RDV3_SPACE
+#undef DECL_BYTE_RD_SPACE
+#undef DECL_BYTE_WR_SPACE
+#undef DECL_BYTE_RW_ALL
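+/* Usage sketch for the vload/vstore family generated above, e.g. with a
+ * __global float *p:
+ *   float4 v = vload4(0, p);    // reads p[0..3]
+ *   vstore4(v, 1, p);           // writes p[4..7]
+ * The 3-element variants access their elements individually. */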
+
+PURE CONST float __gen_ocl_f16to32(short h);
+PURE CONST short __gen_ocl_f32to16(float f);
+
+INLINE_OVERLOADABLE short f32to16_rtp(float f) {
+  short s = __gen_ocl_f32to16(f);
+  float con = __gen_ocl_f16to32(s);
+  //if(isinf(con)) return s;
+  if (f > con)
+    return s - signbit(f) * 2 + 1;
+  else
+    return s;
+}
+
+INLINE_OVERLOADABLE short f32to16_rtn(float f) {
+  short s = __gen_ocl_f32to16(f);
+  float con = __gen_ocl_f16to32(s);
+  //if(isinf(con)) return s;
+  if (con > f)
+    return s + signbit(f) * 2 - 1;
+  else
+    return s;
+}
+
+INLINE_OVERLOADABLE short f32to16_rtz(float f) {
+  short s = __gen_ocl_f32to16(f);
+  float con = __gen_ocl_f16to32(s);
+  //if(isinf(con)) return s;
+  if (((con > f) && !signbit(f)) ||
+      ((con < f) && signbit(f)))
+    return s - 1;
+  else
+    return s;
+}
+
+#define DECL_HALF_LD_SPACE(SPACE) \
+INLINE_OVERLOADABLE float vload_half(size_t offset, const SPACE half *p) { \
+  return __gen_ocl_f16to32(*(SPACE short *)(p + offset)); \
+} \
+INLINE_OVERLOADABLE float2 vload_half2(size_t offset, const SPACE half *p) { \
+  return (float2)(vload_half(offset*2, p), \
+                  vload_half(offset*2 + 1, p)); \
+} \
+INLINE_OVERLOADABLE float3 vload_half3(size_t offset, const SPACE half *p) { \
+  return (float3)(vload_half(offset*3, p), \
+                  vload_half(offset*3 + 1, p), \
+                  vload_half(offset*3 + 2, p)); \
+} \
+INLINE_OVERLOADABLE float3 vloada_half3(size_t offset, const SPACE half *p) { \
+  return (float3)(vload_half(offset*4, p), \
+                  vload_half(offset*4 + 1, p), \
+                  vload_half(offset*4 + 2, p)); \
+} \
+INLINE_OVERLOADABLE float4 vload_half4(size_t offset, const SPACE half *p) { \
+  return (float4)(vload_half2(offset*2, p), \
+                  vload_half2(offset*2 + 1, p)); \
+} \
+INLINE_OVERLOADABLE float8 vload_half8(size_t offset, const SPACE half *p) { \
+  return (float8)(vload_half4(offset*2, p), \
+                  vload_half4(offset*2 + 1, p)); \
+} \
+INLINE_OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p) { \
+  return (float16)(vload_half8(offset*2, p), \
+                   vload_half8(offset*2 + 1, p)); \
+}
+
+#define DECL_HALF_ST_SPACE_ROUND(SPACE, ROUND, FUNC) \
+INLINE_OVERLOADABLE void vstore_half##ROUND(float data, size_t offset, SPACE half *p) { \
+  *(SPACE short *)(p + offset) = FUNC(data); \
+} \
+INLINE_OVERLOADABLE void vstorea_half##ROUND(float data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data, offset, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data.lo, offset*2, p); \
+  vstore_half##ROUND(data.hi, offset*2 + 1, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
+  vstore_half2##ROUND(data, offset, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data.s0, offset*3, p); \
+  vstore_half##ROUND(data.s1, offset*3 + 1, p); \
+  vstore_half##ROUND(data.s2, offset*3 + 2, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data.s0, offset*4, p); \
+  vstore_half##ROUND(data.s1, offset*4 + 1, p); \
+  vstore_half##ROUND(data.s2, offset*4 + 2, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
+  vstore_half2##ROUND(data.lo, offset*2, p); \
+  vstore_half2##ROUND(data.hi, offset*2 + 1, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
+  vstore_half4##ROUND(data, offset, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
+  vstore_half4##ROUND(data.lo, offset*2, p); \
+  vstore_half4##ROUND(data.hi, offset*2 + 1, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
+  vstore_half8##ROUND(data, offset, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \
+  vstore_half8##ROUND(data.lo, offset*2, p); \
+  vstore_half8##ROUND(data.hi, offset*2 + 1, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \
+  vstore_half16##ROUND(data, offset, p); \
+}
+
+#define DECL_HALF_ST_SPACE(SPACE) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE,  , __gen_ocl_f32to16) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rte, __gen_ocl_f32to16) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtz, f32to16_rtz) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtp, f32to16_rtp) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtn, f32to16_rtn)
+
+DECL_HALF_LD_SPACE(__global)
+DECL_HALF_LD_SPACE(__local)
+DECL_HALF_LD_SPACE(__constant)
+DECL_HALF_LD_SPACE(__private)
+
+DECL_HALF_ST_SPACE(__global)
+DECL_HALF_ST_SPACE(__local)
+DECL_HALF_ST_SPACE(__private)
+
+//#undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_HALF_LD_SPACE
+#undef DECL_HALF_ST_SPACE
+#undef DECL_HALF_ST_SPACE_ROUND
+
+#define vloada_half vload_half
+#define vloada_half2 vload_half2
+#define vloada_half4 vload_half4
+#define vloada_half8 vload_half8
+#define vloada_half16 vload_half16
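+
+/* Usage sketch for the half load/store helpers above, e.g. with a
+ * __global half *p:
+ *   float f = vload_half(0, p);    // convert p[0] from half to float
+ *   vstore_half_rtz(f, 0, p);      // store back with round-toward-zero
+ * vstore_half and vstore_half_rte both use __gen_ocl_f32to16 directly. */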
+
+// XXX workaround ptx profile
+#define fabs __gen_ocl_internal_fabs
+#define trunc __gen_ocl_internal_trunc
+#define round __gen_ocl_internal_round
+#define floor __gen_ocl_internal_floor
+#define ceil __gen_ocl_internal_ceil
+#define log __gen_ocl_internal_log
+#define log2 __gen_ocl_internal_log2
+#define log10 __gen_ocl_internal_log10
+#define exp __gen_ocl_internal_exp
+#define exp2 native_exp2
+#define exp10 __gen_ocl_internal_exp10
+#define expm1 __gen_ocl_internal_expm1
+#define fmin __gen_ocl_internal_fmin
+#define fmax __gen_ocl_internal_fmax
+#define fma mad
+#define fdim __gen_ocl_internal_fdim
+#define maxmag __gen_ocl_internal_maxmag
+#define minmag __gen_ocl_internal_minmag
+
+/////////////////////////////////////////////////////////////////////////////
+// Miscellaneous Vector Functions (see 6.11.12 of OCL 1.1 spec)
+/////////////////////////////////////////////////////////////////////////////
+#define DEC2(TYPE, XTYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##2 shuffle(XTYPE x, MASKTYPE##2 mask) { \
+    TYPE##2 y; \
+    y.s0 = ((TYPE *) &x)[mask.s0 & (vec_step(x) - 1)]; \
+    y.s1 = ((TYPE *) &x)[mask.s1 & (vec_step(x) - 1)]; \
+    return y; \
+  }
+
+#define DEC4(TYPE, XTYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##4 shuffle(XTYPE x, MASKTYPE##4 mask) { \
+    TYPE##4 y; \
+    y.s0 = ((TYPE *) &x)[mask.s0 & (vec_step(x) - 1)]; \
+    y.s1 = ((TYPE *) &x)[mask.s1 & (vec_step(x) - 1)]; \
+    y.s2 = ((TYPE *) &x)[mask.s2 & (vec_step(x) - 1)]; \
+    y.s3 = ((TYPE *) &x)[mask.s3 & (vec_step(x) - 1)]; \
+    return y; \
+  }
+
+#define DEC8(TYPE, XTYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##8 shuffle(XTYPE x, MASKTYPE##8 mask) { \
+    TYPE##8 y; \
+    y.s0 = ((TYPE *) &x)[mask.s0 & (vec_step(x) - 1)]; \
+    y.s1 = ((TYPE *) &x)[mask.s1 & (vec_step(x) - 1)]; \
+    y.s2 = ((TYPE *) &x)[mask.s2 & (vec_step(x) - 1)]; \
+    y.s3 = ((TYPE *) &x)[mask.s3 & (vec_step(x) - 1)]; \
+    y.s4 = ((TYPE *) &x)[mask.s4 & (vec_step(x) - 1)]; \
+    y.s5 = ((TYPE *) &x)[mask.s5 & (vec_step(x) - 1)]; \
+    y.s6 = ((TYPE *) &x)[mask.s6 & (vec_step(x) - 1)]; \
+    y.s7 = ((TYPE *) &x)[mask.s7 & (vec_step(x) - 1)]; \
+    return y; \
+  }
+
+#define DEC16(TYPE, XTYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##16 shuffle(XTYPE x, MASKTYPE##16 mask) { \
+    TYPE##16 y; \
+    y.s0 = ((TYPE *) &x)[mask.s0 & (vec_step(x) - 1)]; \
+    y.s1 = ((TYPE *) &x)[mask.s1 & (vec_step(x) - 1)]; \
+    y.s2 = ((TYPE *) &x)[mask.s2 & (vec_step(x) - 1)]; \
+    y.s3 = ((TYPE *) &x)[mask.s3 & (vec_step(x) - 1)]; \
+    y.s4 = ((TYPE *) &x)[mask.s4 & (vec_step(x) - 1)]; \
+    y.s5 = ((TYPE *) &x)[mask.s5 & (vec_step(x) - 1)]; \
+    y.s6 = ((TYPE *) &x)[mask.s6 & (vec_step(x) - 1)]; \
+    y.s7 = ((TYPE *) &x)[mask.s7 & (vec_step(x) - 1)]; \
+    y.s8 = ((TYPE *) &x)[mask.s8 & (vec_step(x) - 1)]; \
+    y.s9 = ((TYPE *) &x)[mask.s9 & (vec_step(x) - 1)]; \
+    y.sa = ((TYPE *) &x)[mask.sa & (vec_step(x) - 1)]; \
+    y.sb = ((TYPE *) &x)[mask.sb & (vec_step(x) - 1)]; \
+    y.sc = ((TYPE *) &x)[mask.sc & (vec_step(x) - 1)]; \
+    y.sd = ((TYPE *) &x)[mask.sd & (vec_step(x) - 1)]; \
+    y.se = ((TYPE *) &x)[mask.se & (vec_step(x) - 1)]; \
+    y.sf = ((TYPE *) &x)[mask.sf & (vec_step(x) - 1)]; \
+    return y; \
+  }
+
+#define DEFMASK(TYPE, MASKTYPE) \
+  DEC2(TYPE, TYPE##2, MASKTYPE); DEC2(TYPE, TYPE##4, MASKTYPE); DEC2(TYPE, TYPE##8, MASKTYPE); DEC2(TYPE, TYPE##16, MASKTYPE) \
+  DEC4(TYPE, TYPE##2, MASKTYPE); DEC4(TYPE, TYPE##4, MASKTYPE); DEC4(TYPE, TYPE##8, MASKTYPE); DEC4(TYPE, TYPE##16, MASKTYPE) \
+  DEC8(TYPE, TYPE##2, MASKTYPE); DEC8(TYPE, TYPE##4, MASKTYPE); DEC8(TYPE, TYPE##8, MASKTYPE); DEC8(TYPE, TYPE##16, MASKTYPE) \
+  DEC16(TYPE, TYPE##2, MASKTYPE); DEC16(TYPE, TYPE##4, MASKTYPE); DEC16(TYPE, TYPE##8, MASKTYPE); DEC16(TYPE, TYPE##16, MASKTYPE)
+
+#define DEF(TYPE) \
+  DEFMASK(TYPE, uchar) \
+  DEFMASK(TYPE, ushort) \
+  DEFMASK(TYPE, uint) \
+  DEFMASK(TYPE, ulong)
+
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(float)
+DEF(long)
+DEF(ulong)
+#undef DEF
+#undef DEFMASK
+#undef DEC2
+#undef DEC4
+#undef DEC8
+#undef DEC16
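+/* Usage sketch for shuffle() above: each mask component selects an element of
+ * x, masked to the source width, e.g.
+ *   float4 v = (float4)(1.f, 2.f, 3.f, 4.f);
+ *   float4 r = shuffle(v, (uint4)(3, 2, 1, 0));   // r == (4, 3, 2, 1)
+ */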
+
+#define DEC2(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##2 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##2 mask) { \
+    return shuffle((TEMPTYPE)(x, y), mask); \
+  }
+
+#define DEC2X(TYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##2 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##2 mask) { \
+    TYPE##2 z; \
+    z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
+    z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
+    return z; \
+  }
+
+#define DEC4(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##4 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##4 mask) { \
+    return shuffle((TEMPTYPE)(x, y), mask); \
+  }
+
+#define DEC4X(TYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##4 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##4 mask) { \
+    TYPE##4 z; \
+    z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
+    z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
+    z.s2 = mask.s2 < 16 ? ((TYPE *)&x)[mask.s2] : ((TYPE *)&y)[mask.s2 & 15]; \
+    z.s3 = mask.s3 < 16 ? ((TYPE *)&x)[mask.s3] : ((TYPE *)&y)[mask.s3 & 15]; \
+    return z; \
+  }
+
+#define DEC8(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##8 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##8 mask) { \
+    return shuffle((TEMPTYPE)(x, y), mask); \
+  }
+
+#define DEC8X(TYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##8 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##8 mask) { \
+    TYPE##8 z; \
+    z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
+    z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
+    z.s2 = mask.s2 < 16 ? ((TYPE *)&x)[mask.s2] : ((TYPE *)&y)[mask.s2 & 15]; \
+    z.s3 = mask.s3 < 16 ? ((TYPE *)&x)[mask.s3] : ((TYPE *)&y)[mask.s3 & 15]; \
+    z.s4 = mask.s4 < 16 ? ((TYPE *)&x)[mask.s4] : ((TYPE *)&y)[mask.s4 & 15]; \
+    z.s5 = mask.s5 < 16 ? ((TYPE *)&x)[mask.s5] : ((TYPE *)&y)[mask.s5 & 15]; \
+    z.s6 = mask.s6 < 16 ? ((TYPE *)&x)[mask.s6] : ((TYPE *)&y)[mask.s6 & 15]; \
+    z.s7 = mask.s7 < 16 ? ((TYPE *)&x)[mask.s7] : ((TYPE *)&y)[mask.s7 & 15]; \
+    return z; \
+  }
+
+#define DEC16(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##16 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##16 mask) { \
+    return shuffle((TEMPTYPE)(x, y), mask); \
+  }
+
+#define DEC16X(TYPE, MASKTYPE) \
+  INLINE_OVERLOADABLE TYPE##16 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##16 mask) { \
+    TYPE##16 z; \
+    z.s0 = mask.s0 < 16 ? ((TYPE *)&x)[mask.s0] : ((TYPE *)&y)[mask.s0 & 15]; \
+    z.s1 = mask.s1 < 16 ? ((TYPE *)&x)[mask.s1] : ((TYPE *)&y)[mask.s1 & 15]; \
+    z.s2 = mask.s2 < 16 ? ((TYPE *)&x)[mask.s2] : ((TYPE *)&y)[mask.s2 & 15]; \
+    z.s3 = mask.s3 < 16 ? ((TYPE *)&x)[mask.s3] : ((TYPE *)&y)[mask.s3 & 15]; \
+    z.s4 = mask.s4 < 16 ? ((TYPE *)&x)[mask.s4] : ((TYPE *)&y)[mask.s4 & 15]; \
+    z.s5 = mask.s5 < 16 ? ((TYPE *)&x)[mask.s5] : ((TYPE *)&y)[mask.s5 & 15]; \
+    z.s6 = mask.s6 < 16 ? ((TYPE *)&x)[mask.s6] : ((TYPE *)&y)[mask.s6 & 15]; \
+    z.s7 = mask.s7 < 16 ? ((TYPE *)&x)[mask.s7] : ((TYPE *)&y)[mask.s7 & 15]; \
+    z.s8 = mask.s8 < 16 ? ((TYPE *)&x)[mask.s8] : ((TYPE *)&y)[mask.s8 & 15]; \
+    z.s9 = mask.s9 < 16 ? ((TYPE *)&x)[mask.s9] : ((TYPE *)&y)[mask.s9 & 15]; \
+    z.sa = mask.sa < 16 ? ((TYPE *)&x)[mask.sa] : ((TYPE *)&y)[mask.sa & 15]; \
+    z.sb = mask.sb < 16 ? ((TYPE *)&x)[mask.sb] : ((TYPE *)&y)[mask.sb & 15]; \
+    z.sc = mask.sc < 16 ? ((TYPE *)&x)[mask.sc] : ((TYPE *)&y)[mask.sc & 15]; \
+    z.sd = mask.sd < 16 ? ((TYPE *)&x)[mask.sd] : ((TYPE *)&y)[mask.sd & 15]; \
+    z.se = mask.se < 16 ? ((TYPE *)&x)[mask.se] : ((TYPE *)&y)[mask.se & 15]; \
+    z.sf = mask.sf < 16 ? ((TYPE *)&x)[mask.sf] : ((TYPE *)&y)[mask.sf & 15]; \
+    return z; \
+  }
+
+#define DEFMASK(TYPE, MASKTYPE) \
+  DEC2(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+  DEC2(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+  DEC2(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+  DEC2X(TYPE, MASKTYPE) \
+  DEC4(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+  DEC4(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+  DEC4(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+  DEC4X(TYPE, MASKTYPE) \
+  DEC8(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+  DEC8(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+  DEC8(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+  DEC8X(TYPE, MASKTYPE) \
+  DEC16(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+  DEC16(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+  DEC16(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+  DEC16X(TYPE, MASKTYPE)
+
+#define DEF(TYPE) \
+  DEFMASK(TYPE, uchar) \
+  DEFMASK(TYPE, ushort) \
+  DEFMASK(TYPE, uint) \
+  DEFMASK(TYPE, ulong)
+
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(float)
+DEF(long)
+DEF(ulong)
+#undef DEF
+#undef DEFMASK
+#undef DEC2
+#undef DEC2X
+#undef DEC4
+#undef DEC4X
+#undef DEC8
+#undef DEC8X
+#undef DEC16
+#undef DEC16X
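+/* Usage sketch for shuffle2() above: the mask indexes the concatenation of
+ * (x, y), e.g.
+ *   float2 a = (float2)(1.f, 2.f), b = (float2)(3.f, 4.f);
+ *   float2 r = shuffle2(a, b, (uint2)(0, 3));   // r == (1, 4)
+ */
+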
+/////////////////////////////////////////////////////////////////////////////
+// Synchronization functions
+/////////////////////////////////////////////////////////////////////////////
+#define CLK_LOCAL_MEM_FENCE  (1 << 0)
+#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+
+void __gen_ocl_barrier_local(void);
+void __gen_ocl_barrier_global(void);
+void __gen_ocl_barrier_local_and_global(void);
+
+typedef uint cl_mem_fence_flags;
+void barrier(cl_mem_fence_flags flags);
+
+INLINE void mem_fence(cl_mem_fence_flags flags) {
+}
+INLINE void read_mem_fence(cl_mem_fence_flags flags) {
+}
+INLINE void write_mem_fence(cl_mem_fence_flags flags) {
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// Async Copies and prefetch
+/////////////////////////////////////////////////////////////////////////////
+#define BODY(SRC_STRIDE, DST_STRIDE) \
+  uint size = get_local_size(2) * get_local_size(1) * get_local_size(0); \
+  uint count = num / size;  \
+  uint offset = get_local_id(2) * get_local_size(1) + get_local_id(1);  \
+  offset = offset * get_local_size(0) + get_local_id(0); \
+  for(uint i=0; i<count; i+=1) { \
+    *(dst + offset * DST_STRIDE) = *(src + offset * SRC_STRIDE); \
+    offset += size;                                 \
+  } \
+  if(offset < num) \
+    *(dst + offset * DST_STRIDE) = *(src + offset * SRC_STRIDE); \
+  return 0;
+
+#define DEFN(TYPE) \
+INLINE_OVERLOADABLE event_t async_work_group_copy (local TYPE *dst, const global TYPE *src, \
+                                                    size_t num, event_t event) { \
+  BODY(1, 1); \
+} \
+INLINE_OVERLOADABLE event_t async_work_group_copy (global TYPE *dst, const local TYPE *src, \
+                                                    size_t num, event_t event) { \
+  BODY(1, 1); \
+} \
+INLINE_OVERLOADABLE event_t async_work_group_strided_copy (local TYPE *dst, const global TYPE *src, \
+                                                            size_t num, size_t src_stride, event_t event) { \
+  BODY(src_stride, 1); \
+} \
+INLINE_OVERLOADABLE event_t async_work_group_strided_copy (global TYPE *dst, const local TYPE *src, \
+                                                            size_t num, size_t dst_stride, event_t event) { \
+  BODY(1, dst_stride); \
+}
+#define DEF(TYPE) \
+  DEFN(TYPE); DEFN(TYPE##2); DEFN(TYPE##3); DEFN(TYPE##4); DEFN(TYPE##8); DEFN(TYPE##16);
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(long)
+DEF(ulong)
+DEF(float)
+DEF(double)
+#undef BODY
+#undef DEFN
+#undef DEF
+
+INLINE void wait_group_events (int num_events, event_t *event_list) {
+  barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+}
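+/* Usage sketch for the async copy helpers above (all work-items of the group
+ * must reach these calls), e.g. with __local float *lbuf, a const __global
+ * float *gbuf and n elements to copy:
+ *   event_t e = async_work_group_copy(lbuf, gbuf, n, 0);
+ *   wait_group_events(1, &e);
+ * The copy is split cooperatively across the work-group; the event is unused
+ * by this implementation and wait_group_events is just a barrier. */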
+
+#define DEFN(TYPE) \
+INLINE_OVERLOADABLE void prefetch(const global TYPE *p, size_t num) { }
+#define DEF(TYPE) \
+DEFN(TYPE); DEFN(TYPE##2); DEFN(TYPE##3); DEFN(TYPE##4); DEFN(TYPE##8); DEFN(TYPE##16)
+DEF(char);
+DEF(uchar);
+DEF(short);
+DEF(ushort);
+DEF(int);
+DEF(uint);
+DEF(long);
+DEF(ulong);
+DEF(float);
+#undef DEFN
+#undef DEF
+
+/////////////////////////////////////////////////////////////////////////////
+// Atomic functions
+/////////////////////////////////////////////////////////////////////////////
+OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_add(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_sub(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_sub(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_and(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_and(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_or(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_or(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xor(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xor(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xchg(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xchg(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_inc(__global uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_inc(__local uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_dec(__global uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_dec(__local uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__global uint *p, uint cmp, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__local uint *p, uint cmp, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imin(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imin(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imax(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imax(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umin(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umin(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umax(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umax(__local uint *p, uint val);
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX)                        \
+  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
+    return (TYPE)__gen_ocl_##PREFIX##NAME((SPACE uint *)p, val);            \
+  }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE, PREFIX) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global, PREFIX) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local, PREFIX)
+
+#define DECL_ATOMIC_OP(NAME) \
+  DECL_ATOMIC_OP_TYPE(NAME, uint, atomic_)        \
+  DECL_ATOMIC_OP_TYPE(NAME, int, atomic_)
+
+DECL_ATOMIC_OP(add)
+DECL_ATOMIC_OP(sub)
+DECL_ATOMIC_OP(and)
+DECL_ATOMIC_OP(or)
+DECL_ATOMIC_OP(xor)
+DECL_ATOMIC_OP(xchg)
+DECL_ATOMIC_OP_TYPE(min, int, atomic_i)
+DECL_ATOMIC_OP_TYPE(max, int, atomic_i)
+DECL_ATOMIC_OP_TYPE(min, uint, atomic_u)
+DECL_ATOMIC_OP_TYPE(max, uint, atomic_u)
+
+#undef DECL_ATOMIC_OP_SPACE
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX)                        \
+  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
+    return as_float(__gen_ocl_##PREFIX##NAME((SPACE uint *)p, as_uint(val))); \
+  }
+DECL_ATOMIC_OP_SPACE(xchg, float, __global, atomic_)
+DECL_ATOMIC_OP_SPACE(xchg, float, __local, atomic_)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE) \
+  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p) { \
+    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p); \
+  }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
+
+#define DECL_ATOMIC_OP(NAME) \
+  DECL_ATOMIC_OP_TYPE(NAME, uint) \
+  DECL_ATOMIC_OP_TYPE(NAME, int)
+
+DECL_ATOMIC_OP(inc)
+DECL_ATOMIC_OP(dec)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE)  \
+  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE cmp, TYPE val) { \
+    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p, (uint)cmp, (uint)val); \
+  }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
+
+#define DECL_ATOMIC_OP(NAME) \
+  DECL_ATOMIC_OP_TYPE(NAME, uint) \
+  DECL_ATOMIC_OP_TYPE(NAME, int)
+
+DECL_ATOMIC_OP(cmpxchg)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+// XXX for the conformance test
+// The following atom_xxx APIs come from the OpenCL 1.0 spec, but the
+// conformance test suite still exercises them, so alias them to atomic_xxx.
+#define atom_add atomic_add
+#define atom_sub atomic_sub
+#define atom_and atomic_and
+#define atom_or atomic_or
+#define atom_xor atomic_xor
+#define atom_xchg atomic_xchg
+#define atom_min atomic_min
+#define atom_max atomic_max
+#define atom_inc atomic_inc
+#define atom_dec atomic_dec
+#define atom_cmpxchg atomic_cmpxchg
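+
+// A minimal usage sketch (hypothetical kernel): concurrent work-items update the
+// same global bin safely through atomic_inc:
+//
+//   __kernel void histogram(__global const uchar *pixels, __global uint *bins) {
+//     atomic_inc(&bins[pixels[get_global_id(0)]]);
+//   }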
+
+/////////////////////////////////////////////////////////////////////////////
+// Force the compilation to SIMD8 or SIMD16
+/////////////////////////////////////////////////////////////////////////////
+
+int __gen_ocl_force_simd8(void);
+int __gen_ocl_force_simd16(void);
+
+#define NULL ((void*)0)
+
+// ##BEGIN_COMMON_DEFINES##
+// ##END_COMMON_DEFINES##
+
+/////////////////////////////////////////////////////////////////////////////
+// Image access functions
+/////////////////////////////////////////////////////////////////////////////
+
+// 1D read
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, uint sampler_offset);
+
+// 2D & 1D Array read
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+
+// 3D & 2D Array read
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+
+// 1D write
+OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int4 color);
+OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, uint4 color);
+OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, float4 color);
+
+// 2D & 1D Array write
+OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);
+OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, uint4 color);
+OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, float4 color);
+
+// 3D & 2D Array write
+OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int w, int4 color);
+OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, int w, uint4 color);
+OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, int w, float4 color);
+
+int __gen_ocl_get_image_width(uint surface_id);
+int __gen_ocl_get_image_height(uint surface_id);
+int __gen_ocl_get_image_channel_data_type(uint surface_id);
+int __gen_ocl_get_image_channel_order(uint surface_id);
+int __gen_ocl_get_image_depth(uint surface_id);
+/* The printf function. */
+/* Since LLVM 3.4, C string literals all live in the constant address space. */
+#if 100*__clang_major__ + __clang_minor__ < 304
+int __gen_ocl_printf_stub(const char * format, ...);
+#else
+int __gen_ocl_printf_stub(constant char * format, ...);
+#endif
+#define printf __gen_ocl_printf_stub
+
+// 2D 3D Image Common Macro
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+#define GEN_FIX_1 1
+#else
+#define GEN_FIX_1 0
+#endif
+
+#define GET_IMAGE(cl_image, surface_id) \
+    uint surface_id = (uint)cl_image
+INLINE_OVERLOADABLE float __gen_compute_array_index(const float index, image1d_array_t image)
+{
+  GET_IMAGE(image, surface_id);
+  float array_size = __gen_ocl_get_image_depth(surface_id);
+  return clamp(rint(index), 0.f, array_size - 1.f);
+}
+
+INLINE_OVERLOADABLE float __gen_compute_array_index(float index, image2d_array_t image)
+{
+  GET_IMAGE(image, surface_id);
+  float array_size = __gen_ocl_get_image_depth(surface_id);
+  return clamp(rint(index), 0.f, array_size - 1.f);
+}
+
+INLINE_OVERLOADABLE int __gen_compute_array_index(int index, image1d_array_t image)
+{
+  GET_IMAGE(image, surface_id);
+  int array_size = __gen_ocl_get_image_depth(surface_id);
+  return clamp(index, 0, array_size - 1);
+}
+
+INLINE_OVERLOADABLE int __gen_compute_array_index(int index, image2d_array_t image)
+{
+  GET_IMAGE(image, surface_id);
+  int array_size = __gen_ocl_get_image_depth(surface_id);
+  return clamp(index, 0, array_size - 1);
+}
+
+#define DECL_READ_IMAGE0(int_clamping_fix,                                   \
+                        image_type, type, suffix, coord_type, n)             \
+  INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
+                                               const sampler_t sampler,      \
+                                               coord_type coord)             \
+  {                                                                          \
+    GET_IMAGE(cl_image, surface_id);                                         \
+    GET_IMAGE_ARRAY_SIZE(cl_image, coord, int, ai);                          \
+    if (int_clamping_fix &&                                                  \
+        ((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP) &&             \
+        ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST))               \
+            return   __gen_ocl_read_image ##suffix(                          \
+                        EXPEND_READ_COORD(surface_id, sampler, coord));      \
+    return  __gen_ocl_read_image ##suffix(                                   \
+                    EXPEND_READ_COORDF(surface_id, sampler, coord), 0);      \
+  }
+
+#define DECL_READ_IMAGE1(float_coord_rounding_fix, int_clamping_fix,         \
+                        image_type, type, suffix, coord_type, n)             \
+  INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
+                                               const sampler_t sampler,      \
+                                               coord_type coord)             \
+  {                                                                          \
+    GET_IMAGE(cl_image, surface_id);                                         \
+    GET_IMAGE_ARRAY_SIZE(cl_image, coord, float, ai)                         \
+    coord_type tmpCoord = coord;                                             \
+    if (float_coord_rounding_fix | int_clamping_fix) {                       \
+      if (((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP)              \
+          && ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) {        \
+        if (float_coord_rounding_fix                                         \
+            && ((sampler & CLK_NORMALIZED_COORDS_TRUE) == 0)) {              \
+          FIXUP_FLOAT_COORD(tmpCoord);                                       \
+        }                                                                    \
+        if (int_clamping_fix) {                                              \
+            coord_type intCoord;                                             \
+            if (sampler & CLK_NORMALIZED_COORDS_TRUE) {                      \
+              DENORMALIZE_COORD(surface_id, intCoord, tmpCoord);             \
+            } else                                                           \
+              intCoord = tmpCoord;                                           \
+            return   __gen_ocl_read_image ##suffix(                          \
+                       EXPEND_READ_COORDI(surface_id, sampler, intCoord));\
+       }                                                                     \
+      }                                                                      \
+    }                                                                        \
+    return  __gen_ocl_read_image ##suffix(                                   \
+                        EXPEND_READ_COORDF(surface_id, sampler, tmpCoord), 0);\
+  }
+
+#define DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, coord_type, n)   \
+  INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
+                                               coord_type coord)             \
+  {                                                                          \
+    GET_IMAGE(cl_image, surface_id);                                         \
+    GET_IMAGE_ARRAY_SIZE(cl_image, coord, int, ai)                           \
+    return __gen_ocl_read_image ##suffix(                                    \
+           EXPEND_READ_COORDF(surface_id,                                    \
+                             CLK_NORMALIZED_COORDS_FALSE                     \
+                             | CLK_ADDRESS_NONE                              \
+                             | CLK_FILTER_NEAREST, (float)coord), 0);        \
+  }
+
+#define DECL_WRITE_IMAGE(image_type, type, suffix, coord_type) \
+  INLINE_OVERLOADABLE void write_image ##suffix(image_type cl_image, coord_type coord, type color)\
+  {\
+    GET_IMAGE(cl_image, surface_id);\
+    __gen_ocl_write_image ##suffix(EXPEND_WRITE_COORD(surface_id, coord, color));\
+  }
+
+#define DECL_IMAGE_INFO_COMMON(image_type)    \
+  INLINE_OVERLOADABLE  int get_image_channel_data_type(image_type image)\
+  { \
+    GET_IMAGE(image, surface_id);\
+    return __gen_ocl_get_image_channel_data_type(surface_id); \
+  }\
+  INLINE_OVERLOADABLE  int get_image_channel_order(image_type image)\
+  { \
+    GET_IMAGE(image, surface_id);\
+    return __gen_ocl_get_image_channel_order(surface_id); \
+  } \
+  INLINE_OVERLOADABLE int get_image_width(image_type image) \
+  { \
+    GET_IMAGE(image, surface_id); \
+    return __gen_ocl_get_image_width(surface_id);  \
+  }
+
+// 1D
+#define DECL_IMAGE(int_clamping_fix, image_type, type, suffix)                       \
+  DECL_READ_IMAGE0(int_clamping_fix, image_type, type, suffix, int, 1)               \
+  DECL_READ_IMAGE1(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float, 1)  \
+  DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int, 1)                        \
+  DECL_WRITE_IMAGE(image_type, type, suffix, int)                                    \
+  DECL_WRITE_IMAGE(image_type, type, suffix, float)
+
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord, 1
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int)(coord < 0 ? -1 : coord), 1
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord = srcCoord * __gen_ocl_get_image_width(id);
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord, color
+#define GET_IMAGE_ARRAY_SIZE(a,b,c,d)
+
+#define FIXUP_FLOAT_COORD(tmpCoord)                            \
+  {                                                            \
+    if (tmpCoord < 0 && tmpCoord > -0x1p-20f)                  \
+      tmpCoord += -0x1p-9;                                     \
+  }
+
+DECL_IMAGE(GEN_FIX_1, image1d_t, int4, i)
+DECL_IMAGE(GEN_FIX_1, image1d_t, uint4, ui)
+DECL_IMAGE(0, image1d_t, float4, f)
+DECL_IMAGE(GEN_FIX_1, image1d_buffer_t, int4, i)
+DECL_IMAGE(GEN_FIX_1, image1d_buffer_t, uint4, ui)
+DECL_IMAGE(0, image1d_buffer_t, float4, f)
+
+// 1D Info
+DECL_IMAGE_INFO_COMMON(image1d_t)
+DECL_IMAGE_INFO_COMMON(image1d_buffer_t)
+
+#undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDF
+#undef EXPEND_READ_COORDI
+#undef DENORMALIZE_COORD
+#undef EXPEND_WRITE_COORD
+#undef FIXUP_FLOAT_COORD
+#undef DECL_IMAGE
+// End of 1D
+
+#define DECL_IMAGE(int_clamping_fix, image_type, type, suffix, n)                       \
+  DECL_READ_IMAGE0(int_clamping_fix, image_type, type, suffix, int ##n, n)              \
+  DECL_READ_IMAGE1(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float ##n, n) \
+  DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int ##n, n)                       \
+  DECL_WRITE_IMAGE(image_type, type, suffix, int ## n)                                  \
+  DECL_WRITE_IMAGE(image_type, type, suffix, float ## n)
+// 2D
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, 1
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)coord.s1
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int)(coord.s0 < 0 ? -1 : coord.s0), \
+                                               (int)(coord.s1 < 0 ? -1 : coord.s1), 1
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
+                                                  dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id);
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, color
+
+#define FIXUP_FLOAT_COORD(tmpCoord)                            \
+  {                                                            \
+    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)            \
+      tmpCoord.s0 += -0x1p-9;                                  \
+    if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f)            \
+      tmpCoord.s1 += -0x1p-9f;                                 \
+  }
+
+DECL_IMAGE(GEN_FIX_1, image2d_t, int4, i, 2)
+DECL_IMAGE(GEN_FIX_1, image2d_t, uint4, ui, 2)
+DECL_IMAGE(0, image2d_t, float4, f, 2)
+
+// 1D Array
+#undef GET_IMAGE_ARRAY_SIZE
+#undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDF
+#undef EXPEND_READ_COORDI
+#undef DENORMALIZE_COORD
+#undef EXPEND_WRITE_COORD
+#undef FIXUP_FLOAT_COORD
+
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, (int)0, ai, 2
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)ai
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int)(coord.s0 < 0 ? -1 : coord.s0), 0, (int)ai, 2
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id);
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, __gen_compute_array_index(coord.s1, cl_image), color
+#define GET_IMAGE_ARRAY_SIZE(image, coord, coord_type, ai) \
+  coord_type ai = __gen_compute_array_index(coord.s1, image);
+
+#define FIXUP_FLOAT_COORD(tmpCoord)                            \
+  {                                                            \
+    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)            \
+      tmpCoord.s0 += -0x1p-9;                                  \
+  }
+
+DECL_IMAGE(GEN_FIX_1, image1d_array_t, int4, i, 2)
+DECL_IMAGE(GEN_FIX_1, image1d_array_t, uint4, ui, 2)
+DECL_IMAGE(0, image1d_array_t, float4, f, 2)
+
+// 2D Info
+DECL_IMAGE_INFO_COMMON(image2d_t)
+INLINE_OVERLOADABLE int get_image_height(image2d_t image)
+{
+  GET_IMAGE(image, surface_id);
+  return __gen_ocl_get_image_height(surface_id);
+}
+INLINE_OVERLOADABLE int2 get_image_dim(image2d_t image)
+{
+  return (int2){get_image_width(image), get_image_height(image)};
+}
+
+// 1D Array info
+DECL_IMAGE_INFO_COMMON(image1d_array_t)
+INLINE_OVERLOADABLE size_t get_image_array_size(image1d_array_t image)
+{
+  GET_IMAGE(image, surface_id);
+  return __gen_ocl_get_image_depth(surface_id);
+}
+
+#undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDI
+#undef EXPEND_READ_COORDF
+#undef DENORMALIZE_COORD
+#undef EXPEND_WRITE_COORD
+#undef FIXUP_FLOAT_COORD
+#undef GET_IMAGE_ARRAY_SIZE
+// End of 2D and 1D Array
+
+// 3D
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, coord.s2, 1
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)coord.s1, (float)coord.s2
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int) (coord.s0 < 0 ? -1 : coord.s0), \
+                                               (int)(coord.s1 < 0 ? -1 : coord.s1), (int)(coord.s2 < 0 ? -1 : coord.s2), 1
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
+                                                  dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id); \
+                                                  dstCoord.z = srcCoord.z * __gen_ocl_get_image_depth(id);
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, coord.s2, color
+
+#define FIXUP_FLOAT_COORD(tmpCoord)                             \
+  {                                                             \
+    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20)              \
+      tmpCoord.s0 += -0x1p-9;                                   \
+    if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20)              \
+      tmpCoord.s1 += -0x1p-9;                                   \
+    if (tmpCoord.s2 < 0 && tmpCoord.s2 > -0x1p-20)              \
+      tmpCoord.s2 += -0x1p-9;                                   \
+  }
+#define GET_IMAGE_ARRAY_SIZE(a,b,c,d)
+
+DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 4)
+DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 4)
+DECL_IMAGE(0, image3d_t, float4, f, 4)
+
+DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 3)
+DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 3)
+DECL_IMAGE(0, image3d_t, float4, f, 3)
+
+#undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDF
+#undef EXPEND_READ_COORDI
+#undef DENORMALIZE_COORD
+#undef EXPEND_WRITE_COORD
+#undef FIXUP_FLOAT_COORD
+#undef GET_IMAGE_ARRAY_SIZE
+
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, ai, 1
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)coord.s1, (float)ai
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int) (coord.s0 < 0 ? -1 : coord.s0), \
+                                               (int)(coord.s1 < 0 ? -1 : coord.s1), (int)ai, 1
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
+                                                  dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id);
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, __gen_compute_array_index(coord.s2, cl_image), color
+
+#define FIXUP_FLOAT_COORD(tmpCoord)                             \
+  {                                                             \
+    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20)              \
+      tmpCoord.s0 += -0x1p-9;                                   \
+    if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20)              \
+      tmpCoord.s1 += -0x1p-9;                                   \
+  }
+#define GET_IMAGE_ARRAY_SIZE(image, coord, coord_type, ai) \
+  coord_type ai = __gen_compute_array_index(coord.s2, image);
+
+// 2D Array
+DECL_IMAGE(GEN_FIX_1, image2d_array_t, int4, i, 4)
+DECL_IMAGE(GEN_FIX_1, image2d_array_t, uint4, ui, 4)
+DECL_IMAGE(0, image2d_array_t, float4, f, 4)
+
+DECL_IMAGE(GEN_FIX_1, image2d_array_t, int4, i, 3)
+DECL_IMAGE(GEN_FIX_1, image2d_array_t, uint4, ui, 3)
+DECL_IMAGE(0, image2d_array_t, float4, f, 3)
+
+// 3D Info
+DECL_IMAGE_INFO_COMMON(image3d_t)
+INLINE_OVERLOADABLE int get_image_height(image3d_t image)
+{
+  GET_IMAGE(image, surface_id);
+  return __gen_ocl_get_image_height(surface_id);
+}
+INLINE_OVERLOADABLE int get_image_depth(image3d_t image)
+{
+  GET_IMAGE(image, surface_id);
+  return __gen_ocl_get_image_depth(surface_id);
+}
+INLINE_OVERLOADABLE int4 get_image_dim(image3d_t image)
+{
+  return (int4){get_image_width(image), get_image_height(image), get_image_depth(image), 0};
+}
+
+// 2D Array Info
+DECL_IMAGE_INFO_COMMON(image2d_array_t)
+INLINE_OVERLOADABLE int get_image_height(image2d_array_t image)
+{
+  GET_IMAGE(image, surface_id);
+  return __gen_ocl_get_image_height(surface_id);
+}
+INLINE_OVERLOADABLE int2 get_image_dim(image2d_array_t image)
+{
+  return (int2){get_image_width(image), get_image_height(image)};
+}
+INLINE_OVERLOADABLE size_t get_image_array_size(image2d_array_t image)
+{
+  GET_IMAGE(image, surface_id);
+  return __gen_ocl_get_image_depth(surface_id);
+}
+
+#undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDF
+#undef EXPEND_READ_COORDI
+#undef DENORMALIZE_COORD
+#undef EXPEND_WRITE_COORD
+#undef FIXUP_FLOAT_COORD
+#undef GET_IMAGE_ARRAY_SIZE
+// End of 3D and 2D Array
+
+#undef DECL_IMAGE
+#undef DECL_READ_IMAGE
+#undef DECL_READ_IMAGE_NOSAMPLER
+#undef DECL_WRITE_IMAGE
+#undef GEN_FIX_1
+// End of Image
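+
+// A minimal usage sketch (hypothetical kernel) of the image built-ins declared
+// above: read a 2D image with integer coordinates and a nearest-filter sampler,
+// then write the texel back out:
+//
+//   __kernel void copy_image(__read_only image2d_t src, __write_only image2d_t dst) {
+//     const sampler_t s = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP |
+//                         CLK_FILTER_NEAREST;
+//     int2 coord = (int2)(get_global_id(0), get_global_id(1));
+//     write_imagef(dst, coord, read_imagef(src, s, coord));
+//   }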
+
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_acosh (float x)
+{
+    return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_asinh (float x)
+{
+    return native_log(x + native_sqrt(x * x + 1));
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_atanh (float x)
+{
+    return 0.5f * native_log((1 + x) / (1 - x));
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_cbrt (float x)
+{
+    return __gen_ocl_pow(x, 0.3333333333f);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_cos (float x)
+{
+    return native_cos(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_cosh (float x)
+{
+    return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_cospi (float x)
+{
+    return __gen_ocl_cos(x * M_PI_F);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_exp (float x)
+{
+    return native_exp(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_exp10 (float x)
+{
+    return native_exp10(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_expm1 (float x)
+{
+    return __gen_ocl_pow(M_E_F, x) - 1;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_fmod (float x, float y)
+{
+    return x-y*__gen_ocl_rndz(x/y);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_hypot (float x, float y)
+{
+    return __gen_ocl_sqrt(x*x + y*y);
+}
+
+INLINE_OVERLOADABLE int __gen_ocl_internal_fastpath_ilogb (float x)
+{
+    return __gen_ocl_rndd(native_log2(x));
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_ldexp (float x, int n)
+{
+    return __gen_ocl_pow(2, n) * x;
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_log (float x)
+{
+    return native_log(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_log2 (float x)
+{
+    return native_log2(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_log10 (float x)
+{
+    return native_log10(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_log1p (float x)
+{
+    return native_log(x + 1);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_logb (float x)
+{
+    return __gen_ocl_rndd(native_log2(x));
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_remainder (float x, float y)
+{
+    return x-y*__gen_ocl_rnde(x/y);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_rootn(float x, int n)
+{
+  return internal_rootn(x, n, 1);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sin (float x)
+{
+    return native_sin(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, __global float *cosval)
+{
+    *cosval = native_cos(x);
+    return native_sin(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, __local float *cosval)
+{
+    *cosval = native_cos(x);
+    return native_sin(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, __private float *cosval)
+{
+    *cosval = native_cos(x);
+    return native_sin(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sinh (float x)
+{
+    return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sinpi (float x)
+{
+    return __gen_ocl_sin(x * M_PI_F);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_tan (float x)
+{
+    return native_tan(x);
+}
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_tanh (float x)
+{
+    float y = native_exp(-2 * x);
+    return (1 - y) / (1 + y);
+}
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : disable
+
+#undef DECL_IMAGE
+#undef DECL_READ_IMAGE
+#undef DECL_READ_IMAGE_NOSAMPLER
+#undef DECL_WRITE_IMAGE
+
+#undef GET_IMAGE
+// ##BEGIN_VECTOR##
+// ##END_VECTOR##
+
+#undef INLINE_OVERLOADABLE
+#undef PURE
+#undef CONST
+#undef OVERLOADABLE
+#undef INLINE
+
+#endif /* __GEN_OCL_STDLIB_H__ */
diff --git a/backend/src/sys/alloc.cpp b/backend/src/sys/alloc.cpp
new file mode 100644
index 0000000..2db95c9
--- /dev/null
+++ b/backend/src/sys/alloc.cpp
@@ -0,0 +1,359 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file alloc.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ *  Provides facilities to track allocations and to pre-initialize memory at
+ *  allocation and free time
+ */
+#include "sys/alloc.hpp"
+#include "sys/atomic.hpp"
+#include "sys/mutex.hpp"
+
+#if GBE_DEBUG_MEMORY
+#include <tr1/unordered_map>
+#include <cstring>
+#endif /* GBE_DEBUG_MEMORY */
+
+#if defined(__ICC__)
+#include <stdint.h>
+#endif /* __ICC__ */
+#include <map>
+#include <vector>
+#include <iomanip>
+
+////////////////////////////////////////////////////////////////////////////////
+/// Memory debugger
+////////////////////////////////////////////////////////////////////////////////
+
+#if GBE_DEBUG_MEMORY
+namespace gbe
+{
+  /*! Store the data recorded for each allocation */
+  struct AllocData {
+    INLINE AllocData(void) {}
+    INLINE AllocData(int fileName_, int functionName_, int line_, intptr_t alloc_) :
+      fileName(fileName_), functionName(functionName_), line(line_), alloc(alloc_) {}
+    int fileName, functionName, line;
+    intptr_t alloc;
+  };
+
+  /*! Store allocation information */
+  struct MemDebugger {
+    MemDebugger(void) : unfreedNum(0), allocNum(0) {}
+    ~MemDebugger(void) { this->dumpAlloc(); }
+    void* insertAlloc(void *ptr, const char *file, const char *function, int line);
+    void removeAlloc(void *ptr);
+    void dumpAlloc(void);
+    void dumpData(const AllocData &data);
+    /*! Count the still unfreed allocations */
+    volatile intptr_t unfreedNum;
+    /*! Total number of allocations done */
+    volatile intptr_t allocNum;
+    /*! Intern the file name and function name strings (map them to indices) */
+    std::tr1::unordered_map<const char*, int> staticStringMap;
+    /*! Each element contains the actual string */
+    std::vector<const char*> staticStringVector;
+    std::map<uintptr_t, AllocData> allocMap;
+    /*! Protect the memory debugger accesses */
+    MutexSys mutex;
+  };
+
+  void* MemDebugger::insertAlloc(void *ptr, const char *file, const char *function, int line)
+  {
+    if (ptr == NULL) return ptr;
+    Lock<MutexSys> lock(mutex);
+    const uintptr_t iptr = (uintptr_t) ptr;
+    if (UNLIKELY(allocMap.find(iptr) != allocMap.end())) {
+      this->dumpData(allocMap.find(iptr)->second);
+      FATAL("Pointer already in map");
+    }
+    const auto fileIt = staticStringMap.find(file);
+    const auto functionIt = staticStringMap.find(function);
+    int fileName, functionName;
+    if (fileIt == staticStringMap.end()) {
+      staticStringVector.push_back(file);
+      staticStringMap[file] = fileName = int(staticStringVector.size()) - 1;
+    } else
+      fileName = staticStringMap[file];
+    if (functionIt == staticStringMap.end()) {
+      staticStringVector.push_back(function);
+      staticStringMap[function] = functionName = int(staticStringVector.size()) - 1;
+    } else
+      functionName = staticStringMap[function];
+    allocMap[iptr] = AllocData(fileName, functionName, line, allocNum);
+    unfreedNum++;
+    allocNum++;
+    return ptr;
+  }
+
+  void MemDebugger::removeAlloc(void *ptr)
+  {
+    if (ptr == NULL) return;
+    Lock<MutexSys> lock(mutex);
+    const uintptr_t iptr = (uintptr_t) ptr;
+    FATAL_IF(allocMap.find(iptr) == allocMap.end(), "Pointer not referenced");
+    allocMap.erase(iptr);
+    unfreedNum--;
+  }
+
+  void MemDebugger::dumpData(const AllocData &data) {
+    std::cerr << "ALLOC " << data.alloc << ": " <<
+                 "file " << staticStringVector[data.fileName] << ", " <<
+                 "function " << staticStringVector[data.functionName] << ", " <<
+                 "line " << data.line << std::endl;
+  }
+
+  void MemDebugger::dumpAlloc(void) {
+    std::cerr << "MemDebugger: Unfreed number: " << unfreedNum << std::endl;
+    for (const auto &alloc : allocMap) this->dumpData(alloc.second);
+    std::cerr << "MemDebugger: " << staticStringVector.size()
+              << " allocated static strings" << std::endl;
+  }
+
+  /*! The user can deactivate the memory initialization */
+  static bool memoryInitializationEnabled = true;
+
+  /*! Declare C like interface functions here */
+  static MemDebugger *memDebugger = NULL;
+
+  /*! Monitor maximum memory requirement in the compiler */
+  static MutexSys *sizeMutex = NULL;
+  static bool isMutexInitializing = false; // set to true while the mutex is being created
+  static size_t memDebuggerCurrSize(0u);
+  static size_t memDebuggerMaxSize(0u);
+  static void SizeMutexDeallocate(void) { if (sizeMutex) delete sizeMutex; }
+  static void SizeMutexAllocate(void) {
+    if (sizeMutex == NULL && isMutexInitializing == false) {
+      isMutexInitializing = true;
+      sizeMutex = new MutexSys;
+      atexit(SizeMutexDeallocate);
+    }
+  }
+
+  /*! Stop the memory debugger */
+  static void MemDebuggerEnd(void) {
+    MemDebugger *_debug = memDebugger;
+    memDebugger = NULL;
+    std::cout << "Maximum memory consumption: "
+              << std::setprecision(2) << std::fixed
+              << float(memDebuggerMaxSize) / 1024. << "KB" << std::endl;
+    delete _debug;
+    GBE_ASSERT(memDebuggerCurrSize == 0);
+  }
+
+  /*! Bring up the debugger at pre-main */
+  static struct ForceMemDebugger {
+    ForceMemDebugger(void) {
+      doesnotmatter = GBE_NEW(int);
+      GBE_DELETE(doesnotmatter);
+    }
+    int *doesnotmatter;
+  } forceMemDebugger;
+
+  /*! Start the memory debugger */
+  static void MemDebuggerStart(void) {
+    if (memDebugger == NULL) {
+      atexit(MemDebuggerEnd);
+      memDebugger = new MemDebugger;
+    }
+  }
+
+  void* MemDebuggerInsertAlloc(void *ptr, const char *file, const char *function, int line) {
+    if (memDebugger == NULL) MemDebuggerStart();
+    return memDebugger->insertAlloc(ptr, file, function, line);
+  }
+  void MemDebuggerRemoveAlloc(void *ptr) {
+    if (memDebugger == NULL) MemDebuggerStart();
+    memDebugger->removeAlloc(ptr);
+  }
+  void MemDebuggerDumpAlloc(void) {
+    if (memDebugger == NULL) MemDebuggerStart();
+    memDebugger->dumpAlloc();
+  }
+  void MemDebuggerEnableMemoryInitialization(bool enabled) {
+    memoryInitializationEnabled = enabled;
+  }
+  void MemDebuggerInitializeMem(void *mem, size_t sz) {
+    if (memoryInitializationEnabled) std::memset(mem, 0xcd, sz);
+  }
+} /* namespace gbe */
+
+#endif /* GBE_DEBUG_MEMORY */
+
+namespace gbe
+{
+#if GBE_DEBUG_MEMORY
+  void* memAlloc(size_t size) {
+    void *ptr = std::malloc(size + sizeof(size_t));
+    *(size_t *) ptr = size;
+    MemDebuggerInitializeMem((char*) ptr + sizeof(size_t), size);
+    SizeMutexAllocate();
+    if (sizeMutex) sizeMutex->lock();
+    memDebuggerCurrSize += size;
+    memDebuggerMaxSize = std::max(memDebuggerCurrSize, memDebuggerMaxSize);
+    if (sizeMutex) sizeMutex->unlock();
+    return (char *) ptr + sizeof(size_t);
+  }
+  void memFree(void *ptr) {
+    if (ptr != NULL) {
+      char *toFree = (char*) ptr - sizeof(size_t);
+      const size_t size = *(size_t *) toFree;
+      MemDebuggerInitializeMem(ptr, size);
+      SizeMutexAllocate();
+      if (sizeMutex) sizeMutex->lock();
+      memDebuggerCurrSize -= size;
+      if (sizeMutex) sizeMutex->unlock();
+      std::free(toFree);
+    }
+  }
+#else
+  void* memAlloc(size_t size) { return  std::malloc(size); }
+  void memFree(void *ptr) { if (ptr != NULL) std::free(ptr); }
+#endif /* GBE_DEBUG_MEMORY */
+
+} /* namespace gbe */
+
+#if GBE_DEBUG_MEMORY
+
+namespace gbe
+{
+  void* alignedMalloc(size_t size, size_t align) {
+    void* mem = malloc(size+align+sizeof(uintptr_t) + sizeof(void*));
+    FATAL_IF (!mem && size, "memory allocation failed");
+    char* aligned = (char*) mem + sizeof(uintptr_t) + sizeof(void*);
+    aligned += align - ((uintptr_t)aligned & (align - 1));
+    ((void**)aligned)[-1] = mem;
+    ((uintptr_t*)aligned)[-2] = uintptr_t(size);
+    MemDebuggerInitializeMem(aligned, size);
+    SizeMutexAllocate();
+    if (sizeMutex) sizeMutex->lock();
+    memDebuggerCurrSize += size;
+    memDebuggerMaxSize = std::max(memDebuggerCurrSize, memDebuggerMaxSize);
+    if (sizeMutex) sizeMutex->unlock();
+    return aligned;
+  }
+
+  void alignedFree(void* ptr) {
+    if (ptr) {
+      const size_t size = ((uintptr_t*)ptr)[-2];
+      MemDebuggerInitializeMem(ptr, size);
+      free(((void**)ptr)[-1]);
+      SizeMutexAllocate();
+      if (sizeMutex) sizeMutex->lock();
+      memDebuggerCurrSize -= size;
+      if (sizeMutex) sizeMutex->unlock();
+    }
+  }
+} /* namespace gbe */
+
+#else /* GBE_DEBUG_MEMORY */
+
+////////////////////////////////////////////////////////////////////////////////
+/// Linux Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__LINUX__) || defined(__GLIBC__)
+
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <malloc.h>
+#include <iostream>
+
+namespace gbe
+{
+  void* alignedMalloc(size_t size, size_t align) {
+    void* ptr = memalign(align,size);
+    FATAL_IF (!ptr && size, "memory allocation failed");
+    MemDebuggerInitializeMem(ptr, size);
+    return ptr;
+  }
+
+  void alignedFree(void *ptr) { if (ptr) std::free(ptr); }
+} /* namespace gbe */
+
+#else
+#error "Unsupported platform"
+#endif /* __LINUX__ */
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Linear allocator
+////////////////////////////////////////////////////////////////////////////////
+
+namespace gbe
+{
+  LinearAllocator::Segment::Segment(size_t size) :
+    size(size), offset(0u), data(alignedMalloc(size, CACHE_LINE)), next(NULL){}
+
+  LinearAllocator::Segment::~Segment(void) {
+    alignedFree(data);
+    if (this->next) GBE_DELETE(this->next);
+  }
+
+  LinearAllocator::LinearAllocator(size_t minSize, size_t maxSize) :
+    maxSize(std::max(maxSize, size_t(CACHE_LINE)))
+  {
+    this->curr = GBE_NEW(LinearAllocator::Segment, std::max(minSize, size_t(1)));
+  }
+
+  LinearAllocator::~LinearAllocator(void) {
+    if (this->curr) GBE_DELETE(this->curr);
+  }
+
+  void *LinearAllocator::allocate(size_t size)
+  {
+#if GBE_DEBUG_SPECIAL_ALLOCATOR
+    return GBE_ALIGNED_MALLOC(size, sizeof(void*)); // bypass the segments in debug mode
+#else
+    // Try to use the current segment. This is the most likely condition here
+    this->curr->offset = ALIGN(this->curr->offset, sizeof(void*));
+    if (this->curr->offset + size <= this->curr->size) {
+      char *ptr = (char*) curr->data + this->curr->offset;
+      this->curr->offset += size;
+      return (void*) ptr;
+    }
+
+    // Well not really a use case in this code base
+    if (UNLIKELY(size > maxSize)) {
+      // This is really bad since we do two allocations
+      Segment *unfortunate = GBE_NEW(Segment, size);
+      GBE_ASSERT(this->curr);
+      Segment *next = this->curr->next;
+      this->curr->next = unfortunate;
+      unfortunate->next = next;
+      return unfortunate->data;
+    }
+
+    // OK. We need a new segment
+    const size_t segmentSize = std::max(size, 2*this->curr->size);
+    Segment *next = GBE_NEW(Segment, segmentSize);
+    next->next = curr;
+    this->curr = next;
+    char *ptr = (char*) curr->data;
+    this->curr->offset += size;
+    return ptr;
+#endif
+  }
+
+} /* namespace gbe */
+
diff --git a/backend/src/sys/alloc.hpp b/backend/src/sys/alloc.hpp
new file mode 100644
index 0000000..8fcb3a7
--- /dev/null
+++ b/backend/src/sys/alloc.hpp
@@ -0,0 +1,342 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file alloc.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_ALLOC_HPP__
+#define __GBE_ALLOC_HPP__
+
+#include "sys/platform.hpp"
+#include "sys/assert.hpp"
+#include <algorithm>
+#include <limits>
+
+namespace gbe
+{
+  /*! Regular allocation */
+  void* memAlloc(size_t size);
+  void  memFree(void *ptr);
+
+  /*! Aligned allocation */
+  void* alignedMalloc(size_t size, size_t align = 64);
+  void  alignedFree(void* ptr);
+
+  /*! Monitor memory allocations */
+#if GBE_DEBUG_MEMORY
+  void* MemDebuggerInsertAlloc(void*, const char*, const char*, int);
+  void  MemDebuggerRemoveAlloc(void *ptr);
+  void  MemDebuggerDumpAlloc(void);
+  void  MemDebuggerInitializeMem(void *mem, size_t sz);
+  void  MemDebuggerEnableMemoryInitialization(bool enabled);
+#else
+  INLINE void* MemDebuggerInsertAlloc(void *ptr, const char*, const char*, int) {return ptr;}
+  INLINE void  MemDebuggerRemoveAlloc(void *ptr) {}
+  INLINE void  MemDebuggerDumpAlloc(void) {}
+  INLINE void  MemDebuggerInitializeMem(void *mem, size_t sz) {}
+  INLINE void  MemDebuggerEnableMemoryInitialization(bool enabled) {}
+#endif /* GBE_DEBUG_MEMORY */
+
+  /*! Properly handle the allocated type */
+  template <typename T>
+  T* _MemDebuggerInsertAlloc(T *ptr, const char *file, const char *function, int line) {
+    MemDebuggerInsertAlloc(ptr, file, function, line);
+    return ptr;
+  }
+} /* namespace gbe */
+
+/*! Declare a class with custom allocators */
+#define GBE_CLASS(TYPE) \
+  GBE_STRUCT(TYPE) \
+private:
+
+/*! Declare a structure with custom allocators */
+#define GBE_STRUCT(TYPE) \
+public: \
+  void* operator new(size_t size) { \
+    return gbe::alignedMalloc(size, GBE_DEFAULT_ALIGNMENT); \
+  } \
+  void* operator new[](size_t size) { \
+   return gbe::alignedMalloc(size, GBE_DEFAULT_ALIGNMENT); \
+  } \
+  void* operator new(size_t size, void *p) { return p; } \
+  void* operator new[](size_t size, void *p) { return p; } \
+  void  operator delete(void* ptr) { return gbe::alignedFree(ptr); } \
+  void  operator delete[](void* ptr) { return gbe::alignedFree(ptr); }
+
+/*! Macros to handle allocation position */
+#define GBE_NEW(T,...) \
+  gbe::_MemDebuggerInsertAlloc(new T(__VA_ARGS__), __FILE__, __FUNCTION__, __LINE__)
+
+#define GBE_NEW_NO_ARG(T) \
+  gbe::_MemDebuggerInsertAlloc(new T, __FILE__, __FUNCTION__, __LINE__)
+
+#define GBE_NEW_ARRAY(T,N,...) \
+  gbe::_MemDebuggerInsertAlloc(new T[N](__VA_ARGS__), __FILE__, __FUNCTION__, __LINE__)
+
+#define GBE_NEW_ARRAY_NO_ARG(T,N)\
+  gbe::_MemDebuggerInsertAlloc(new T[N], __FILE__, __FUNCTION__, __LINE__)
+
+#define GBE_NEW_P(T,X,...) \
+  gbe::_MemDebuggerInsertAlloc(new (X) T(__VA_ARGS__), __FILE__, __FUNCTION__, __LINE__)
+
+#define GBE_DELETE(X) \
+  do { gbe::MemDebuggerRemoveAlloc(X); delete X; } while (0)
+
+#define GBE_DELETE_ARRAY(X) \
+  do { gbe::MemDebuggerRemoveAlloc(X); delete[] X; } while (0)
+
+#define GBE_MALLOC(SZ) \
+  gbe::MemDebuggerInsertAlloc(gbe::memAlloc(SZ),__FILE__, __FUNCTION__, __LINE__)
+
+#define GBE_FREE(X) \
+  do { gbe::MemDebuggerRemoveAlloc(X); gbe::memFree(X); } while (0)
+
+#define GBE_ALIGNED_FREE(X) \
+  do { gbe::MemDebuggerRemoveAlloc(X); gbe::alignedFree(X); } while (0)
+
+#define GBE_ALIGNED_MALLOC(SZ,ALIGN) \
+  gbe::MemDebuggerInsertAlloc(gbe::alignedMalloc(SZ,ALIGN),__FILE__, __FUNCTION__, __LINE__)
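+
+// A minimal usage sketch (hypothetical Foo type): a class opts into the aligned,
+// tracked allocators with GBE_CLASS and is created/destroyed through GBE_NEW /
+// GBE_DELETE so debug builds record the allocation site:
+//
+//   class Foo {
+//   public:
+//     Foo(int x) : x(x) {}
+//     int x;
+//     GBE_CLASS(Foo);
+//   };
+//
+//   Foo *foo = GBE_NEW(Foo, 42);   // passes __FILE__/__FUNCTION__/__LINE__ along
+//   GBE_DELETE(foo);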
+
+namespace gbe
+{
+  /*! STL-compliant allocator that intercepts all memory allocations */
+  template<typename T>
+  class Allocator {
+  public:
+    typedef T value_type;
+    typedef value_type* pointer;
+    typedef const value_type* const_pointer;
+    typedef value_type& reference;
+    typedef const value_type& const_reference;
+    typedef std::size_t size_type;
+    typedef std::ptrdiff_t difference_type;
+    typedef typename std::allocator<void>::const_pointer void_allocator_ptr;
+    template<typename U>
+    struct rebind { typedef Allocator<U> other; };
+
+    INLINE Allocator(void) {}
+    INLINE ~Allocator(void) {}
+    INLINE Allocator(Allocator const&) {}
+    template<typename U>
+    INLINE Allocator(Allocator<U> const&) {}
+    INLINE pointer address(reference r) { return &r; }
+    INLINE const_pointer address(const_reference r) { return &r; }
+    INLINE pointer allocate(size_type n, void_allocator_ptr = 0) {
+      if (ALIGNOF(T) > sizeof(uintptr_t))
+        return (pointer) GBE_ALIGNED_MALLOC(n*sizeof(T), ALIGNOF(T));
+      else
+        return (pointer) GBE_MALLOC(n * sizeof(T));
+    }
+    INLINE void deallocate(pointer p, size_type) {
+      if (ALIGNOF(T) > sizeof(uintptr_t))
+        GBE_ALIGNED_FREE(p);
+      else
+        GBE_FREE(p);
+    }
+    INLINE size_type max_size(void) const {
+      return std::numeric_limits<size_type>::max() / sizeof(T);
+    }
+    INLINE void construct(pointer p, const T& t = T()) { ::new(p) T(t); }
+    INLINE void destroy(pointer p) { p->~T(); }
+    INLINE bool operator==(Allocator const&) { return true; }
+    INLINE bool operator!=(Allocator const& a) { return !operator==(a); }
+  };
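+
+  // A minimal usage sketch (assuming <vector> is included): plugging the
+  // allocator into a standard container routes its memory through
+  // GBE_MALLOC / GBE_ALIGNED_MALLOC so the debugger sees those allocations too:
+  //
+  //   std::vector<int, gbe::Allocator<int> > ids;
+  //   ids.push_back(42);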
+
+// Define GBE_DEBUG_SPECIAL_ALLOCATOR to 1 to deactivate the fast pool allocators below
+#ifndef GBE_DEBUG_SPECIAL_ALLOCATOR
+#define GBE_DEBUG_SPECIAL_ALLOCATOR 0
+#endif
+
+  /*! A growing pool never returns memory to the system; it chains freed
+   *  elements together so that deallocation can be done quickly
+   */
+  template <typename T>
+  class GrowingPool
+  {
+  public:
+    GrowingPool(uint32_t elemNum = 1) :
+      curr(GBE_NEW(GrowingPoolElem, elemNum <= 1 ? 1 : elemNum)),
+      free(NULL), full(NULL), freeList(NULL) {}
+    ~GrowingPool(void) {
+      GBE_SAFE_DELETE(curr);
+      GBE_SAFE_DELETE(free);
+      GBE_SAFE_DELETE(full);
+    }
+    void *allocate(void) {
+#if GBE_DEBUG_SPECIAL_ALLOCATOR
+      return GBE_ALIGNED_MALLOC(sizeof(T), ALIGNOF(T));
+#else
+      // Pick up an element from the free list
+      if (this->freeList != NULL) {
+        void *data = (void*) freeList;
+        this->freeList = *(void**) freeList;
+        return data;
+      }
+
+      // Pick up an element from the current block (if not full)
+      if (this->curr->allocated < this->curr->maxElemNum) {
+        void *data = (T*) curr->data + curr->allocated++;
+        return data;
+      }
+
+      // Block is full
+      this->curr->next = this->full;
+      this->full = this->curr;
+
+      // Try to pick up a free block
+      if (this->free) this->getFreeBlock();
+
+      // No free block, so we must allocate a new one
+      else
+        this->curr = GBE_NEW(GrowingPoolElem, 2 * this->curr->maxElemNum);
+
+      void *data = (T*) curr->data + curr->allocated++;
+      return data;
+#endif /* GBE_DEBUG_SPECIAL_ALLOCATOR */
+    }
+    void deallocate(void *t) {
+      if (t == NULL) return;
+#if GBE_DEBUG_SPECIAL_ALLOCATOR
+      GBE_ALIGNED_FREE(t);
+#else
+      *(void**) t = this->freeList;
+      this->freeList = t;
+#endif /* GBE_DEBUG_SPECIAL_ALLOCATOR */
+    }
+    void rewind(void) {
+#if GBE_DEBUG_SPECIAL_ALLOCATOR == 0
+      // All free elements return to their blocks
+      this->freeList = NULL;
+
+      // Put back current block in full list
+      if (this->curr) {
+        this->curr->next = this->full;
+        this->full = this->curr;
+        this->curr = NULL;
+      }
+
+      // Reverse the chain list and mark all blocks as empty
+      while (this->full) {
+        GrowingPoolElem *next = this->full->next;
+        this->full->allocated = 0;
+        this->full->next = this->free;
+        this->free = this->full;
+        this->full = next;
+      }
+
+      // Provide a valid current block
+      this->getFreeBlock();
+#endif /* GBE_DEBUG_SPECIAL_ALLOCATOR */
+    }
+  private:
+    /*! Pick-up a free block */
+    INLINE void getFreeBlock(void) {
+      GBE_ASSERT(this->free);
+      this->curr = this->free;
+      this->free = this->free->next;
+      this->curr->next = NULL;
+    }
+    /*! Chunk of elements to allocate */
+    class GrowingPoolElem
+    {
+      friend class GrowingPool;
+      GrowingPoolElem(size_t elemNum) {
+        const size_t sz = std::max(sizeof(T), sizeof(void*));
+        this->data = (T*) GBE_ALIGNED_MALLOC(elemNum * sz, ALIGNOF(T));
+        this->next = NULL;
+        this->maxElemNum = elemNum;
+        this->allocated = 0;
+      }
+      ~GrowingPoolElem(void) {
+        GBE_ALIGNED_FREE(this->data);
+        if (this->next) GBE_DELETE(this->next);
+      }
+      T *data;
+      GrowingPoolElem *next;
+      size_t allocated, maxElemNum;
+    };
+    GrowingPoolElem *curr; //!< To get new element from
+    GrowingPoolElem *free; //!< Blocks that can be reused (after rewind)
+    GrowingPoolElem *full; //!< Blocks fully used
+    void *freeList;        //!< Elements that have been deallocated
+    GBE_CLASS(GrowingPool);
+  };
+
+/*! Helper macros to build and destroy objects with a growing pool */
+#define DECL_POOL(TYPE, POOL) \
+  GrowingPool<TYPE> POOL; \
+  template <typename... Args> \
+  TYPE *new##TYPE(Args&&... args) { \
+    return new (POOL.allocate()) TYPE(args...); \
+  } \
+  void delete##TYPE(TYPE *ptr) { \
+    ptr->~TYPE(); \
+    POOL.deallocate(ptr); \
+  }
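+
+  // A minimal usage sketch (hypothetical Node type): DECL_POOL generates a
+  // newNode()/deleteNode() pair backed by a GrowingPool<Node>, so short-lived
+  // objects avoid one heap round-trip per object:
+  //
+  //   struct Node { int value; Node(int v) : value(v) {} };
+  //   struct Graph {
+  //     DECL_POOL(Node, nodePool);   // declares nodePool, newNode(), deleteNode()
+  //   };
+  //
+  //   Graph g;
+  //   Node *n = g.newNode(7);        // placement-new out of the pool
+  //   g.deleteNode(n);               // runs ~Node() and recycles the slot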
+
+  /*! A linear allocator just grows and does not reuse freed memory. It can
+   *  however allocate objects of any size
+   */
+  class LinearAllocator
+  {
+  public:
+    /*! Initialize the linear allocator (one segment is allocated up front) */
+    LinearAllocator(size_t minSize = CACHE_LINE, size_t maxSize = 64*KB);
+    /*! Free up everything */
+    ~LinearAllocator(void);
+    /*! Allocate size bytes */
+    void *allocate(size_t size);
+    /*! A no-op unless the debug allocator is enabled */
+    INLINE void deallocate(void *ptr) {
+#if GBE_DEBUG_SPECIAL_ALLOCATOR
+      if (ptr) GBE_ALIGNED_FREE(ptr);
+#endif /* GBE_DEBUG_SPECIAL_ALLOCATOR */
+    }
+  private:
+    /*! Holds an allocated segment of memory */
+    struct Segment {
+      /*! Allocate a new segment */
+      Segment(size_t size);
+      /*! Destroy the segment and the next ones */
+      ~Segment(void);
+      /*! Size of the segment */
+      size_t size;
+      /*! Offset to the next free bytes (if any left) */
+      size_t offset;
+      /*! Pointer to valid data */
+      void *data;
+      /*! Pointer to the next segment */
+      Segment *next;
+      /*! Use internal allocator */
+      GBE_STRUCT(Segment);
+    };
+    /*! Points to the current segment we can allocate from */
+    Segment *curr;
+    /*! Maximum segment size */
+    size_t maxSize;
+    /*! Use internal allocator */
+    GBE_CLASS(LinearAllocator);
+  };
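+
+  // A minimal usage sketch: carve variable-sized scratch blocks out of one
+  // LinearAllocator; everything is released together when it goes out of scope:
+  //
+  //   LinearAllocator scratch;                   // one CACHE_LINE-sized segment
+  //   char *a = (char*) scratch.allocate(24);
+  //   char *b = (char*) scratch.allocate(200);
+  //   // ... use a and b; no per-block free is needed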
+
+} /* namespace gbe */
+
+#endif /* __GBE_ALLOC_HPP__ */
+
diff --git a/backend/src/sys/assert.cpp b/backend/src/sys/assert.cpp
new file mode 100644
index 0000000..52178a1
--- /dev/null
+++ b/backend/src/sys/assert.cpp
@@ -0,0 +1,81 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file assert.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#if GBE_COMPILE_UTESTS
+
+#include "sys/assert.hpp"
+#include "sys/exception.hpp"
+#include "sys/cvar.hpp"
+#include <cassert>
+#include <cstdlib>
+
+namespace gbe
+{
+  BVAR(OCL_BREAK_POINT_IN_ASSERTION, false);
+  BVAR(OCL_ABORT_IN_ASSERTION, false);
+
+  void onFailedAssertion(const char *msg, const char *file, const char *fn, int line)
+  {
+    char lineString[256];
+    sprintf(lineString, "%i", line);
+    assert(msg != NULL && file != NULL && fn != NULL);
+    const std::string str = "Compiler error: "
+                          + std::string(msg) + "\n  at file "
+                          + std::string(file)
+                          + ", function " + std::string(fn)
+                          + ", line " + std::string(lineString);
+    if (OCL_BREAK_POINT_IN_ASSERTION)
+      DEBUGBREAK();
+    if (OCL_ABORT_IN_ASSERTION) {
+      assert(false);
+      exit(-1);
+    }
+    throw Exception(str);
+  }
+} /* namespace gbe */
+
+#else
+
+#include "sys/assert.hpp"
+#include "sys/exception.hpp"
+#include "sys/platform.hpp"
+#include <cstdio>
+#include <cstdlib>
+#include <unistd.h>
+
+namespace gbe
+{
+  void onFailedAssertion(const char *msg, const char *file, const char *fn, int32_t line)
+  {
+    assert(msg != NULL && file != NULL && fn != NULL);
+    fprintf(stderr, "ASSERTION FAILED: %s\n"
+                    "  at file %s, function %s, line %i\n",
+                    msg,  file, fn, line);
+    fflush(stdout);
+    DEBUGBREAK();
+    _exit(-1);
+  }
+} /* namespace gbe */
+
+#endif /* GBE_COMPILE_UTESTS */
+
diff --git a/backend/src/sys/assert.hpp b/backend/src/sys/assert.hpp
new file mode 100644
index 0000000..553e391
--- /dev/null
+++ b/backend/src/sys/assert.hpp
@@ -0,0 +1,35 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file assert.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_ASSERT_HPP__
+#define __GBE_ASSERT_HPP__
+
+namespace gbe
+{
+  /*! Called when an assertion fails; an optional message is supported */
+  void onFailedAssertion(const char *msg, const char *file, const char *fn, int line);
+} /* namespace gbe */
+
+#endif /* __GBE_ASSERT_HPP__ */
+
diff --git a/backend/src/sys/atomic.hpp b/backend/src/sys/atomic.hpp
new file mode 100644
index 0000000..3684ae9
--- /dev/null
+++ b/backend/src/sys/atomic.hpp
@@ -0,0 +1,56 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef __GBE_ATOMIC_HPP__
+#define __GBE_ATOMIC_HPP__
+
+#include "sys/intrinsics.hpp"
+
+namespace gbe
+{
+  template <typename T>
+  struct AtomicInternal {
+  protected:
+    AtomicInternal(const AtomicInternal&); // don't implement
+    AtomicInternal& operator= (const AtomicInternal&); // don't implement
+
+  public:
+    INLINE AtomicInternal(void) {}
+    INLINE AtomicInternal(T data) : data(data) {}
+    INLINE AtomicInternal& operator =(const T input) { data = input; return *this; }
+    INLINE operator T() const { return data; }
+    INLINE void storeRelease(T x) { __store_release(&data, x); }
+  public:
+    INLINE friend T operator+= (AtomicInternal& value, T input) { return atomic_add(&value.data, input) + input; }
+    INLINE friend T operator++ (AtomicInternal& value) { return atomic_add(&value.data,  1) + 1; }
+    INLINE friend T operator-- (AtomicInternal& value) { return atomic_add(&value.data, -1) - 1; }
+    INLINE friend T operator++ (AtomicInternal& value, int) { return atomic_add(&value.data,  1); }
+    INLINE friend T operator-- (AtomicInternal& value, int) { return atomic_add(&value.data, -1); }
+    INLINE friend T cmpxchg    (AtomicInternal& value, const T v, const T c) { return atomic_cmpxchg(&value.data,v,c); }
+
+  private:
+    volatile T data;
+    GBE_STRUCT(AtomicInternal);
+  };
+
+  typedef AtomicInternal<atomic32_t> Atomic32;
+  typedef AtomicInternal<atomic_t> Atomic;
+}
+
+#endif /* __GBE_ATOMIC_HPP__ */
+
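For illustration only (the counter name and helper functions below are hypothetical, not part of the upstream sources), the Atomic32/Atomic wrappers read like plain integers while routing increments and compare-and-swap through the intrinsics they wrap:

    #include "sys/atomic.hpp"

    static gbe::Atomic32 jobCounter(0);      // shared between worker threads

    void onJobFinished(void) {
      ++jobCounter;                          // atomic_add(&data, 1) under the hood
    }

    bool tryClaimFirstSlot(void) {
      // cmpxchg returns the previous value; 0 means this caller won the race
      return cmpxchg(jobCounter, 1, 0) == 0;
    }
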
diff --git a/backend/src/sys/cvar.cpp b/backend/src/sys/cvar.cpp
new file mode 100644
index 0000000..1ee2c98
--- /dev/null
+++ b/backend/src/sys/cvar.cpp
@@ -0,0 +1,65 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file cvar.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "sys/cvar.hpp"
+#include <cstdio>
+
+namespace gbe
+{
+
+  CVarInit::CVarInit(const char *name, int32_t *addr, int32_t imin, int32_t i, int32_t imax) :
+    varType(CVarInit::INTEGER)
+  {
+    this->i.min = imin;
+    this->i.max = imax;
+    const char *env = getenv(name);
+    if (env != NULL) {
+      sscanf(env, "%i", &i);
+      i = std::min(imax, std::max(imin, i));
+    }
+    *addr = i;
+  }
+
+  CVarInit::CVarInit(const char *name, float *addr, float fmin, float f, float fmax) :
+    varType(CVarInit::FLOAT)
+  {
+    this->f.min = fmin;
+    this->f.max = fmax;
+    const char *env = getenv(name);
+    if (env != NULL) {
+      sscanf(env, "%f", &f);
+      f = std::min(fmax, std::max(fmin, f));
+    }
+    *addr = f;
+  }
+
+  CVarInit::CVarInit(const char *name, std::string *str, const std::string &v) :
+    varType(CVarInit::STRING)
+  {
+    const char *env = getenv(name);
+    *str = env != NULL ? env : v;
+  }
+
+} /* namespace gbe */
+
diff --git a/backend/src/sys/cvar.hpp b/backend/src/sys/cvar.hpp
new file mode 100644
index 0000000..7350a3e
--- /dev/null
+++ b/backend/src/sys/cvar.hpp
@@ -0,0 +1,80 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file cvar.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ * Quake-like console variable system. Set the matching environment variable
+ * from the shell to change a variable's value
+ */
+
+#ifndef __GBE_CVAR_HPP__
+#define __GBE_CVAR_HPP__
+
+#include "sys/platform.hpp"
+
+namespace gbe
+{
+  /*! A CVar is either a float, an integer or a string value. CVarInit is only
+   *  here to set the global variable in pre-main
+   */
+  class CVarInit
+  {
+  public:
+    enum {
+      STRING = 0,
+      INTEGER = 1,
+      FLOAT = 2
+    };
+    /*! Build a CVar from an integer environment variable */
+    explicit CVarInit(const char *name, int32_t *addr, int32_t imin, int32_t i, int32_t imax);
+    /*! Build a CVar from a float environment variable */
+    explicit CVarInit(const char *name, float *addr, float fmin, float f, float fmax);
+    /*! Build a CVar from a string environment variable */
+    explicit CVarInit(const char *name, std::string *str, const std::string &v);
+    int varType;      //!< STRING, INTEGER or FLOAT
+    std::string *str; //!< string variable
+    union {
+      struct { int32_t min, *curr, max; } i; //!< integer variables with bounds
+      struct { float   min, *curr, max; } f; //!< float variables with bounds
+    };
+  };
+} /* namespace gbe */
+
+/*! Declare an integer console variable */
+#define IVAR(NAME, MIN, CURR, MAX) \
+  int32_t NAME; \
+  static gbe::CVarInit __CVAR##NAME##__LINE__##__(#NAME, &NAME, int32_t(MIN), int32_t(CURR), int32_t(MAX));
+
+/*! Declare a float console variable */
+#define FVAR(NAME, MIN, CURR, MAX) \
+  float NAME; \
+  static gbe::CVarInit __CVAR##NAME##__LINE__##__(#NAME, &NAME, float(MIN), float(CURR), float(MAX));
+
+/*! Declare a string console variable */
+#define SVAR(NAME, STR) \
+  std::string NAME; \
+  static gbe::CVarInit __CVAR##NAME##__LINE__##__(#NAME, &NAME, STR);
+
+/*! Declare a Boolean variable (just an integer in {0,1}) */
+#define BVAR(NAME, CURR) IVAR(NAME, 0, CURR ? 1 : 0, 1)
+
+#endif /* __GBE_CVAR_HPP__ */
+
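As a usage sketch (the variable names below are made up; OCL_BREAK_POINT_IN_ASSERTION in assert.cpp above is a real in-tree example), each declaration expands to a global plus a CVarInit that reads the environment before main() runs:

    #include "sys/cvar.hpp"
    #include <cstdio>

    BVAR(OCL_DUMP_STATS, false);       // 0/1 flag: OCL_DUMP_STATS=1 ./utest
    IVAR(OCL_WORKER_NUM, 0, 4, 16);    // integer clamped to [0, 16], default 4

    void report(void) {
      if (OCL_DUMP_STATS)
        printf("using %d workers\n", OCL_WORKER_NUM);
    }
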
diff --git a/backend/src/sys/exception.hpp b/backend/src/sys/exception.hpp
new file mode 100644
index 0000000..d74ca0d
--- /dev/null
+++ b/backend/src/sys/exception.hpp
@@ -0,0 +1,56 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file exception.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_EXCEPTION_HPP__
+#define __GBE_EXCEPTION_HPP__
+
+#if GBE_COMPILE_UTESTS
+
+#include <exception>
+#include <string>
+
+namespace gbe
+{
+  /*! Exceptions are only used when running the unit tests */
+  class Exception : public std::exception
+  {
+  public:
+    Exception(const std::string &msg) throw() : msg(msg) {}
+    Exception(const Exception &other) throw() : msg(other.msg) {}
+    ~Exception(void) throw() {}
+    Exception &operator= (const Exception &other) throw() {
+      this->msg = other.msg;
+      return *this;
+    }
+    const char *what(void) const throw() { return msg.c_str(); }
+  private:
+    std::string msg; //!< String message
+  };
+
+} /* namespace gbe */
+
+#endif /* GBE_COMPILE_UTESTS */
+#endif /* __GBE_EXCEPTION_HPP__ */
+
diff --git a/backend/src/sys/fixed_array.hpp b/backend/src/sys/fixed_array.hpp
new file mode 100644
index 0000000..d84c350
--- /dev/null
+++ b/backend/src/sys/fixed_array.hpp
@@ -0,0 +1,84 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file fixed_array.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_FIXED_ARRAY_HPP__
+#define __GBE_FIXED_ARRAY_HPP__
+
+#include "platform.hpp"
+#include <cstring>
+
+namespace gbe
+{
+  /*! Regular C array but with bounds checks */
+  template<typename T, size_t N>
+  class fixed_array
+  {
+  public:
+    /*! Do not initialize the data */
+    fixed_array(void) {}
+    /*! Copy the input array */
+    fixed_array(const T array[N]) { std::memcpy(elem, array, N * sizeof(T)); }
+    /*! First element (non const) */
+    T* begin(void) { return &elem[0]; }
+    /*! First non-valid element (non const) */
+    T* end(void) { return begin() + N; }
+    /*! First element (const) */
+    const T* begin(void) const { return &elem[0]; }
+    /*! First non-valid element (const) */
+    const T* end(void) const { return begin() + N; }
+    /*! Number of elements in the array */
+    size_t size(void) const { return N; }
+    /*! Get the pointer to the data (non-const) */
+    T* data(void) { return &elem[0]; }
+    /*! Get the pointer to the data (const) */
+    const T* data(void) const { return &elem[0]; }
+    /*! First element (const) */
+    const T& front(void) const { return *begin(); }
+    /*! Last element (const) */
+    const T& back(void) const { return *(end() - 1); }
+    /*! First element (non-const) */
+    T& front(void) { return *begin(); }
+    /*! Last element (non-const) */
+    T& back(void) { return *(end() - 1); }
+    /*! Get element at position index (with bound check) */
+    INLINE T& operator[] (size_t index) {
+      GBE_ASSERT(index < size());
+      return elem[index];
+    }
+    /*! Get element at position index (with bound check) */
+    INLINE const T& operator[] (size_t index) const {
+      GBE_ASSERT(index < size());
+      return elem[index];
+    }
+  private:
+    T elem[N];            //!< Store the elements
+    STATIC_ASSERT(N > 0); //!< zero element is not allowed
+    GBE_CLASS(fixed_array);
+  };
+
+} /* namespace gbe */
+
+#endif /* __GBE_FIXED_ARRAY_HPP__ */
+
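A minimal sketch of the bounds-checked array (hypothetical contents); with GBE_DEBUG enabled an out-of-range index trips GBE_ASSERT instead of silently corrupting memory:

    #include "sys/fixed_array.hpp"

    void normalize(void) {
      gbe::fixed_array<float, 4> weights;        // storage is left uninitialized
      for (size_t i = 0; i < weights.size(); ++i)
        weights[i] = 0.25f;                      // checked: i < 4
      const float edges = weights.front() + weights.back();
      (void) edges;
      // weights[4] would fail GBE_ASSERT(index < size()) in debug builds
    }
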
diff --git a/backend/src/sys/hash_map.hpp b/backend/src/sys/hash_map.hpp
new file mode 100644
index 0000000..fb1d1ef
--- /dev/null
+++ b/backend/src/sys/hash_map.hpp
@@ -0,0 +1,82 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file hash_map.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_HASH_MAP_HPP__
+#define __GBE_HASH_MAP_HPP__
+
+#include "sys/platform.hpp"
+
+#ifdef __MSVC__
+#include <unordered_map>
+#else
+#include <tr1/unordered_map>
+#endif /* __MSVC__ */
+
+namespace gbe
+{
+  /*! Add specific allocator to the hash map */
+  template <class Key,
+            class T,
+            class Hash = std::hash<Key>,
+            class Pred = std::equal_to<Key>>
+  class hash_map : public std::tr1::unordered_map<Key,T,Hash,Pred,Allocator<std::pair<const Key,T>>>,
+                   public NonCopyable
+  {
+  public:
+    // Typedefs
+    typedef std::pair<const Key, T> value_type;
+    typedef Allocator<value_type> allocator_type;
+    typedef std::tr1::unordered_map<Key,T,Hash,Pred,allocator_type> parent_type;
+    typedef typename allocator_type::size_type size_type;
+    typedef Key key_type;
+    typedef T mapped_type;
+    typedef Hash hasher;
+    typedef Pred key_equal;
+
+    /*! Default constructor */
+    INLINE explicit hash_map(size_type n = 3,
+                             const hasher& hf = hasher(),
+                             const key_equal& eql = key_equal(),
+                             const allocator_type& a = allocator_type()) :
+      parent_type(n, hf, eql, a) {}
+    /*! Iteration constructor */
+    template <class InputIterator>
+    INLINE hash_map(InputIterator first,
+                    InputIterator last,
+                    size_type n = 3,
+                    const hasher& hf = hasher(),
+                    const key_equal& eql = key_equal(),
+                    const allocator_type& a = allocator_type()) :
+      parent_type(first,last,n,hf,eql,a) {}
+#if 0
+    /*! Copy constructor */
+    INLINE hash_map(const hash_map &other) : parent_type(other) {}
+#endif
+    GBE_CLASS(hash_map);
+  };
+} /* namespace gbe */
+
+#endif /* __GBE_HASH_MAP_HPP__ */
+
diff --git a/backend/src/sys/intrinsics.hpp b/backend/src/sys/intrinsics.hpp
new file mode 100644
index 0000000..2e25dc7
--- /dev/null
+++ b/backend/src/sys/intrinsics.hpp
@@ -0,0 +1,209 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef __GBE_INTRINSICS_HPP__
+#define __GBE_INTRINSICS_HPP__
+
+#include "sys/platform.hpp"
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#if defined(__MSVC__)
+
+#include <intrin.h>
+
+#define GBE_COMPILER_WRITE_BARRIER       _WriteBarrier()
+#define GBE_COMPILER_READ_WRITE_BARRIER  _ReadWriteBarrier()
+
+#if _MSC_VER >= 1400
+#pragma intrinsic(_ReadBarrier)
+#define GBE_COMPILER_READ_BARRIER        _ReadBarrier()
+#else
+#define GBE_COMPILER_READ_BARRIER        _ReadWriteBarrier()
+#endif /* _MSC_VER >= 1400 */
+
+INLINE int __bsf(int v) {
+  unsigned long r = 0; _BitScanForward(&r,v); return r;
+}
+
+INLINE int __bsr(int v) {
+  unsigned long r = 0; _BitScanReverse(&r,v); return r;
+}
+
+INLINE int __btc(int v, int i) {
+  long r = v; _bittestandcomplement(&r,i); return r;
+}
+
+INLINE int __bts(int v, int i) {
+  long r = v; _bittestandset(&r,i); return r;
+}
+
+INLINE int __btr(int v, int i) {
+  long r = v; _bittestandreset(&r,i); return r;
+}
+
+INLINE void memoryFence(void) { _mm_mfence(); }
+
+#if defined(__X86_64__) && !defined(__INTEL_COMPILER)
+
+INLINE size_t __bsf(size_t v) {
+  unsigned long r = 0; _BitScanForward64(&r,v); return r;
+}
+
+INLINE size_t __bsr(size_t v) {
+  unsigned long r = 0; _BitScanReverse64(&r,v); return r;
+}
+
+INLINE size_t __btc(size_t v, size_t i) {
+  __int64_t r = v; _bittestandcomplement64(&r,i); return r;
+}
+
+INLINE size_t __bts(size_t v, size_t i) {
+  __int64_t r = v; _bittestandset64(&r,i); return r;
+}
+
+INLINE size_t __btr(size_t v, size_t i) {
+  __int64_t r = v; _bittestandreset64(&r,i); return r;
+}
+
+#endif /* defined(__X86_64__) && !defined(__INTEL_COMPILER) */
+
+typedef int32_t atomic32_t;
+
+INLINE int32_t atomic_add(volatile int32_t* m, const int32_t v) {
+  return _InterlockedExchangeAdd((volatile long*)m,v);
+}
+
+INLINE int32_t atomic_cmpxchg(volatile int32_t* m, const int32_t v, const int32_t c) {
+  return _InterlockedCompareExchange((volatile long*)m,v,c);
+}
+
+#if defined(__X86_64__)
+
+typedef int64_t atomic_t;
+
+INLINE int64_t atomic_add(volatile int64_t* m, const int64_t v) {
+  return _InterlockedExchangeAdd64(m,v);
+}
+
+INLINE int64_t atomic_cmpxchg(volatile int64_t* m, const int64_t v, const int64_t c) {
+  return _InterlockedCompareExchange64(m,v,c);
+}
+
+#else
+
+typedef int32_t atomic_t;
+
+#endif /* defined(__X86_64__) */
+
+#else
+
+INLINE unsigned int __popcnt(unsigned int in) {
+  int r = 0; asm ("popcnt %1,%0" : "=r"(r) : "r"(in)); return r;
+}
+
+INLINE int __bsf(int v) {
+  int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+}
+
+INLINE int __bsr(int v) {
+  int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+}
+
+INLINE int __btc(int v, int i) {
+  int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+}
+
+INLINE int __bts(int v, int i) {
+  int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+}
+
+INLINE int __btr(int v, int i) {
+  int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+}
+
+INLINE size_t __bsf(size_t v) {
+  size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
+}
+
+INLINE size_t __bsr(size_t v) {
+  size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
+}
+
+INLINE size_t __btc(size_t v, size_t i) {
+  size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+}
+
+INLINE size_t __bts(size_t v, size_t i) {
+  size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+}
+
+INLINE size_t __btr(size_t v, size_t i) {
+  size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
+}
+
+INLINE void memoryFence(void) { _mm_mfence(); }
+
+typedef int32_t atomic32_t;
+
+INLINE int32_t atomic_add(int32_t volatile* value, int32_t input)
+{  asm volatile("lock xadd %0,%1" : "+r" (input), "+m" (*value) : "r" (input), "m" (*value)); return input; }
+
+INLINE int32_t atomic_cmpxchg(int32_t volatile* value, const int32_t input, int32_t comparand)
+{  asm volatile("lock cmpxchg %2,%0" : "=m" (*value), "=a" (comparand) : "r" (input), "m" (*value), "a" (comparand) : "flags"); return comparand; }
+
+#if defined(__X86_64__)
+
+  typedef int64_t atomic_t;
+
+  INLINE int64_t atomic_add(int64_t volatile* value, int64_t input)
+  {  asm volatile("lock xaddq %0,%1" : "+r" (input), "+m" (*value) : "r" (input), "m" (*value));  return input;  }
+
+  INLINE int64_t atomic_cmpxchg(int64_t volatile* value, const int64_t input, int64_t comparand)
+  {  asm volatile("lock cmpxchgq %2,%0" : "+m" (*value), "+a" (comparand) : "r" (input), "m" (*value), "r" (comparand) : "flags"); return comparand;  }
+
+#else
+
+  typedef int32_t atomic_t;
+
+#endif /* defined(__X86_64__) */
+
+#define GBE_COMPILER_READ_WRITE_BARRIER    asm volatile("" ::: "memory");
+#define GBE_COMPILER_WRITE_BARRIER         GBE_COMPILER_READ_WRITE_BARRIER
+#define GBE_COMPILER_READ_BARRIER          GBE_COMPILER_READ_WRITE_BARRIER
+
+#endif /* __MSVC__ */
+
+template <typename T>
+INLINE T __load_acquire(volatile T *ptr)
+{
+  GBE_COMPILER_READ_WRITE_BARRIER;
+  T x = *ptr; // for x86, load == load_acquire
+  GBE_COMPILER_READ_WRITE_BARRIER;
+  return x;
+}
+
+template <typename T>
+INLINE void __store_release(volatile T *ptr, T x)
+{
+  GBE_COMPILER_READ_WRITE_BARRIER;
+  *ptr = x; // for x86, store == store_release
+  GBE_COMPILER_READ_WRITE_BARRIER;
+}
+#endif /* __GBE_INTRINSICS_HPP__ */
+
diff --git a/backend/src/sys/intrusive_list.cpp b/backend/src/sys/intrusive_list.cpp
new file mode 100644
index 0000000..ed7067c
--- /dev/null
+++ b/backend/src/sys/intrusive_list.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2007 Maciej Sinilo
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "intrusive_list.hpp"
+
+namespace gbe
+{
+  intrusive_list_base::intrusive_list_base() : m_root() {}
+
+  intrusive_list_base::size_type intrusive_list_base::size() const {
+    size_type numNodes(0);
+    const intrusive_list_node* iter = &m_root;
+    do {
+      iter = iter->next;
+      ++numNodes;
+    } while (iter != &m_root);
+    return numNodes - 1;
+  }
+
+  void append(intrusive_list_node *node, intrusive_list_node *prev) {
+    GBE_ASSERT(!node->in_list());
+    node->next = prev->next;
+    node->next->prev = node;
+    prev->next = node;
+    node->prev = prev;
+  }
+
+  void prepend(intrusive_list_node *node, intrusive_list_node *next) {
+    GBE_ASSERT(!node->in_list());
+    node->prev = next->prev;
+    node->prev->next = node;
+    next->prev = node;
+    node->next = next;
+  }
+
+  void link(intrusive_list_node* node, intrusive_list_node* nextNode) {
+    prepend(node, nextNode);
+  }
+
+  void unlink(intrusive_list_node* node) {
+    GBE_ASSERT(node->in_list());
+    node->prev->next = node->next;
+    node->next->prev = node->prev;
+    node->next = node->prev = node;
+  }
+} /* namespace gbe */
+
diff --git a/backend/src/sys/intrusive_list.hpp b/backend/src/sys/intrusive_list.hpp
new file mode 100644
index 0000000..2e2f2a9
--- /dev/null
+++ b/backend/src/sys/intrusive_list.hpp
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2007 Maciej Sinilo
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef __GBE_INTRUSIVE_LIST_HPP__
+#define __GBE_INTRUSIVE_LIST_HPP__
+
+#include "sys/platform.hpp"
+
+namespace gbe
+{
+  /*! List elements must inherit from it */
+  struct intrusive_list_node
+  {
+    INLINE intrusive_list_node(void) { next = prev = this; }
+    INLINE bool in_list(void) const  { return this != next; }
+    intrusive_list_node *next;
+    intrusive_list_node *prev;
+  };
+
+  /*! Insert node such that prev -> node */
+  void append(intrusive_list_node *node, intrusive_list_node *prev);
+  /*! Insert node such that node -> next */
+  void prepend(intrusive_list_node *node, intrusive_list_node *next);
+  /*! Same as prepend */
+  void link(intrusive_list_node* node, intrusive_list_node* nextNode);
+  /*! Remove the node from its current list */
+  void unlink(intrusive_list_node* node);
+
+  template<typename Pointer, typename Reference>
+  class intrusive_list_iterator
+  {
+  public:
+    typedef Pointer pointer;
+    typedef Reference reference;
+
+    INLINE intrusive_list_iterator(void): m_node(0) {}
+    INLINE intrusive_list_iterator(Pointer iterNode) : m_node(iterNode) {}
+
+    INLINE Reference operator*(void) const {
+      GBE_ASSERT(m_node);
+      return *m_node;
+    }
+    INLINE Pointer operator->(void) const { return m_node; }
+    INLINE Pointer node(void) const { return m_node; }
+
+    INLINE intrusive_list_iterator& operator++(void) {
+      m_node = static_cast<Pointer>(m_node->next);
+      return *this;
+    }
+    INLINE intrusive_list_iterator& operator--(void) {
+      m_node = static_cast<Pointer>(m_node->prev);
+      return *this;
+    }
+    INLINE intrusive_list_iterator operator++(int) {
+      intrusive_list_iterator copy(*this);
+      ++(*this);
+      return copy;
+    }
+    INLINE intrusive_list_iterator operator--(int) {
+      intrusive_list_iterator copy(*this);
+      --(*this);
+      return copy;
+    }
+
+    INLINE bool operator== (const intrusive_list_iterator& rhs) const {
+      return rhs.m_node == m_node;
+    }
+    INLINE bool operator!= (const intrusive_list_iterator& rhs) const {
+      return !(rhs == *this);
+    }
+  private:
+    Pointer m_node;
+  };
+
+  class intrusive_list_base
+  {
+  public:
+    typedef size_t size_type;
+
+    INLINE void pop_back(void) { unlink(m_root.prev); }
+    INLINE void pop_front(void) { unlink(m_root.next); }
+    INLINE bool empty(void) const  { return !m_root.in_list(); }
+    size_type size(void) const;
+
+  protected:
+    intrusive_list_base(void);
+    INLINE ~intrusive_list_base(void) {}
+
+    intrusive_list_node m_root;
+
+  private:
+    intrusive_list_base(const intrusive_list_base&);
+    intrusive_list_base& operator=(const intrusive_list_base&);
+  };
+
+  template<class T>
+  class intrusive_list : public intrusive_list_base
+  {
+  public:
+    typedef T node_type;
+    typedef T value_type;
+    typedef intrusive_list_iterator<T*, T&> iterator;
+    typedef intrusive_list_iterator<const T*, const T&> const_iterator;
+
+    intrusive_list(void) : intrusive_list_base() {
+      intrusive_list_node* testNode((T*)0);
+      static_cast<void>(sizeof(testNode));
+    }
+
+    void push_back(value_type* v) { link(v, &m_root); }
+    void push_front(value_type* v) { link(v, m_root.next); }
+
+    iterator begin(void)  { return iterator(upcast(m_root.next)); }
+    iterator end(void)    { return iterator(upcast(&m_root)); }
+    iterator rbegin(void) { return iterator(upcast(m_root.prev)); }
+    iterator rend(void)   { return iterator(upcast(&m_root)); }
+    const_iterator begin(void) const  { return const_iterator(upcast(m_root.next)); }
+    const_iterator end(void) const    { return const_iterator(upcast(&m_root)); }
+    const_iterator rbegin(void) const { return const_iterator(upcast(m_root.prev)); }
+    const_iterator rend(void) const   { return const_iterator(upcast(&m_root)); }
+
+    INLINE value_type* front(void) { return upcast(m_root.next); }
+    INLINE value_type* back(void)  { return upcast(m_root.prev); }
+    INLINE const value_type* front(void) const { return upcast(m_root.next); }
+    INLINE const value_type* back(void) const  { return upcast(m_root.prev); }
+
+    iterator insert(iterator pos, value_type* v) {
+      link(v, pos.node());
+      return iterator(v);
+    }
+    iterator erase(iterator it) {
+      iterator itErase(it);
+      ++it;
+      unlink(itErase.node());
+      return it;
+    }
+    iterator erase(iterator first, iterator last) {
+      while (first != last) first = erase(first);
+      return first;
+    }
+
+    void clear(void) { erase(begin(), end()); }
+    void fast_clear(void) { m_root.next = m_root.prev = &m_root; }
+    static void remove(value_type* v) { unlink(v); }
+
+  private:
+    static INLINE node_type* upcast(intrusive_list_node* n) {
+      return static_cast<node_type*>(n);
+    }
+    static INLINE const node_type* upcast(const intrusive_list_node* n) {
+      return static_cast<const node_type*>(n);
+    }
+  };
+} /* namespace gbe */
+
+#endif /* __GBE_INTRUSIVE_LIST_HPP__ */
+
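A short sketch (the Task type is hypothetical) showing the intended pattern: elements embed their links by deriving from intrusive_list_node, so insertion, iteration and removal never allocate:

    #include "sys/intrusive_list.hpp"

    struct Task : gbe::intrusive_list_node {
      Task(int id) : id(id) {}
      int id;
    };

    void demo(void) {
      gbe::intrusive_list<Task> pending;
      Task a(0), b(1);                           // storage owned by the caller
      pending.push_back(&a);
      pending.push_back(&b);
      for (gbe::intrusive_list<Task>::iterator it = pending.begin();
           it != pending.end(); ++it)
        it->id += 1;                             // no allocation while iterating
      gbe::intrusive_list<Task>::remove(&a);     // same as unlink(&a)
    }
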
diff --git a/backend/src/sys/list.hpp b/backend/src/sys/list.hpp
new file mode 100644
index 0000000..51b9c39
--- /dev/null
+++ b/backend/src/sys/list.hpp
@@ -0,0 +1,65 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file list.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_LIST_HPP__
+#define __GBE_LIST_HPP__
+
+#include "sys/platform.hpp"
+#include <list>
+
+namespace gbe
+{
+  /*! Use custom allocator instead of std one */
+  template <typename T>
+  class list : public std::list<T, Allocator<T>>
+  {
+  public:
+    // Typedefs
+    typedef T value_type;
+    typedef Allocator<value_type> allocator_type;
+    typedef std::list<T, allocator_type> parent_type;
+    typedef typename allocator_type::size_type size_type;
+
+    /*! Default constructor */
+    INLINE explicit list(const allocator_type &a = allocator_type()) :
+      parent_type(a) {}
+    /*! Repetitive constructor */
+    INLINE explicit list(size_type n,
+                         const T &value = T(),
+                         const allocator_type &a = allocator_type()) :
+      parent_type(n, value, a) {}
+    /*! Iteration constructor */
+    template <class InputIterator>
+    INLINE list(InputIterator first,
+                InputIterator last,
+                const allocator_type &a = allocator_type()) :
+      parent_type(first, last, a) {}
+    /*! Copy constructor */
+    INLINE list(const list &x) : parent_type(x) {}
+    GBE_CLASS(list);
+  };
+} /* namespace gbe */
+
+#endif /* __GBE_LIST_HPP__ */
+
diff --git a/backend/src/sys/map.hpp b/backend/src/sys/map.hpp
new file mode 100644
index 0000000..1c72400
--- /dev/null
+++ b/backend/src/sys/map.hpp
@@ -0,0 +1,75 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file map.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_MAP_HPP__
+#define __GBE_MAP_HPP__
+
+#include "sys/platform.hpp"
+#include <map>
+
+namespace gbe
+{
+  /*! Use custom allocator instead of std one */
+  template<class Key, class T, class Pred = std::less<Key>>
+  class map : public std::map<Key,T,Pred,Allocator<std::pair<const Key, T>>>,
+              public NonCopyable
+  {
+  public:
+    // Typedefs
+    typedef std::pair<const Key, T> value_type;
+    typedef Allocator<value_type> allocator_type;
+    typedef std::map<Key,T,Pred,allocator_type> parent_type;
+    typedef Key key_type;
+    typedef T mapped_type;
+    typedef Pred key_compare;
+    typedef typename allocator_type::pointer pointer;
+    typedef typename allocator_type::const_pointer const_pointer;
+    typedef typename allocator_type::reference reference;
+    typedef typename allocator_type::const_reference const_reference;
+
+    /*! Default constructor */
+    INLINE map(const key_compare &comp = key_compare(),
+               const allocator_type &a = allocator_type()) :
+      parent_type(comp, a) {}
+    /*! Iteration constructor */
+    template<class InputIterator>
+    INLINE map(InputIterator first,
+               InputIterator last,
+               const key_compare &comp = key_compare(),
+               const allocator_type& a = allocator_type()) :
+      parent_type(first, last, comp, a) {}
+#if 0
+    /*! Copy constructor */
+    INLINE map(const map& x) : parent_type(x) {}
+#endif
+    /*! Better than using find if we do not care about the iterator itself */
+    INLINE bool contains(const Key &key) const {
+      return this->find(key) != this->end();
+    }
+    GBE_CLASS(map);
+  };
+} /* namespace gbe */
+
+#endif /* __GBE_MAP_HPP__ */
+
diff --git a/backend/src/sys/mutex.cpp b/backend/src/sys/mutex.cpp
new file mode 100644
index 0000000..9640150
--- /dev/null
+++ b/backend/src/sys/mutex.cpp
@@ -0,0 +1,48 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "sys/mutex.hpp"
+
+#if defined(__WIN32__)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace gbe
+{
+  /*! system mutex using windows API */
+  MutexSys::MutexSys( void ) { mutex = new CRITICAL_SECTION; InitializeCriticalSection((CRITICAL_SECTION*)mutex); }
+  MutexSys::~MutexSys( void ) { DeleteCriticalSection((CRITICAL_SECTION*)mutex); delete ((CRITICAL_SECTION*)mutex); }
+  void MutexSys::lock( void ) { EnterCriticalSection((CRITICAL_SECTION*)mutex); }
+  void MutexSys::unlock( void ) { LeaveCriticalSection((CRITICAL_SECTION*)mutex); }
+}
+#endif
+
+#if defined(__UNIX__)
+#include <pthread.h>
+
+namespace gbe
+{
+  /*! system mutex using pthreads */
+  MutexSys::MutexSys( void ) { mutex = new pthread_mutex_t; pthread_mutex_init((pthread_mutex_t*)mutex, NULL); }
+  MutexSys::~MutexSys( void ) { pthread_mutex_destroy((pthread_mutex_t*)mutex); delete ((pthread_mutex_t*)mutex); }
+  void MutexSys::lock( void ) { pthread_mutex_lock((pthread_mutex_t*)mutex); }
+  void MutexSys::unlock( void ) { pthread_mutex_unlock((pthread_mutex_t*)mutex); }
+}
+#endif
+
diff --git a/backend/src/sys/mutex.hpp b/backend/src/sys/mutex.hpp
new file mode 100644
index 0000000..1a462b0
--- /dev/null
+++ b/backend/src/sys/mutex.hpp
@@ -0,0 +1,74 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef __GBE_MUTEX_HPP__
+#define __GBE_MUTEX_HPP__
+
+#include "platform.hpp"
+#include "atomic.hpp"
+#include <xmmintrin.h>
+
+namespace gbe
+{
+  class MutexSys {
+    friend class ConditionSys;
+  public:
+    MutexSys(void);
+    ~MutexSys(void);
+    void lock(void);
+    void unlock(void);
+  protected:
+    void* mutex;
+    MutexSys(const MutexSys&); // don't implement
+    MutexSys& operator= (const MutexSys&); // don't implement
+    GBE_CLASS(MutexSys);
+  };
+
+  /*! Active mutex: a spin lock built on compare-and-swap */
+  class MutexActive {
+  public:
+    INLINE MutexActive(void) : _lock(LOCK_IS_FREE) {}
+    INLINE void lock(void) {
+      GBE_COMPILER_READ_BARRIER;
+      while (cmpxchg(_lock, LOCK_IS_TAKEN, LOCK_IS_FREE) != LOCK_IS_FREE)
+        _mm_pause();
+      GBE_COMPILER_READ_BARRIER;
+    }
+    INLINE void unlock(void) { _lock.storeRelease(LOCK_IS_FREE); }
+  protected:
+    enum { LOCK_IS_FREE = 0, LOCK_IS_TAKEN = 1 };
+    Atomic _lock;
+    MutexActive(const MutexActive&); // don't implement
+    MutexActive& operator=(const MutexActive&); // don't implement
+    GBE_CLASS(MutexActive);
+  };
+
+  /*! RAII helper: locks the mutex on construction, unlocks it on destruction */
+  template<typename Mutex> class Lock {
+  public:
+    Lock (Mutex& mutex) : mutex(mutex) { mutex.lock(); }
+    ~Lock() { mutex.unlock(); }
+  protected:
+    Mutex& mutex;
+    Lock(const Lock&); // don't implement
+    Lock& operator= (const Lock&); // don't implement
+    GBE_CLASS(Lock);
+  };
+}
+
+#endif /* __GBE_MUTEX_HPP__ */
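For reference (the registry names are hypothetical), the spin lock and the RAII guard above are meant to be combined as follows; MutexSys is used the same way when blocking in the kernel is preferable to spinning:

    #include "sys/mutex.hpp"

    static gbe::MutexActive registryLock;
    static int registrySize = 0;

    void registerKernel(void) {
      gbe::Lock<gbe::MutexActive> guard(registryLock);  // unlocked by ~Lock
      registrySize++;                                   // protected section
    }
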
diff --git a/backend/src/sys/platform.cpp b/backend/src/sys/platform.cpp
new file mode 100644
index 0000000..95768ee
--- /dev/null
+++ b/backend/src/sys/platform.cpp
@@ -0,0 +1,79 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "sys/platform.hpp"
+#include "sys/intrinsics.hpp"
+#include <string>
+
+////////////////////////////////////////////////////////////////////////////////
+/// Windows Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef __WIN32__
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+namespace gbe
+{
+  double getSeconds() {
+    LARGE_INTEGER freq, val;
+    QueryPerformanceFrequency(&freq);
+    QueryPerformanceCounter(&val);
+    return (double)val.QuadPart / (double)freq.QuadPart;
+  }
+
+  void FATAL(const std::string &msg) {
+    std::cerr << msg << std::endl;
+    MessageBox(NULL, msg.c_str(), "Fatal Error", MB_OK | MB_ICONEXCLAMATION);
+    GBE_ASSERT(0);
+#ifdef __GNUC__
+    exit(-1);
+#else
+    _exit(-1);
+#endif /* __GNUC__ */
+  }
+
+} /* namespace gbe */
+#endif /* __WIN32__ */
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unix Platform
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__UNIX__)
+
+#include <sys/time.h>
+#include <unistd.h>
+
+namespace gbe
+{
+  double getSeconds() {
+    struct timeval tp; gettimeofday(&tp,NULL);
+    return double(tp.tv_sec) + double(tp.tv_usec)/1E6;
+  }
+
+  void FATAL(const std::string &msg) {
+    std::cerr << msg << std::endl;
+    GBE_ASSERT(0);
+    _exit(-1);
+  }
+} /* namespace gbe */
+
+#endif /* __UNIX__ */
+
diff --git a/backend/src/sys/platform.hpp b/backend/src/sys/platform.hpp
new file mode 100644
index 0000000..b8a2841
--- /dev/null
+++ b/backend/src/sys/platform.hpp
@@ -0,0 +1,441 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_PLATFORM_HPP__
+#define __GBE_PLATFORM_HPP__
+
+#include <cstddef>
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+#include <ostream>
+#include <istream>
+#include <string>
+#include <cassert>
+#include <new>
+
+////////////////////////////////////////////////////////////////////////////////
+/// CPU architecture
+////////////////////////////////////////////////////////////////////////////////
+
+/* detect 32 or 64 platform */
+#if defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
+#define __X86_64__
+#else
+#define __X86__
+#endif
+
+/* We require SSE ... */
+#ifndef __SSE__
+#define __SSE__
+#endif
+
+/* ... and SSE2 */
+#ifndef __SSE2__
+#define __SSE2__
+#endif
+
+#if defined(_INCLUDED_IMM)
+// #define __AVX__
+#endif
+
+#if defined(_MSC_VER) && (_MSC_VER < 1600) && !defined(__INTEL_COMPILER) || defined(_DEBUG) && defined(_WIN32)
+#define __NO_AVX__
+#endif
+
+#if defined(_MSC_VER) && !defined(__SSE4_2__)
+// #define __SSE4_2__  //! activates SSE4.2 support
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Operating system
+////////////////////////////////////////////////////////////////////////////////
+
+/* detect Linux platform */
+#if defined(linux) || defined(__linux__) || defined(__LINUX__)
+#  if !defined(__LINUX__)
+#     define __LINUX__
+#  endif
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* detect FreeBSD platform */
+#if defined(__FreeBSD__) || defined(__FREEBSD__)
+#  if !defined(__FREEBSD__)
+#     define __FREEBSD__
+#  endif
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* detect Windows 95/98/NT/2000/XP/Vista/7 platform */
+#if (defined(WIN32) || defined(_WIN32) || defined(__WIN32__) || defined(__NT__)) && !defined(__CYGWIN__)
+#  if !defined(__WIN32__)
+#     define __WIN32__
+#  endif
+#endif
+
+/* detect Cygwin platform */
+#if defined(__CYGWIN__)
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* detect MAC OS X platform */
+#if defined(__APPLE__) || defined(MACOSX) || defined(__MACOSX__)
+#  if !defined(__MACOSX__)
+#     define __MACOSX__
+#  endif
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+/* try to detect other Unix systems */
+#if defined(__unix__) || defined (unix) || defined(__unix) || defined(_unix)
+#  if !defined(__UNIX__)
+#     define __UNIX__
+#  endif
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Compiler
+////////////////////////////////////////////////////////////////////////////////
+
+/*! GCC compiler */
+#ifdef __GNUC__
+// #define __GNUC__
+#endif
+
+/*! Intel compiler */
+#ifdef __INTEL_COMPILER
+#define __ICC__
+#endif
+
+/*! Visual C compiler */
+#ifdef _MSC_VER
+#define __MSVC__
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Macros
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef __WIN32__
+#define __dllexport extern "C" __declspec(dllexport)
+#define __dllimport extern "C" __declspec(dllimport)
+#else
+#define __dllexport extern "C"
+#define __dllimport extern "C"
+#endif
+
+#ifdef __MSVC__
+#undef NOINLINE
+#define NOINLINE             __declspec(noinline)
+#define INLINE               __forceinline
+#define RESTRICT             __restrict
+#define THREAD               __declspec(thread)
+#define ALIGNED(...)         __declspec(align(__VA_ARGS__))
+//#define __FUNCTION__           __FUNCTION__
+#define DEBUGBREAK()         __debugbreak()
+#else
+#undef NOINLINE
+#undef INLINE
+#define NOINLINE        __attribute__((noinline))
+#define INLINE          inline __attribute__((always_inline))
+#define RESTRICT        __restrict
+#define THREAD          __thread
+#define ALIGNED(...)    __attribute__((aligned(__VA_ARGS__)))
+#define __FUNCTION__    __PRETTY_FUNCTION__
+#define DEBUGBREAK()    asm ("int $3")
+#endif
+
+/*! Modern x86 processors */
+#define CACHE_LINE 64
+#define CACHE_LINE_ALIGNED ALIGNED(CACHE_LINE)
+
+#ifdef __GNUC__
+  #define MAYBE_UNUSED __attribute__((used))
+#else
+  #define MAYBE_UNUSED
+#endif
+
+#if defined(_MSC_VER)
+#define __builtin_expect(expr,b) expr
+#endif
+
+/*! Debug syntactic sugar */
+#if GBE_DEBUG
+#define IF_DEBUG(EXPR) EXPR
+#else
+#define IF_DEBUG(EXPR)
+#endif /* GBE_DEBUG */
+
+/*! Debug printing macros */
+#define STRING(x) #x
+#define PING std::cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << std::endl
+#define PRINT(x) std::cout << STRING(x) << " = " << (x) << std::endl
+
+/*! Branch hint */
+#define LIKELY(x)       __builtin_expect(!!(x),1)
+#define UNLIKELY(x)     __builtin_expect((x),0)
+
+/*! Stringify macros */
+#define JOIN(X, Y) _DO_JOIN(X, Y)
+#define _DO_JOIN(X, Y) _DO_JOIN2(X, Y)
+#define _DO_JOIN2(X, Y) X##Y
+
+/*! Run-time assertion */
+#if GBE_DEBUG
+#define GBE_ASSERT(EXPR) do { \
+  if (UNLIKELY(!(EXPR))) \
+    gbe::onFailedAssertion(#EXPR, __FILE__, __FUNCTION__, __LINE__); \
+} while (0)
+#define GBE_ASSERTM(EXPR, MSG) do { \
+  if (UNLIKELY(!(EXPR))) \
+    gbe::onFailedAssertion(MSG, __FILE__, __FUNCTION__, __LINE__); \
+} while (0)
+#else
+#define GBE_ASSERT(EXPR) do { } while (0)
+#define GBE_ASSERTM(EXPR, MSG) do { } while (0)
+#endif /* GBE_DEBUG */
+
+#define NOT_IMPLEMENTED GBE_ASSERTM (false, "Not implemented")
+#define NOT_SUPPORTED GBE_ASSERTM (false, "Not supported")
+
+/*! Fatal error macros */
+#define FATAL_IF(COND, MSG) \
+do { \
+  if(UNLIKELY(COND)) FATAL(MSG); \
+} while (0)
+
+/* Safe deletion macros */
+#define GBE_SAFE_DELETE_ARRAY(x) do { if (x != NULL) GBE_DELETE_ARRAY(x); } while (0)
+#define GBE_SAFE_DELETE(x) do { if (x != NULL) GBE_DELETE(x); } while (0)
+
+/* Number of elements in an array */
+#define ARRAY_ELEM_NUM(x) (sizeof(x) / sizeof(x[0]))
+
+/* Align X on A */
+#define ALIGN(X,A) (((X) % (A)) ? ((X) + (A) - ((X) % (A))) : (X))
+
+/*! Produce a string from the macro location */
+#define HERE (STRING(__LINE__) "@" __FILE__)
+
+/*! Typesafe encapsulation of a type (mostly for integers) */
+#define TYPE_SAFE(SAFE, UNSAFE) \
+class SAFE \
+{ \
+public: \
+  INLINE SAFE(void) {} \
+  explicit INLINE SAFE(uint16_t unsafe) : unsafe(unsafe) {} \
+  INLINE operator UNSAFE (void) const { return unsafe; } \
+  UNSAFE value(void) const { return unsafe; } \
+private: \
+  UNSAFE unsafe; \
+};
+
+/*! Default alignment for the platform */
+#define GBE_DEFAULT_ALIGNMENT 16
+
+/*! Useful constants */
+#define KB 1024
+#define MB (KB*KB)
+
+/*! Portable AlignOf */
+template <typename T>
+struct AlignOf {
+  struct Helper { char x; T t; };
+  enum { value = offsetof(Helper, t) };
+};
+
+// gcc 4.8+ supports the C++11 alignof keyword
+#if (__GNUC__ >= 4 && __GNUC_MINOR__ >= 8)
+#define ALIGNOF(T) (alignof(T))
+#else
+#define ALIGNOF(T) (AlignOf<T>::value)
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+/// Visibility parameters (DLL export and so on)
+////////////////////////////////////////////////////////////////////////////////
+#if defined __WIN32__
+  #if defined __GNUC__
+    #define GBE_EXPORT_SYMBOL __attribute__ ((dllexport))
+    #define GBE_IMPORT_SYMBOL __attribute__ ((dllimport))
+  #else
+    #define GBE_IMPORT_SYMBOL __declspec(dllimport)
+    #define GBE_EXPORT_SYMBOL __declspec(dllexport)
+  #endif /* __GNUC__ */
+#else
+  #define GBE_EXPORT_SYMBOL __attribute__ ((visibility ("default")))
+  #define GBE_IMPORT_SYMBOL
+#endif /* __WIN32__ */
+
+////////////////////////////////////////////////////////////////////////////////
+/// Basic Types
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__MSVC__)
+typedef          __int64_t  int64_t;
+typedef unsigned __int64_t uint64_t;
+typedef          __int32_t  int32_t;
+typedef unsigned __int32_t uint32_t;
+typedef          __int16_t  int16_t;
+typedef unsigned __int16_t uint16_t;
+typedef          __int8_t    int8_t;
+typedef unsigned __int8_t   uint8_t;
+#else
+#include <cstdint>
+#endif
+
+#if defined(__X86_64__)
+typedef int64_t index_t;
+#else
+typedef int32_t index_t;
+#endif
+
+/*! To protect some classes from being copied */
+class NonCopyable
+{
+protected:
+  INLINE NonCopyable(void) {}
+  INLINE ~NonCopyable(void) {}
+private: 
+  INLINE NonCopyable(const NonCopyable&) {}
+  INLINE NonCopyable& operator= (const NonCopyable&) {return *this;}
+};
+
+#define TO_MAGIC(A, B, C, D)  (A<<24 | B<<16 | C<<8 | D)
+
+class Serializable
+{
+public:
+  INLINE Serializable(void) = default;
+  INLINE Serializable(const Serializable&) = default;
+  INLINE Serializable& operator= (const Serializable&) = default;
+
+  virtual size_t serializeToBin(std::ostream& outs) = 0;
+  virtual size_t deserializeFromBin(std::istream& ins) = 0;
+
+  /* These two will follow LLVM's ABI. */
+  virtual size_t serializeToLLVM(void) { return 0;/* not implemented now. */}
+  virtual size_t deserializeFromLLVM(void) { return 0;/* not implemented now. */}
+
+  virtual void printStatus(int indent = 0, std::ostream& outs = std::cout) { }
+
+  virtual ~Serializable(void) { }
+
+protected:
+  static std::string indent_to_str(int indent) {
+    std::string ind(indent, ' ');
+    return ind;
+  }
+};
+
+/* Helper macros for serialization. */
+#define SERIALIZE_OUT(elt, out, sz)			\
+     do {						\
+	  auto tmp_val = elt;				\
+	  out.write((char *)(&tmp_val), sizeof(elt));	\
+	  sz += sizeof(elt);				\
+     } while(0)
+
+#define DESERIALIZE_IN(elt, in, sz)			\
+     do {						\
+	  in.read((char *)(&(elt)), sizeof(elt));	\
+	  sz += sizeof(elt);				\
+     } while(0)
+
+////////////////////////////////////////////////////////////////////////////////
+/// Disable some compiler warnings
+////////////////////////////////////////////////////////////////////////////////
+
+#ifdef __ICC__
+#pragma warning(disable:265)  // floating-point operation result is out of range
+#pragma warning(disable:383)  // value copied to temporary, reference to temporary used
+#pragma warning(disable:869)  // parameter was never referenced
+#pragma warning(disable:981)  // operands are evaluated in unspecified order
+#pragma warning(disable:1418) // external function definition with no prior declaration
+#pragma warning(disable:1419) // external declaration in primary source file
+#pragma warning(disable:1572) // floating-point equality and inequality comparisons are unreliable
+#pragma warning(disable:1125) // virtual function override intended?
+#endif /* __ICC__ */
+
+////////////////////////////////////////////////////////////////////////////////
+/// Default Includes and Functions
+////////////////////////////////////////////////////////////////////////////////
+
+#include "sys/alloc.hpp"
+
+namespace gbe
+{
+  /*! selects */
+  INLINE bool  select(bool s, bool  t , bool f) { return s ? t : f; }
+  INLINE int   select(bool s, int   t,   int f) { return s ? t : f; }
+  INLINE float select(bool s, float t, float f) { return s ? t : f; }
+
+  /*! Fatal error function */
+  void FATAL(const std::string&);
+
+  /*! Return the next power of 2 */
+  INLINE uint32_t nextHighestPowerOf2(uint32_t x) {
+    x--;
+    x |= x >> 1;
+    x |= x >> 2;
+    x |= x >> 4;
+    x |= x >> 8;
+    x |= x >> 16;
+    return ++x;
+  }
+
+  INLINE uint32_t logi2(uint32_t x) {
+    uint32_t r = 0;
+    while(x >>= 1) r++;
+    return r;
+  }
+
+  template<uint32_t N>
+  INLINE uint32_t isPowerOf(uint32_t i) {
+    while (i > 1) {
+      if (i%N) return false;
+      i = i/N;
+    }
+    return true;
+  }
+  template<> INLINE uint32_t isPowerOf<2>(uint32_t i) { return ((i-1)&i) == 0; }
+
+  /*! random functions */
+  template<typename T> T     random() { return T(0); }
+  template<> INLINE int32_t  random() { return int(rand()); }
+  template<> INLINE uint32_t random() { return uint32_t(rand()); }
+  template<> INLINE float    random() { return random<uint32_t>()/float(RAND_MAX); }
+  template<> INLINE double   random() { return random<uint32_t>()/double(RAND_MAX); }
+
+  /** returns performance counter in seconds */
+  double getSeconds();
+
+} /* namespace gbe */
+
+#endif /* __GBE_PLATFORM_HPP__ */
+
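Two small sketches (hypothetical helper names) of the utility macros and functions declared above:

    #include "sys/platform.hpp"

    size_t paddedSize(size_t payload) {
      // round up to the 16-byte default alignment, e.g. 100 -> 112
      return ALIGN(payload, GBE_DEFAULT_ALIGNMENT);
    }

    uint32_t pickTableSize(uint32_t n) {
      const uint32_t sz = gbe::nextHighestPowerOf2(n);  // e.g. 100 -> 128
      GBE_ASSERT(gbe::isPowerOf<2>(sz));
      return sz;
    }
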
diff --git a/backend/src/sys/set.hpp b/backend/src/sys/set.hpp
new file mode 100644
index 0000000..db68807
--- /dev/null
+++ b/backend/src/sys/set.hpp
@@ -0,0 +1,70 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file set.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __GBE_SET_HPP__
+#define __GBE_SET_HPP__
+
+#include "sys/platform.hpp"
+#include <set>
+
+namespace gbe
+{
+  /*! Add our custom allocator to std::set */
+  template<class Key, class Pred = std::less<Key>>
+  class set : public std::set<Key,Pred,Allocator<Key>>, public NonCopyable
+  {
+  public:
+    // Typedefs
+    typedef Key value_type;
+    typedef Allocator<value_type> allocator_type;
+    typedef std::set<Key,Pred,Allocator<Key>> parent_type;
+    typedef Key key_type;
+    typedef Pred key_compare;
+
+    /*! Default constructor */
+    INLINE set(const key_compare &comp = key_compare(),
+               const allocator_type &a = allocator_type()) :
+      parent_type(comp, a) {}
+    /*! Iteration constructor */
+    template<class InputIterator>
+    INLINE set(InputIterator first,
+               InputIterator last,
+               const key_compare &comp = key_compare(),
+               const allocator_type& a = allocator_type()) :
+      parent_type(first, last, comp, a) {}
+#if 0
+    /*! Copy constructor */
+    INLINE set(const set& x) : parent_type(x) {}
+#endif
+    /*! Better than using find if we do not care about the iterator itself */
+    INLINE bool contains(const Key &key) const {
+      return this->find(key) != this->end();
+    }
+    GBE_CLASS(set);
+  };
+
+} /* namespace gbe */
+
+#endif /* __GBE_SET_HPP__ */
+
diff --git a/backend/src/sys/vector.hpp b/backend/src/sys/vector.hpp
new file mode 100644
index 0000000..dc89991
--- /dev/null
+++ b/backend/src/sys/vector.hpp
@@ -0,0 +1,79 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file vector.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __GBE_VECTOR_HPP__
+#define __GBE_VECTOR_HPP__
+
+#include "sys/platform.hpp"
+#include <vector>
+
+namespace gbe
+{
+  /*! Add bound checks to the standard vector class and use the internal
+   *  allocator
+   */
+  template<class T>
+  class vector : public std::vector<T, Allocator<T>>
+  {
+  public:
+    // Typedefs
+    typedef std::vector<T, Allocator<T>>       parent_type;
+    typedef Allocator<T>                       allocator_type;
+    typedef typename allocator_type::size_type size_type;
+    typedef typename parent_type::iterator     iterator;
+
+    /*! Default constructor */
+    INLINE explicit vector(const allocator_type &a = allocator_type()) :
+      parent_type(a) {}
+#if 0
+    /*! Copy constructor */
+    INLINE vector(const vector &x) : parent_type(x) {}
+#endif
+    /*! Repetitive sequence constructor */
+    INLINE explicit vector(size_type n,
+                           const T& value= T(),
+                           const allocator_type &a = allocator_type()) :
+      parent_type(n, value, a) {}
+    /*! Iteration constructor */
+    template <class InputIterator>
+    INLINE vector(InputIterator first,
+                  InputIterator last,
+                  const allocator_type &a = allocator_type()) :
+      parent_type(first, last, a) {}
+    /*! Get element at position index (with a bound check) */
+    T &operator[] (size_t index) {
+      GBE_ASSERT(index < this->size());
+      return parent_type::operator[] (index);
+    }
+    /*! Get element at position index (with a bound check) */
+    const T &operator[] (size_t index) const {
+      GBE_ASSERT(index < this->size());
+      return parent_type::operator[] (index);
+    }
+    GBE_CLASS(vector);
+  };
+} /* namespace gbe */
+
+#endif /* __GBE_VECTOR_HPP__ */
+
diff --git a/backend/src/update.sh b/backend/src/update.sh
new file mode 100755
index 0000000..0e5f8c0
--- /dev/null
+++ b/backend/src/update.sh
@@ -0,0 +1,3 @@
+#! /bin/sh -e
+./update_as.sh
+./update_convert.sh
diff --git a/backend/src/update_as.sh b/backend/src/update_as.sh
new file mode 100755
index 0000000..c68e789
--- /dev/null
+++ b/backend/src/update_as.sh
@@ -0,0 +1,11 @@
+#! /bin/sh -e
+
+AS_HEADER=ocl_as.h
+
+exec >$AS_HEADER.tmp
+echo "// This file is autogenerated by gen_as.sh."
+echo "// Don't modify it manually."
+./gen_as.sh
+exec >&2
+
+mv $AS_HEADER.tmp $AS_HEADER
diff --git a/backend/src/update_blob_ocl_header.py b/backend/src/update_blob_ocl_header.py
new file mode 100755
index 0000000..50f2501
--- /dev/null
+++ b/backend/src/update_blob_ocl_header.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2012 Intel Corporation
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library. If not, see <http://www.gnu.org/licenses/>.
+#
+# Author: Zhigang Gong <zhigang.gong at linux.intel.com>
+#/
+import sys
+import os
+
+if len(sys.argv) != 3:
+    print "Invalid argument {0}".format(sys.argv)
+    print "use {0} tmpl_file_name output_file_name".format(sys.argv[0])
+    sys.exit(1)
+
+def safeUnlink(filename):
+    try:
+        os.remove(filename)
+    except OSError:
+        pass
+
+header_segments = [ "vector", "as", "convert", "common_defines"]
+blobFileName = sys.argv[2]
+blobTempName = sys.argv[2] + '.tmp'
+safeUnlink(blobFileName)
+tmplFile = open(sys.argv[1], 'r')
+blob = open(blobTempName, 'w')
+path = os.path.dirname(sys.argv[1])
+if path == '':
+    path = '.'
+
+matched_header = ""
+for tline in tmplFile:
+    if matched_header == "":
+        blob.write(tline)
+        for header in header_segments:
+            if tline.strip() == '// ##BEGIN_{0}##'.format(header.upper()) :
+                hFile = open(path + '/ocl_' + header + '.h', 'r')
+                lineNr = 0
+                for hline in hFile:
+                    if lineNr >= 2:  #ignore the 2 lines of comment at the top of file.
+                        blob.write(hline)
+                    lineNr += 1
+                hFile.close()
+                matched_header = header
+    else:
+        if tline.strip() == '// ##END_{0}##'.format(matched_header.upper()) :
+            blob.write(tline)
+            matched_header = "";
+
+tmplFile.close()
+blob.close()
+os.rename(blobTempName, blobFileName)
diff --git a/backend/src/update_convert.sh b/backend/src/update_convert.sh
new file mode 100755
index 0000000..3c47917
--- /dev/null
+++ b/backend/src/update_convert.sh
@@ -0,0 +1,12 @@
+#! /bin/sh -e
+
+CONVERT_HEADER=ocl_convert.h
+
+
+exec >$CONVERT_HEADER.tmp
+echo "// This file is autogenerated by gen_convert.sh."
+echo "// Don't modify it manually."
+./gen_convert.sh
+exec >&2
+
+mv $CONVERT_HEADER.tmp $CONVERT_HEADER
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
new file mode 100644
index 0000000..d96a2e0
--- /dev/null
+++ b/benchmark/CMakeLists.txt
@@ -0,0 +1,21 @@
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}
+                    ${CMAKE_CURRENT_SOURCE_DIR}/../utests
+                    ${CMAKE_CURRENT_SOURCE_DIR}/../include)
+
+
+link_directories (${LLVM_LIBRARY_DIR} ${DRM_LIBDIR})
+set (benchmark_sources
+  ../utests/utest_error.c
+  ../utests/utest_assert.cpp
+  ../utests/utest.cpp
+  ../utests/utest_file_map.cpp
+  ../utests/utest_helper.cpp
+  enqueue_copy_buf.cpp)
+
+ADD_LIBRARY(benchmarks SHARED ${ADDMATHFUNC} ${benchmark_sources})
+
+#TARGET_LINK_LIBRARIES(benchmarks cl m ${OPENGL_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+TARGET_LINK_LIBRARIES(benchmarks cl m)
+
+ADD_EXECUTABLE(benchmark_run benchmark_run.cpp)
+TARGET_LINK_LIBRARIES(benchmark_run benchmarks)
diff --git a/benchmark/benchmark_run.cpp b/benchmark/benchmark_run.cpp
new file mode 100644
index 0000000..b29ccc3
--- /dev/null
+++ b/benchmark/benchmark_run.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file benchmark_run.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ * Just run the benchmarks. The user may optionally provide a subset of them.
+ */
+#include "utest_helper.hpp"
+#include "utest_exception.hpp"
+#include <iostream>
+#include <getopt.h>
+
+static const char *shortopts = "c:lanh";
+struct option longopts[] = {
+{"casename", required_argument, NULL, 'c'},
+{"list", no_argument, NULL, 'l'},
+{"all", no_argument, NULL, 'a'},
+{"allnoissue", no_argument, NULL, 'n'},
+{"help", no_argument, NULL, 'h'},
+{0, 0, 0, 0},
+};
+
+void usage()
+{
+    std::cout << "\
+Usage:\n\
+  ./utest_run <option>\n\
+\n\
+  option:\n\
+    -c <casename>: run sub-case named 'casename'\n\
+    -l           : list all the available case name\n\
+    -a           : run all test cases\n\
+    -n           : run all test cases without known issue (default option)\n\
+    -h           : display this usage\n\
+\
+    "<< std::endl;
+}
+
+int main(int argc, char *argv[])
+{
+
+  int c = 0;
+  cl_ocl_init();
+
+  c = getopt_long (argc, argv, shortopts, longopts, NULL);
+
+  if (argc == 1)
+    c = 'n';
+  if (argc == 2 && c < 1 ){
+    c = 'c';
+    optarg = argv[1];
+  }
+
+  do {
+    switch (c)
+    {
+      case 'c':
+        try {
+          UTest::run(optarg);
+        }
+        catch (Exception e){
+          std::cout << "  " << e.what() << "    [SUCCESS]" << std::endl;
+        }
+
+        break;
+
+      case 'l':
+        UTest::listAllCases();
+        break;
+
+      case 'a':
+        try {
+          UTest::runAll();
+        }
+        catch (Exception e){
+          std::cout << "  " << e.what() << "    [SUCCESS]" << std::endl;
+        }
+
+        break;
+
+      case 'n':
+        try {
+          UTest::runAllNoIssue();
+        }
+        catch (Exception e){
+          std::cout << "  " << e.what() << "    [SUCCESS]" << std::endl;
+        }
+
+        break;
+
+      case 'h':
+      default:
+        usage();
+        exit(1);
+    }
+  } while ((c = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1);
+
+  cl_ocl_destroy();
+}
diff --git a/benchmark/enqueue_copy_buf.cpp b/benchmark/enqueue_copy_buf.cpp
new file mode 100644
index 0000000..0d0d4df
--- /dev/null
+++ b/benchmark/enqueue_copy_buf.cpp
@@ -0,0 +1,69 @@
+#include "utests/utest_helper.hpp"
+#include <sys/time.h>
+
+void test_copy_buf(size_t sz, size_t src_off, size_t dst_off, size_t cb)
+{
+  unsigned int i;
+  cl_char* buf0;
+
+  OCL_CREATE_BUFFER(buf[0], 0, sz * sizeof(char), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, sz * sizeof(char), NULL);
+
+  buf0 = (cl_char *)clEnqueueMapBuffer(queue, buf[0], CL_TRUE, CL_MAP_WRITE, 0, sz * sizeof(char), 0, NULL, NULL, NULL);
+
+  for (i=0; i < sz; i++) {
+    buf0[i]=(rand() & 0xFF);
+  }
+
+  clEnqueueUnmapMemObject(queue, buf[0], buf0, 0, NULL, NULL);
+
+  if (src_off + cb > sz || dst_off + cb > sz) {
+  /* Expect Error. */
+    OCL_ASSERT(clEnqueueCopyBuffer(queue, buf[0], buf[1],
+                 src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+    return;
+  }
+
+  OCL_ASSERT(CL_SUCCESS == clEnqueueCopyBuffer(queue, buf[0], buf[1],
+    src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+}
+
+int tim_subtract(struct timeval *y, struct timeval *x, struct timeval *result){
+  if ( x->tv_sec > y->tv_sec )
+    return   -1;
+
+  if ((x->tv_sec == y->tv_sec) && (x->tv_usec > y->tv_usec))
+    return   -1;
+
+  if ( result != NULL){
+    result->tv_sec = ( y->tv_sec - x->tv_sec );
+    result->tv_usec = ( y->tv_usec - x->tv_usec );
+
+    if (result->tv_usec < 0){
+      result->tv_sec --;
+      result->tv_usec += 1000000;
+    }
+  }
+
+  int msec = 1000.0*(y->tv_sec - x->tv_sec) + (y->tv_usec - x->tv_usec)/1000.0;
+  return msec;
+}
+
+
+int enqueue_copy_buf(void)
+{
+  size_t i;
+  const size_t sz = 127 *1023 * 1023;
+  struct timeval start,stop;
+
+  gettimeofday(&start,0);
+
+  for (i=0; i<10; i++) {
+    test_copy_buf(sz, 0, 0, sz);
+  }
+
+  gettimeofday(&stop,0);
+  return tim_subtract(&stop, &start, 0);
+}
+
+MAKE_BENCHMARK_FROM_FUNCTION(enqueue_copy_buf);
diff --git a/docs/Beignet.mdwn b/docs/Beignet.mdwn
new file mode 100644
index 0000000..7e5b730
--- /dev/null
+++ b/docs/Beignet.mdwn
@@ -0,0 +1,230 @@
+Beignet
+=======
+
+Beignet is an open source implementation of the OpenCL specification - a generic
+compute oriented API. This code base contains the code needed to run OpenCL
+programs on Intel GPUs: it defines and implements the OpenCL host functions
+required to initialize the device, create the command queues, the kernels and
+the programs, and run them on the GPU. The code base also contains the compiler
+part of the stack, which lives in `backend/`. For more specific information
+about the compiler, please refer to `backend/README.md`.
+
+News
+----
+[[Beignet project news|Beignet/NEWS]]
+
+Prerequisite
+------------
+
+The project depends on the following external libraries:
+
+- libdrm libraries (libdrm and libdrm\_intel)
+- Various LLVM components
+- If run under an X server, beignet needs Xlib, Xfixes and Xext installed. Otherwise, there is no X11 dependency.
+
+And if you want to work with the standard ICD libOpenCL.so, then you need
+two more packages (the following package names are for Ubuntu):
+
+- ocl-icd-dev
+- ocl-icd-libopencl1
+
+If you don't want to enable ICD, or your system doesn't have ICD OpenCL support,
+you can still link to the beignet OpenCL library. You can find the beignet/libcl.so
+in your system's library installation directories.
+
+Note that the compiler depends on LLVM (the Low-Level Virtual Machine project).
+Right now, the code has been compiled with LLVM 3.3 through 3.5; it will not
+compile with anything older.
+
+[http://llvm.org/releases/](http://llvm.org/releases/)
+
+LLVM 3.3, 3.4 and 3.5 are supported. The recommended LLVM/Clang version is
+currently 3.5, as clang 3.4 has some severe OpenCL-related regressions.
+
+**Note about LLVM 3.4**
+
+* If you want to try Clang/LLVM 3.4, you need to disable terminfo
+(--disable-terminfo); this works around an LLVM 3.4 bug.
+
+Please note that the code has been compiled with GCC 4.6, GCC 4.7 and GCC 4.8. Since the code
+uses fairly recent C++11 features, you may expect problems with older compilers.
+
+How to build and install
+------------------------
+
+The project uses CMake with three profiles:
+
+1. Debug (-g)
+2. RelWithDebInfo (-g with optimizations)
+3. Release (only optimizations)
+
+Basically, from the root directory of the project
+
+`> mkdir build`
+
+`> cd build`
+
+`> cmake ../ # to configure`
+
+CMake will check the dependencies and will complain if it does not find them.
+
+`> make`
+
+CMake builds the backend first. Please refer to
+[[OpenCL Gen Backend|Beignet/Backend]] for the additional backend dependencies.
+
+Once built, the run-time produces a shared object libcl.so which basically
+directly implements the OpenCL API. A set of tests is also produced; they may
+be found in `utests/`.
+
+Simply invoke:
+`> make install`
+
+It installs the following six files to the beignet/ directory, relative to
+your library installation directory:
+- libcl.so
+- libgbeinterp.so
+- libgbe.so
+- ocl\_stdlib.h, ocl\_stdlib.h.pch
+- beignet.bc
+
+If the system supports ICD, it also installs the OCL ICD vendor file to /etc/OpenCL/vendors:
+- intel-beignet.icd
+
+How to run
+----------
+
+After building and installing beignet, you may want to check whether it works on your
+platform. Beignet also produces various tests to ensure the consistency of the compiler
+and the run-time. This small test framework uses a simple C++ registration system to
+register all the unit tests.
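+
+For illustration only (this is not Beignet's actual utest API, and all names
+below are made up), such a registration system typically boils down to static
+C++ objects that add each test to a global list at start-up, which the runner
+then walks:
+
+    #include <functional>
+    #include <iostream>
+    #include <string>
+    #include <vector>
+
+    // Hypothetical global registry of test cases.
+    struct TestCase { std::string name; std::function<void()> fn; };
+    static std::vector<TestCase> &registry() {
+      static std::vector<TestCase> cases;
+      return cases;
+    }
+
+    // Registering a test is just constructing a static object.
+    struct RegisterTest {
+      RegisterTest(const std::string &name, std::function<void()> fn) {
+        registry().push_back({name, fn});
+      }
+    };
+
+    static void compiler_copy_buffer() { /* test body would go here */ }
+    static RegisterTest reg0("compiler_copy_buffer", compiler_copy_buffer);
+
+    int main() {
+      for (const TestCase &t : registry()) {  // run everything that registered itself
+        std::cout << t.name << std::endl;
+        t.fn();
+      }
+      return 0;
+    }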
+
+First, you need to source setenv.sh in the utests/ directory to set some environment
+variables, as below:
+
+`> . setenv.sh`
+
+Then in `utests/`:
+
+`> ./utest_run`
+
+will run all the unit tests one after the other
+
+`> ./utest_run some_unit_test0 some_unit_test1`
+
+will only run `some_unit_test0` and `some_unit_test1` tests
+
+On all supported target platforms, the pass rate should be 100%. If it is not, you may
+need to refer to the "Known Issues" section.
+
+Supported Targets
+-----------------
+
+ * 3rd Generation Intel Core Processors
+ * Intel “Bay Trail” platforms with Intel HD Graphics
+ * 4th Generation Intel Core Processors; these currently need a kernel patch,
+   see the "Known Issues" section below for details.
+
+Known Issues
+------------
+
+* GPU hang issues.
+  To check whether the GPU hung, run dmesg and look for the following message:
+  `[17909.175965] [drm:i915_hangcheck_hung] *ERROR* Hangcheck timer elapsed...`
+  If the message is there, there was a GPU hang. Usually this means something is wrong, as it indicates
+  the OCL kernel hasn't finished for about 6 seconds or even more. If you think the OCL kernel really needs
+  to run that long and you have confidence in it, you can disable the Linux kernel driver's
+  hang-check feature to fix this hang issue. Just invoke the following command on an Ubuntu system:
+
+  `# echo -n 0 > /sys/module/i915/parameters/enable_hangcheck`
+
+  But this command is a little bit dangerous: if your kernel really hangs, the GPU will lock up
+  until you reboot.
+
+* Almost all unit tests fail on Linux kernel 3.15/3.16.
+  There is a known issue in some Linux kernel versions which enable the register whitelist
+  feature but miss some registers required by beignet. The problematic versions are
+  around 3.15 and 3.16, which have commit f0a346b... but not commit c9224f... If that is the case,
+  you can apply c9224f... manually and rebuild the kernel, or just disable the command parser by
+  invoking the following command (using Ubuntu as an example):
+  `# echo 0 > /sys/module/i915/parameters/enable_cmd_parser`
+
+* Some unit test cases, maybe 20 to 30, fail on 4th Generation (HSW) platforms.
+  Support for the 4th Generation Intel Core Processors requires a Linux kernel
+  modification. You need to apply the patch at:
+  [https://01.org/zh/beignet/downloads/linux-kernel-patch-hsw-support](https://01.org/zh/beignet/downloads/linux-kernel-patch-hsw-support)
+
+* Precision issue.
+  Currently Gen does not provide native support for the high-precision math functions
+  required by OpenCL. We provide a software version to achieve high precision,
+  which you can turn on with `export OCL_STRICT_CONFORMANCE=1`.
+  But be careful: this will make your CL kernels run a little longer.
+
+* cl\_khr\_gl\_sharing.
+  This extension depends heavily on mesa support. It seems that mesa will not provide
+  this type of extension, so we may have to hack the mesa source code to support it.
+  This feature used to work with a previous mesa git version, but now it is
+  simply broken.
+
+TODO
+----
+
+In terms of the OpenCL 1.2 spec, beignet is quite complete now. It passes almost
+all of the piglit OpenCL test cases, and the pass rate for the OpenCV test suite
+is also good, at about 99%. There are still some remaining work items, listed below;
+most of them are related to extension support and performance.
+
+- Performance tuning. Some major optimizations still need to be done:
+  peephole optimization, converting to structured BBs to leverage Gen's structured
+  instructions, and optimizing the extremely slow software-based sin/cos/... math
+  functions, which are needed because the native math instructions lack the
+  necessary precision. In addition, all code is currently inlined, which increases
+  the icache miss rate significantly. Many other items are partially listed
+  [[here|Beignet/Backend/TODO]].
+
+- Complete cl\_khr\_gl\_sharing support. We lack the implementation of some APIs such
+  as clCreateFromGLBuffer, clCreateFromGLRenderbuffer, clGetGLObjectInfo... Currently,
+  the working APIs are clCreateFromGLTexture and clCreateFromGLTexture2D. We may need to
+  find a graceful way to cooperate with mesa.
+
+- Check that NDRangeKernels can be pushed into _different_ queues from several
+  threads.
+
+- No state tracking at all. One batch buffer is created at each "draw call"
+  (i.e. for each NDRangeKernel). This is really inefficient since some
+  expensive pipe controls are issued for each batch buffer.
+
+- Valgrind reports some leaks in libdrm. It sounds like a false positive, but it
+  has to be checked. The same goes for LLVM; there is one leak there to check.
+
+More generally, everything in the run-time that triggers the "FATAL" macro means
+that something that must be supported is not implemented properly (either it
+does not comply with the standard or it is just missing).
+
+Project repository
+------------------
+Right now, we host our project on freedesktop.org at:
+[http://cgit.freedesktop.org/beignet/](http://cgit.freedesktop.org/beignet/)
+and on Intel's 01.org at:
+[https://01.org/beignet](https://01.org/beignet)
+
+The team
+--------
+The Beignet project was created by Ben Segovia. Since 2013, a team in the
+Intel China OTC graphics group has continued to work on the project.
+The official contact for this project is Zou Nanhai (<nanhai.zou at intel.com>).
+
+How to contribute
+-----------------
+You are always welcome to contribute to this project; you just need to subscribe
+to the beignet mailing list and send patches to it for review.
+The official mailing list is:
+[http://lists.freedesktop.org/mailman/listinfo/beignet](http://lists.freedesktop.org/mailman/listinfo/beignet)
+
+Documents for OpenCL application developers
+-------------------------------------------
+- [[Cross compile|Beignet/howto/cross-compiler-howto]]
+- [[Kernel Optimization Guide|Beignet/optimization-guide]]
+
+The wiki URL is:
+[http://www.freedesktop.org/wiki/Software/Beignet/](http://www.freedesktop.org/wiki/Software/Beignet/)
diff --git a/docs/Beignet/Backend.mdwn b/docs/Beignet/Backend.mdwn
new file mode 100644
index 0000000..319ce81
--- /dev/null
+++ b/docs/Beignet/Backend.mdwn
@@ -0,0 +1,96 @@
+Beignet Compiler
+================
+
+This code base contains the compiler part of the Beignet OpenCL stack. The
+compiler is responsible for taking an OpenCL source string and compiling it into
+a binary that can be executed on Intel integrated GPUs.
+
+Limitations
+-----------
+
+Today, the compiler is far from complete. See [[here|Backend/TODO]] for an
+(incomplete) list of things to do.
+
+Interface with the run-time
+---------------------------
+
+Even though the compiler makes very liberal use of C++ (templates, variadic
+templates, macros), we tried hard to make a very simple interface with
+the run-time. The interface is therefore a pure C99 interface and it is defined
+in `src/backend/program.h`.
+
+The goal is to hide the complexity of the inner data structures and to enable
+simple run-time implementation using straightforward C99.
+
+Note that the data structures are fully opaque: this allows us to use both the
+C++ simulator or the real Gen program in a relatively non-intrusive way.
+
+Various environment variables
+-----------------------------
+
+Environment variables are used all over the code. Most important ones are:
+
+- `OCL_STRICT_CONFORMANCE` `(0 or 1)`. Gen does not provide native high-precision
+  math instructions compliant with the OpenCL spec, so we provide a software
+  version to meet the high-precision requirement. Obviously the software
+  version's performance is not as good as the native version supported by the
+  Gen hardware. Moreover, most graphics applications don't need this high
+  precision, so we chose 0 as the default value; OpenCL apps then do not suffer
+  the performance penalty of the high-precision math functions.
+
+- `OCL_SIMD_WIDTH` `(8 or 16)`. Select the number of lanes per hardware thread.
+  Normally you don't need to set it; we select a suitable SIMD width for
+  a given kernel. The default value is 16.
+
+- `OCL_OUTPUT_GEN_IR` `(0 or 1)`. Output Gen IR (scalar intermediate
+  representation) code
+
+- `OCL_OUTPUT_LLVM` `(0 or 1)`. Output LLVM code after the lowering passes
+
+- `OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS` `(0 or 1)`. Output LLVM code before the
+  lowering passes
+
+- `OCL_OUTPUT_ASM` `(0 or 1)`. Output Gen ISA
+
+- `OCL_OUTPUT_REG_ALLOC` `(0 or 1)`. Output Gen register allocations, including
+  virtual register to physical register mapping, live ranges.
+
+- `OCL_OUTPUT_BUILD_LOG` `(0 or 1)`. Output error messages if there is any
+  during CL kernel compiling and linking.
+
+- `OCL_OUTPUT_CFG` `(0 or 1)`. Output control flow graph in .dot file.
+
+- `OCL_OUTPUT_CFG_ONLY` `(0 or 1)`. Output control flow graph in .dot file,
+  but without instructions in each BasicBlock.
+
+- `OCL_PRE_ALLOC_INSN_SCHEDULE` `(0 or 1)`. The instruction scheduler in
+  beignet is currently split into two passes: before and after register
+  allocation. The pre-alloc scheduler tends to decrease register pressure.
+  This variable is used to enable/disable the pre-alloc scheduler. This pass is
+  currently disabled because of some bugs.
+
+- `OCL_POST_ALLOC_INSN_SCHEDULE` `(0 or 1)`. Enable/disable the post-alloc
+  instruction scheduler. The post-alloc scheduler tends to reduce instruction
+  latency. It is enabled by default.
+
+- `OCL_SIMD16_SPILL_THRESHOLD` `(0 to 256)`. Tune how many registers can be
+  spilled under SIMD16. The default value is 16. We found that spilling too many
+  registers under SIMD16 is worse than falling back to SIMD8 mode, so this
+  variable controls the number of registers that may be spilled under SIMD16.
+
+- `OCL_USE_PCH` `(0 or 1)`. The default value is 1. If it is enabled, we use
+  a precompiled header file which includes all the basic OpenCL headers. This
+  reduces the compile time.
+
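+These switches are plain environment variables, so they can be set per run, for
+example `OCL_OUTPUT_ASM=1 ./utest_run`. As a purely illustrative sketch (not the
+code Beignet actually uses), reading a boolean flag of this kind needs nothing
+more than `std::getenv`:
+
+    #include <cstdlib>
+    #include <cstring>
+
+    // Return true when the variable is set to a non-zero value, 'def' otherwise.
+    // Illustration only: the variable name is chosen by the caller.
+    static bool readBoolEnv(const char *name, bool def) {
+      const char *value = std::getenv(name);
+      if (value == NULL || *value == '\0') return def;
+      return std::strcmp(value, "0") != 0;
+    }
+
+    int main() {
+      const bool outputAsm = readBoolEnv("OCL_OUTPUT_ASM", false);
+      return outputAsm ? 0 : 1;  // dummy use of the flag
+    }
+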
+Implementation details
+----------------------
+
+Several key decisions may use the hardware in an unusual way. See the following
+documents for the technical details of the compiler implementation:
+
+- [[Mixed buffer pointer|mixed_buffer_pointer]]
+- [[Unstructured branches|unstructured_branches]]
+- [[Scalar intermediate representation|gen_ir]]
+- [[Clean backend implementation|compiler_backend]]
+
+Ben Segovia.
diff --git a/docs/Beignet/Backend/TODO.mdwn b/docs/Beignet/Backend/TODO.mdwn
new file mode 100644
index 0000000..501c508
--- /dev/null
+++ b/docs/Beignet/Backend/TODO.mdwn
@@ -0,0 +1,110 @@
+TODO
+====
+
+The compiler is quite complete now in terms of functionality. It can pass
+almost all of the piglit OCL test cases, and the pass rate for the OpenCV test
+suite is also quite good now. But there is still plenty to do for the
+final performance tuning.
+
+OpenCL standard library
+-----------------------
+
+Today we define the OpenCL standard library in the header file `src/ocl_stdlib.h`.
+
+By the way, one question remains: do we want to implement
+the high-precision functions as _inline_ functions or as external functions to
+call? Indeed, inlining all functions may lead to severe code bloat, while
+calling external functions requires implementing a proper ABI. We certainly want
+to do both, actually.
+
+LLVM front-end
+--------------
+
+The code is defined in `src/llvm`.  We use SPIR and the OpenCL profile
+to compile the code, so a good part of the job is already done. However,
+many things still must be implemented:
+
+- Better resolution of PHI functions. Today, we always generate MOV
+  instructions at the end of each basic block; they can easily be optimized.
+
+- From LLVM 3.3 on, we use the SPIR IR. We need to use compiler-defined types to
+  represent sampler\_t/image2d\_t/image1d\_t/....
+
+- Consider using libclc in our project and avoid using the PCH, which is not
+  compatible across different clang versions. We may also contribute what we have
+  done in ocl\_stdlib.h to libclc, if possible.
+
+- Optimize math functions. If the native math instructions don't comply with the
+  OCL spec, we implement those math functions in pure software, which is
+  extremely slow; for example, cos and sin on the HD4000 platform are very slow.
+  Some applications may not need such highly accurate results, so we may
+  provide a mechanism to use the native\_xxx functions instead of the extremely
+  slow versions.
+
+Gen IR
+------
+
+The code is defined in `src/ir`. Main things to do are:
+
+- Convert unstructured BBs to a structured format and leverage Gen's structured
+  instructions such as if/else/endif to encode those BBs. Then we can save many of
+  the instructions currently used to maintain software per-channel IPs and predication.
+
+- Implement llvm.memset/llvm.memcpy more efficiently. Currently, we lower
+  them to normal memcpy at the LLVM module level without considering that these
+  intrinsics all have a constant data length.
+
+- Finishing the handling of function arguments (see the [[IR
+  description|gen_ir]] for more details)
+
+- Merging of independent uniform loads (and samples). This is a major
+  performance improvement once the uniform analysis is done. Basically, several
+  uniform loads may be collapsed into one load if no write happens in between.
+  This will obviously impact both instruction selection and the register
+  allocation.
+
+- Implement a fast path for small local variables. When a kernel only defines
+  a small local array/variable, there is a good chance we can allocate the local
+  array/variable in register space rather than system memory. This avoids a
+  lot of loads/stores from and to system memory.
+
+Backend
+-------
+
+The code is defined in `src/backend`. Main things to do are:
+
+- Optimize register spilling (see the [[compiler backend description|compiler_backend]] for more details)
+
+- Implementing proper instruction selection. A "simple" tree matching algorithm
+  should provide good results for Gen
+
+- Improving the instruction scheduling pass. We need to implement proper
+  pre-register-allocation scheduling to lower register pressure.
+
+- Reduce the macro instructions in gen\_context. The macro instructions added in
+  gen\_context do not get a chance to benefit from post-register-allocation scheduling.
+
+- Leverage the structured if/endif instructions for branch processing.
+
+- Peephole optimization. There are many chances to do further peephole optimization.
+
+General plumbing
+----------------
+
+I tried to keep the code clean, well, as far as C++ can be really clean. There
+are some header cleaning steps required though, in particular in the backend
+code.
+
+The context used in the IR code generation (see `src/ir/context.*pp`) should be
+split up and cleaned up too.
+
+I also purely and simply copied and pasted the Gen ISA disassembler from Mesa.
+This leads to code duplication. Also some messages used by OpenCL (untyped reads
+and writes) are not properly decoded yet.
+
+All of the code that should be improved or cleaned up is tracked with "XXX"
+comments in the code.
+
+Parts of the code leak memory when exceptions are used. There are some pointers
+to track and replace with std::unique\_ptr. Note that we also added a custom memory
+debugger that nicely complements Valgrind (i.e. it is fast).
diff --git a/docs/Beignet/Backend/compiler_backend.mdwn b/docs/Beignet/Backend/compiler_backend.mdwn
new file mode 100644
index 0000000..3c489b2
--- /dev/null
+++ b/docs/Beignet/Backend/compiler_backend.mdwn
@@ -0,0 +1,118 @@
+Compiler Back End
+=================
+
+Well, the complete code base is somehow a compiler backend for LLVM. Here, we
+really speak about the final code generation passes that you may find in
+`src/backend`.
+
+As explained in [[the scalar IR presentation|gen_ir]], we bet on a very
+simple scalar IR to make it easy to parse and modify. The idea is to handle the
+unrelated, very Gen-specific problems where we can, i.e. when the final code is
+generated.
+
+The code generation in the compiler backend is classically divided into four
+steps:
+
+- Instruction selection (defined in `src/backend/gen_insn_selection.*pp`). We
+  expose an interface for the instruction selection engine. We implemented a
+  very simple selection (called `SimpleSelection`) that does a quick and dirty
+  one-to-many instruction generation.
+
+- Register allocation (defined in `src/backend/gen_reg_allocation.*pp`). The
+  code implements a linear scan allocator on the code selected in the previous
+  pass. See below for more details about register vector allocations.
+
+- Instruction scheduling. This one is not fully done yet; we just output the
+  instructions in program order. Note that we plan to implement adaptive
+  scheduling between register allocation and instruction selection (to
+  avoid spilling as much as possible).
+
+- Instruction encoding. This is the final step that encodes the program into Gen
+  ISA.
+
+Instruction selection
+---------------------
+
+Usually, the instruction selection consists in mapping `p` instructions to `q`
+ISA instructions under a cost driven model. Each basic block is therefore _tiled_
+into some numbers of groups of ISA instructions such that the final cost is
+minimized.
+
+The literature is particularly dense on the subject. Today, compilers usually use
+either tree matching methods or selection DAG techniques (as LLVM backends do).
+
+The instruction selection is still a work in progress in our compiler, and we
+only implement the most stupid (and inefficient) technique: we simply generate
+as many instructions as we need for each _individual_ IR instruction. Since we
+do not support immediate sources, this in particular leads to really ugly
+looking code such as `mov (16) r2:f 1.f`.
+
+Other than that, the instruction selection is really a book-keeping structure.
+We basically output `SelectionInstruction` objects which map 1-to-1 to the
+Gen ISA encoding functions defined in `src/backend/gen_encoder.*pp`.
+
+However, the `SelectionInstruction`s still use unallocated virtual registers and
+do *not* use vectors, but simply tuples of virtual registers.
+
+Register allocation
+-------------------
+
+The register allocation actually consists in two steps:
+
+1. Handling the vector for all the instructions that require them
+
+2. Performing the register allocation itself
+
+Step 1 consists in scanning all the vectors required by sends. Obviously, the
+same register may be used in different vectors, and that may lead to
+interferences. We simply sort the vectors from the largest to the smallest and
+allocate them in that order. As an optimization, we also identify sub-vectors,
+i.e. vectors included in larger ones, and do not allocate them.
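+
+A minimal sketch of that ordering step (simplified; this is not the actual
+`GenRegAllocator` code): sort the send vectors by decreasing size, then skip any
+vector whose registers already appear contiguously inside an earlier, larger one.
+
+    #include <algorithm>
+    #include <cstdint>
+    #include <vector>
+
+    typedef uint32_t Reg;                 // virtual register id
+    typedef std::vector<Reg> SendVector;  // registers a send needs contiguously
+
+    // True if 'small' appears as a contiguous sub-sequence of 'big'.
+    static bool isSubVector(const SendVector &small, const SendVector &big) {
+      if (small.size() > big.size()) return false;
+      return std::search(big.begin(), big.end(),
+                         small.begin(), small.end()) != big.end();
+    }
+
+    // Keep only the vectors that need their own contiguous allocation:
+    // largest first, sub-vectors dropped.
+    static std::vector<SendVector> vectorsToAllocate(std::vector<SendVector> v) {
+      std::sort(v.begin(), v.end(),
+                [](const SendVector &a, const SendVector &b) {
+                  return a.size() > b.size();
+                });
+      std::vector<SendVector> out;
+      for (const SendVector &cur : v) {
+        bool covered = false;
+        for (const SendVector &big : out)
+          if (isSubVector(cur, big)) { covered = true; break; }
+        if (!covered) out.push_back(cur);
+      }
+      return out;
+    }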
+
+The code may be largely improved in particular if we take into account liveness
+interferences as well. Basically, a register may be part of several vectors if the
+registers that are not in both vectors at the same location are not alive at the
+same time.
+
+This is still a work in progress. Code is right now handled by method
+`GenRegAllocator::allocateVector`.
+
+Step 2 performs the register allocation itself, i.e. it associates each virtual
+register with one (or several) physical registers. The first thing to note is that
+the Gen register file is very flexible, i.e. it can (almost) be freely partitioned.
+To handle this peculiarity, we simply implemented a free-list based generic memory
+allocator, `RegisterFilePartitioner`, in `src/backend/context.cpp`.
+
+We provide two directions of memory allocation: the tail-to-head direction is
+used for normal registers, and head-to-tail is used for the curbe payload register
+allocation.
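+
+A much simplified sketch of the idea (a plain bump allocator from both ends, not
+the real free-list based `RegisterFilePartitioner`):
+
+    #include <cassert>
+    #include <cstdint>
+
+    // Toy partitioner for a register file of 'size' bytes: the curbe payload is
+    // carved from the head, ordinary registers from the tail. Unlike the real
+    // implementation, nothing can be freed here.
+    class ToyRegisterFilePartitioner {
+    public:
+      explicit ToyRegisterFilePartitioner(uint32_t size) : head(0), tail(size) {}
+      // Allocate from head to tail (curbe payload side).
+      uint32_t allocateFromHead(uint32_t sz) {
+        assert(sz <= tail - head);
+        const uint32_t offset = head;
+        head += sz;
+        return offset;
+      }
+      // Allocate from tail to head (normal register side).
+      uint32_t allocateFromTail(uint32_t sz) {
+        assert(sz <= tail - head);
+        tail -= sz;
+        return tail;
+      }
+    private:
+      uint32_t head, tail;
+    };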
+
+We then simply implemented a linear scan allocator (see
+`gen_reg_allocation.cpp`). The spilling is implemented in the same file. The
+heuristic we use is the register's end point: we always try to spill the
+register with the largest liveness end point if possible. Although Gen supports
+spilling 4 SIMD8 registers at once, we only support one currently; this needs to
+be optimized later, at least for spilling vectors. A new pass in the backend
+that finds opportunities to gather more spilled registers into one contiguous area
+may also be worth doing. We could also consider the spilled registers' intervals to
+do smarter scratch memory allocation and reduce the scratch memory requirement.
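+
+For illustration, picking a spill candidate with that heuristic is just a scan
+over the currently active intervals (simplified, not the actual allocator code):
+
+    #include <cstdint>
+    #include <vector>
+
+    struct Interval {
+      uint32_t reg;    // virtual register id
+      uint32_t start;  // first program point where the register is live
+      uint32_t end;    // last program point where the register is live
+    };
+
+    // Index of the active interval with the largest end point, i.e. the register
+    // we would rather spill, or -1 if there is nothing to spill.
+    static int pickSpillCandidate(const std::vector<Interval> &active) {
+      int best = -1;
+      for (size_t i = 0; i < active.size(); ++i)
+        if (best < 0 || active[i].end > active[size_t(best)].end)
+          best = int(i);
+      return best;
+    }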
+
+Instruction scheduling
+----------------------
+
+Intra-basic-block instruction scheduling is relatively simple. It is implemented
+but has a known bug; further effort is needed to fix it.
+
+Instruction encoding
+--------------------
+
+This is mostly done in `src/backend/gen_context.cpp` and
+`src/backend/gen_encoder.*pp`. This is mostly glue code and it is pretty
+straightforward. We just forward the selection code using the physically
+allocated registers. There is nothing special here. Just boilerplate.
+
+There are plenty of huge macro instructions in `gen_context.cpp` currently.
+Most of them are for long/double support on Gen platforms which do not support
+long/double at the hardware level. We may need to clean those up and move the
+non-hardware-related parts into an upper layer. Too many huge macro instructions
+make register spilling and dead code elimination harder and less efficient.
diff --git a/docs/Beignet/Backend/gen_ir.mdwn b/docs/Beignet/Backend/gen_ir.mdwn
new file mode 100644
index 0000000..635cbb4
--- /dev/null
+++ b/docs/Beignet/Backend/gen_ir.mdwn
@@ -0,0 +1,254 @@
+Scalar Intermediate Representation
+==================================
+
+The IR code is included in `src/ir/` of the compiler code base.
+The IR as designed in this compiler is the fruit of a long reflection I mostly
+had with Thomas Raoux. Note that I usually call it "Gen IR".
+
+Scalar vs vector IR
+-------------------
+
+This is actually the major question: do we need a vector IR or a scalar IR? On
+the LLVM side, we have both. LLVM IR can manipulate vectors and scalars (and
+even generalized values, but we can ignore that for now).
+
+For that reason, the Clang front-end generates both scalar and vector code.
+Typically, a `uint4` variable will output a vector of 4 integers. Arithmetic
+computations will be directly done on vector variables.
+
+On the HW side, the situation is completely different:
+
+- We are going to use the parallel mode (align1) i.e. the struct-of-array mode
+  for the EU. This is a SIMD scalar mode.
+
+- The only source of vectors we are going to have is on the sends instructions
+  (and marginally for some other instructions like the div\_rem math instruction)
+
+One may therefore argue that we need vector instructions to handle the sends.
+Sends will indeed require both vector destinations and sources. This may be a
+strong argument *for* vectors in the IR. However, the situation is not that
+good.
+
+Indeed, if we look carefully at the send instructions we see that they will
+require vectors that are *not* vectors in LLVM IR. This code for example:
+
+<code>
+\_\_global uint4 \*src;<br/>
+uint4 x = src[get\_global\_id(0)];<br/>
+</code>
+
+will be translated into an untyped read/write message in the Gen ISA. For the
+write, unfortunately, the address and the values to write end up in the *same*
+message vector. However, LLVM IR will output a store like:
+
+`store(%addr, %value)`
+
+which basically uses one scalar (the address) and one value (the vector to
+write). Therefore even if we handle vectors in the IR, that will not directly
+solve the problem we have at the end for the send instructions.
+
+We therefore decided to go the other direction:
+
+- We have a purely scalar IR
+
+- To replace vectors, we simply use multiple sources and destinations
+
+- Real vectors required by send instructions are handled at the very bottom of
+the stack in the register allocation passes.
+
+This leads to a very simple intermediate representation which is mostly a pure
+scalar RISC machine.
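+
+For illustration only (the real classes in `src/ir/instruction.*pp` are richer
+than this), an instruction in such an IR can be pictured as an opcode plus flat
+lists of scalar destinations and sources; a `uint4` load, for instance, simply
+gets four scalar destinations:
+
+    #include <cstdint>
+    #include <vector>
+
+    typedef uint16_t Register;  // the IR allows up to 65,535 registers per function
+
+    enum Opcode { OP_LOAD, OP_STORE, OP_ADD /* ... */ };
+
+    // Toy picture of a scalar IR instruction: no vector operands, just several
+    // scalar sources and destinations.
+    struct Instruction {
+      Opcode op;
+      std::vector<Register> dst;  // e.g. four destinations for a uint4 load
+      std::vector<Register> src;  // e.g. one address register
+    };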
+
+Very limited IR
+---------------
+
+The other major question, in particular when you look at similar stacks like
+NVidia's SPIR, is:
+
+do we need to encode register modifiers (abs, negate...) and immediate
+registers (like in add.f x y 1.0) in the IR?
+
+Contrary to other IRs (SPIR, and even LLVM, which both support immediates), we
+chose to have a very simple IR, much simpler than the final ISA, and to merge
+back what we need in the instruction selection pass. Since we need instruction
+selection anyway, let us keep the IR simple.
+
+Also, there are a lot of major issues that cannot be covered in the IR and
+need to be handled specifically at the very end of the code generation:
+
+- send vectors (see previous section)
+
+- send headers (value and register allocation) which are also part of the vector
+problem
+
+- SIMD8 mode in SIMD16 code. Some send messages do not support SIMD16 encoding
+and require SIMD8. Typical examples are typed writes, i.e. scatters to textures.
+This cannot reasonably be encoded in a regular scalar IR.
+
+For these reasons, most of the problems directly related to Gen naturally find
+their solutions in either the instruction selection or the register allocator.
+
+This leads to the following strategy:
+
+- Keep the IR very simple and limited
+
+- Use all the analysis tools you need in the IR before the final code generation
+to build any information you need. This is pure "book-keeping".
+
+- Use any previous analysis and finish the job at the very end
+
+This classical approach limits the complexity of the IR while forcing us
+to write the proper tools in the final stages.
+
+Why not using LLVM IR directly?
+-------------------------------
+
+We hesitated a long time between writing a dedicated IR (as we did) and just
+using LLVM IR. Indeed, LLVM comes with a large set of tools that are part of the
+"LLVM backends". LLVM provides a lot of tools to perform the instruction
+selection (`SelectionDAG`) and the register allocation. Two things however
+prevented us from choosing this path:
+
+- We only have a limited experience with LLVM and no experience at all with the
+LLVM backends
+
+- LLVM register allocators do not handle at all the peculiarities of Gen:
+
+  * flexible register file. Gen registers are more like memory than registers
+    and can be freely allocated and aliased. LLVM register allocators only
+    support partial aliasing like x86 machines do (rax -> eax -> ax)
+
+  * no proper tools to handle vectors in the register allocator as we need for
+    sends
+
+Since we would need to do some significant work anyway, this led us to choose a
+more hard-coded path with an in-house IR. Note that this will not prevent us from
+later implementing an LLVM backend "by the book", as Nvidia does today with SPIR
+(using an LLVM backend to do the LLVM IR -> SPIR conversion).
+
+
+SSA or no SSA
+-------------
+
+Since we have a purely scalar IR, implementing an SSA transformation on the IR
+could be convenient. However, most of the literature about compiler back-ends uses
+a non-SSA representation of the code. Since the primary goal is to write a
+compiler _back-end_ (instruction selection, register allocation and instruction
+scheduling), we keep the code in non-SSA form, leaving the higher-level
+optimizations to LLVM.
+
+Types, registers, instructions, functions and units
+---------------------------------------------------
+
+The IR is organized as follows:
+
+- Types (defined in `src/ir/type.*pp`). These are scalar types only. Since the
+  code is completely lowered down, there is no more reference to structures,
+  pointers or vectors. Everything is scalar values and when "vectors" or
+  "structures" would be needed, we use instead multiple scalar sources or
+  destinations.
+
+- Registers (defined in `src/ir/register.*pp`). They are untyped (since Gen
+  registers are untyped) and we have 65,535 of them per function.
+
+- Instructions (defined in `src/ir/instruction.*pp`). They are typed (to
+  distinguish integer and FP adds for example) and possibly support multiple
+  destinations and sources. We also provide a convenient framework to introspect
+  the instruction in a simple (and memory efficient) way
+
+- Functions (defined in `src/ir/function.*pp`). They are basically the counterpart
+  of LLVM functions or OpenCL kernels. Note that function arguments are a
+  problem. We actually use the SPIR ABI. Everything smaller than the machine word
+  size (i.e. 32 bits for Gen) is passed by value in a register. Everything
+  bigger than that is passed by pointer with a ByVal attribute.
+  Note that this requires some special treatment in the IR (see below) to make the
+  code faster by replacing function argument loads with "pushed constants". We
+  also define one "register file" per function, i.e. the registers are defined
+  relative to the function that uses them. Each function is made of basic
+  blocks, i.e. sequences of instructions that are executed linearly.
+
+- Units (defined in `src/ir/unit.*pp`). Units are just a collection of
+  functions and constants (not supported yet).
+
+Function arguments and pushed constants
+---------------------------------------
+
+Gen can push values into the register file i.e. some registers are preset when
+the kernel starts to run. As detailed previously, the SPIR ABI is convenient
+since every argument is either one register or one pointer to load from or to
+store to.
+
+However, when a pointer is used for an argument, loads are issued which may be
+avoided by using constant pushes.
+
+Once again OCL makes the task a bit harder than expected. Indeed, the C
+semantics once again apply to function arguments as well.
+
+Look at these three examples:
+
+### Case 1. Direct loads -> constant push can be used
+
+<code>
+struct foo { int x; int y; }; </br>
+\_\_kernel void case1(\_\_global int \*dst, struct foo bar) </br>
+{<br/>
+  dst[get\_global\_id(0)] = bar.x + bar.y;<br/>
+}
+</code>
+
+We use a _direct_ _load_ for `bar` with `bar.x` and `bar.y`. Values can be
+pushed into registers and we can replace the loads by register reads.
+
+### Case 2. Indirect loads -> we need to load the values from memory
+
+<code>
+struct foo { int x[16]; }; </br>
+\_\_kernel void case1(\_\_global int \*dst, struct foo bar) </br>
+{<br/>
+  dst[get\_global\_id(0)] = bar.x[get\_local\_id(0)];<br/>
+}
+</code>
+
+We use an indirect load with `bar.x[get\_local\_id(0)]`. Here we need to issue a
+load from memory (well, actually, we could do a gather from registers, but it is
+not supported yet).
+
+### Case 3. Writes to arguments -> we need to spill the values to memory first
+
+<code>
+struct foo { int x[16]; }; </br>
+\_\_kernel void case1(\_\_global int \*dst, struct foo bar) </br>
+{<br/>
+bar.x[0] = get\_global\_id(1);<br/>
+  dst[get\_global\_id(0)] = bar.x[get\_local\_id(0)];<br/>
+}
+</code>
+
+Here the values are written before being read. This causes some trouble since
+we are running in SIMD mode. Indeed, we only have *one* instance of the function
+arguments in memory, while *many* SIMD lanes and actually *many* hardware
+threads are running at the same time. This means that we cannot simply write the
+data to memory: we need to allocate a private area for each SIMD lane.
+
+In that case, we need to spill back the function arguments into memory. We spill
+once per SIMD lane. Then, we read from this private area rather than the
+function arguments directly.
+
+This analysis is partially done today in `src/ir/lowering.*pp`. We identify all
+the cases, but only the case with constant pushing is fully implemented.
+Actually, the last two cases are easy to implement, but this requires one or two
+days of work.
+
+Value and liveness analysis tools
+---------------------------------
+
+You may also notice that we provide a complete framework for value analysis
+(i.e. to figure out when a value or instruction destination is used and where the
+instruction sources come from). The code is in `src/ir/value.*pp`. Well, today,
+this code burns a crazy amount of memory (std::set is used all over the
+place) but it at least provides the analysis required by many other passes.
+Compacting the data structures and using O(n) algorithms instead of the O(ln(n))
+ones are on the TODO list for sure :-)
+
+Finally, we also provide a liveness analysis tool which simply figures out which
+registers are alive at the end of each block (classically "live out" sets).
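+
+As a generic illustration of that last step (textbook backward data-flow, not the
+actual Beignet implementation), live-out sets can be computed by iterating
+liveOut(B) = union of liveIn(S) over the successors S of B, with
+liveIn(B) = use(B) plus whatever is in liveOut(B) but not in def(B), until a
+fixed point is reached:
+
+    #include <cstdint>
+    #include <set>
+    #include <vector>
+
+    typedef uint16_t Reg;
+
+    struct Block {
+      std::set<Reg> use, def;         // upward-exposed uses and definitions
+      std::vector<int> successors;    // indices of successor blocks
+      std::set<Reg> liveIn, liveOut;  // results of the analysis
+    };
+
+    // Classic backward liveness: iterate until nothing changes any more.
+    static void computeLiveness(std::vector<Block> &blocks) {
+      bool changed = true;
+      while (changed) {
+        changed = false;
+        for (int i = int(blocks.size()) - 1; i >= 0; --i) {
+          Block &b = blocks[size_t(i)];
+          std::set<Reg> out;
+          for (int s : b.successors)
+            out.insert(blocks[size_t(s)].liveIn.begin(),
+                       blocks[size_t(s)].liveIn.end());
+          std::set<Reg> in = b.use;
+          for (Reg r : out)
+            if (b.def.count(r) == 0) in.insert(r);
+          if (out != b.liveOut || in != b.liveIn) {
+            b.liveOut = out;
+            b.liveIn = in;
+            changed = true;
+          }
+        }
+      }
+    }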
diff --git a/docs/Beignet/Backend/mixed_buffer_pointer.mdwn b/docs/Beignet/Backend/mixed_buffer_pointer.mdwn
new file mode 100644
index 0000000..f43ab7e
--- /dev/null
+++ b/docs/Beignet/Backend/mixed_buffer_pointer.mdwn
@@ -0,0 +1,46 @@
+Mixed Buffer Pointer
+--------------------
+
+Segmented address space...
+--------------------------
+
+The first challenge with OpenCL is its very liberal use of pointers. The memory
+is segmented into several address spaces:
+
+- private. This is the memory for each work item
+
+- global. These are buffers in memory shared by all work items and work groups
+
+- constant. These are constant buffers in memory shared by all work items and
+work groups as well
+
+- local. This is memory shared by all work items in the *same* work group
+
+... But with no restriction inside each address space
+-----------------------------------------------------
+
+The challenge is that there are no restrictions in OpenCL inside each address
+space, i.e. the full C semantics apply, in particular regarding pointer
+arithmetic.
+
+Therefore the following code is valid:
+
+<code>
+\_\_kernel void example(\_\_global int *dst, \_\_global int *src0, \_\_global int *src1)<br/>
+{<br/>
+  \_\_global int *from;<br/>
+  if (get\_global\_id(0) % 2)<br/>
+    from = src0;<br/>
+  else<br/>
+    from = src1;<br/>
+  dst[get\_global\_id(0)] = from[get\_global\_id(0)];<br/>
+}
+</code>
+
+As one may see, the load done in the last line actually mixes pointers from both
+sources src0 and src1. This typically makes the use of binding table indices
+pretty hard. If we use binding table 0 for dst, 1 for src0 and 2 for src1 (for
+example), we are not able to express the load in the last line with one send
+only. The pointer "from" in the last line is a so-called mixed buffer pointer.
+
+(To be updated)
diff --git a/docs/Beignet/Backend/unstructured_branches.mdwn b/docs/Beignet/Backend/unstructured_branches.mdwn
new file mode 100644
index 0000000..37a294c
--- /dev/null
+++ b/docs/Beignet/Backend/unstructured_branches.mdwn
@@ -0,0 +1,271 @@
+Unstructured Branches
+=====================
+
+A major challenge in making an OpenCL compiler is certainly to handle any kind of
+branch. Indeed, LLVM does not make any distinction between structured and unstructured branches.
+See [here](http://llvm.org/docs/LangRef.html) for a complete description of
+the LLVM assembly specification.
+
+The C branching code is simply lowered down in the following instructions:
+
+- `ret` to return from the current function
+- `br` that, if conditional, jumps to one of two destinations (one for the
+   taken branch and one for the other).
+- `switch` that implements the C switch/case construct.
+- `indirectbr` that implements a jump table
+- `invoke` and `resume` mostly used to handle exceptions
+
+Exceptions and jump tables are not supported in OpenCL. Switch cases can be
+lowered down to a sequence of if/else statements (using a divide-and-conquer
+approach, a switch/case can be dispatched in log(n) complexity, where n is the
+number of targets; an illustrative lowering is sketched below).
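+
+For example (an illustrative lowering only, not the compiler's actual output), a
+switch over sorted case values can be dispatched with nested if/else tests in a
+binary-search fashion, so the number of range tests grows as log(n):
+
+    // switch (x) { case 1: ...; case 3: ...; case 5: ...; case 7: ...; }
+    int dispatch(int x) {
+      if (x < 5) {                    // first range test splits the cases in two
+        if (x == 1) return 10;        // case 1
+        else if (x == 3) return 30;   // case 3
+      } else {
+        if (x == 5) return 50;        // case 5
+        else if (x == 7) return 70;   // case 7
+      }
+      return -1;                      // default
+    }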
+
+This leads us to properly implement `br` and `ret` instructions.
+
+Solution 1 - Using Gen structured branches
+------------------------------------------
+
+Gen structured branches are the following instructions:
+
+`if` `else` `endif` `break` `continue` `while` `brd` `brc`
+
+Transforming the LLVM IR code into structured code basically amounts to
+reverse-engineering the LLVM code back into the original C code.
+Unfortunately, there are several key problems:
+
+- OpenCL supports the `goto` keyword, which may jump to an arbitrary location
+- LLVM can transform the control flow graph into any kind of form
+- Worse, a reducible control flow graph can be turned into an irreducible
+one by the optimizer.
+
+This can lead to complicated code transforms and basic block duplication. The
+specification allows the compiler to abort if an irreducible control flow is
+detected, but as an implementor it is quite awkward to abort the compilation
+just because the optimizer turned a reducible CFG into an irreducible one. Using
+structured branches opens the door to many corner cases.
+
+The thing is, there exists a pretty elegant solution that can be almost seamlessly
+supported by Gen. This is the solution we retained.
+
+Solution 2 - Linearizing the control flow graph
+-----------------------------------------------
+
+The general problem is to map a general control flow graph to a SIMD machine.
+The problem is fairly well understood today. A recent research paper dedicated
+to OpenCL-like languages, which use the "SPMD" (single program, multiple data)
+programming model, presents interesting insights about how to map such
+languages to SIMD architectures (see
+[here](http://www.cdl.uni-saarland.de/papers/karrenberg_opencl.pdf)).
+
+### Core idea
+
+- Linearizing the CFG initially consists in removing all forward branches and
+"replacing" them with predication. Indeed, the program will still be correct if
+you predicate instructions instead of taking forward jumps. This is basically
+a control-flow to data-flow conversion.
+
+- Of course, removing all forward branches is inefficient. To improve that, we
+simply introduce "if conditions" at the head of each basic block to know whether
+we run the basic block. If no lane is going to be activated in the basic block, we
+jump to another basic block where _potentially_ some lanes are going to be
+reactivated.
+
+Consider the following CFG:
+
+<pre>
+o-------o
+|       |
+|   1   |---->-----o
+|       |          |
+o-------o          |
+    |              |
+    |              |
+o-------o          |
+|       |          |
+|   2   |---->-----------o
+|       |          |     |
+o-------o          |     |
+    |              |     |
+    |              |     |
+    | o------o     |     |
+    | |      |     |     |
+    | v      |     |     |
+o-------o    |     |     |
+|       |    |     |     |
+|   3   |    |     |     |
+|       |    |     |     |
+o-------o    |     |     |
+    | |      |     |     |
+    | o------o     |     |
+    |              |     |
+o-------o          |     |
+|       |          |     |
+|   4   |<---------o     |
+|       |                |
+o-------o                |
+    |                    |
+    |                    |
+o-------o                |
+|       |                |
+|   5   |<----------------o
+|       |
+o-------o
+</pre>
+
+Mapping it to a SIMD machine may seem challenging. Actually it is not too
+complicated. The problem is with the 2->5 jump. Indeed, we have to be sure that
+we are not missing any computation done in block 4.
+
+To do so:
+- Instead of jumping from block 2 to block 5, we jump from block 2 to block 4.
+- We implement a `JOIN` point on top of block 4. We check if any lane is going
+to be reactivated for block 4. If not, we jump to block 5.
+
+This leads to the following linearized CFG:
+<pre>
+o-------o
+|       |
+|   1   |---->-----o
+|       |          |
+o-------o          |
+    |              |
+    |              |
+o-------o          |
+|       |          |
+|   2   |---->-----------o
+|       |          |     |
+o-------o          |     |
+    |              |     |
+    |              |     |
+    | o--<---o     |     |
+    | |      |     |     |
+    | v      |     |     |
+o-------o    |     |     |
+|       |    |     |     |
+|   3   |    ^     |     |
+|       |    |     |     |
+o-------o    |     |     |
+    | |      |     |     |
+    | o-->---o     |     |
+    |              |     |
+o-------o          |     |
+|       |==========|=====|====O
+|   4   |<---------|-----o    |
+|       |<---------o          |
+o-------o                     |
+    |                         |
+    |                         |
+o-------o                     |
+|       |                     |
+|   5   |<====================O
+|       |
+o-------o
+</pre>
+
+There is a new jump from block 4 to block 5.
+
+### Implementation on Gen
+
+When using structured branches, Gen supports auto-masking, i.e. based on which
+branches are taken, the control flow is properly handled and masks are
+automatically applied to all instructions.
+
+However, there is no similar support for unstructured branches. We therefore
+decided to mask instructions manually and use a single program flow. This is
+actually quite easy to do since Gen is able to predicate any branch.
+
+Now, how do we evaluate the if conditions in an efficient way?
+
+Our choice is to use *per-lane block IPs*: we store a short (16 bits) for each
+SIMD lane in a regular 256-bit GPR (general purpose register). This "blockIP"
+register is used in the following way:
+
+At the beginning of each block, we compare the blockIP register with the ID of
+the block. A lane is going to be _activated_ if its blockIP is _smaller_ than
+the ID of the block. Otherwise, the lane is deactivated.
+
+Therefore, we build a flag register at the entry of each basic block with a
+single 16-wide uint16_t compare. If no lane is activated, a jump is performed
+to the next block where some lanes are going to be activated.
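+
+As a minimal sketch (plain C for illustration, not the actual Gen encoding),
+the flag built at the entry of a block can be thought of as follows:
+
+    #define SIMD_WIDTH 16
+
+    /* Bit i of the returned mask is set when lane i runs this block,
+       i.e. when its blockIP is smaller than the block ID. */
+    unsigned short block_entry_mask(const unsigned short blockIP[SIMD_WIDTH],
+                                    unsigned short block_id)
+    {
+        unsigned short mask = 0;
+        for (int lane = 0; lane < SIMD_WIDTH; ++lane)
+            if (blockIP[lane] < block_id)
+                mask |= (unsigned short)(1u << lane);
+        return mask;
+    }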
+
+Since these are regular jumps, we just use the `jmpi` instruction. With the
+help of predication, we can express all the different possibilities (a sketch
+of the three cases follows the list):
+
+- Backward branches are taken if _any_ of the lanes in the predicate is true.
+We just use the `<+f0.0.anyh>` predication.
+- Forward branches are *not* taken if some of the lanes are going to be
+activated in the next block. We therefore compare the blockIPs with the ID of
+the _next_ block. If all of them are strictly greater than the ID of the next
+block, we jump. We use the `<+f0.0.allh>` predicate in that case.
+- `JOIN` points are even simpler. We simply jump if none of the lanes is
+activated. We use the `<-f0.0.anyh>` predicate.
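+
+In plain C terms (a sketch of the decision logic only, not the code that is
+actually generated), the three cases are:
+
+    #define SIMD_WIDTH 16
+
+    /* Backward branch: taken if any lane of the predicate is true
+       (the <+f0.0.anyh> case). */
+    int take_backward(const int pred[SIMD_WIDTH])
+    {
+        for (int i = 0; i < SIMD_WIDTH; ++i)
+            if (pred[i])
+                return 1;
+        return 0;
+    }
+
+    /* Forward branch: taken only if every blockIP is strictly greater than
+       the ID of the next block (the <+f0.0.allh> case). */
+    int take_forward(const unsigned short blockIP[SIMD_WIDTH],
+                     unsigned short next_block_id)
+    {
+        for (int i = 0; i < SIMD_WIDTH; ++i)
+            if (blockIP[i] <= next_block_id)
+                return 0;
+        return 1;
+    }
+
+    /* JOIN point: jump over the block if no lane of its entry predicate
+       is true (the <-f0.0.anyh> case). */
+    int skip_at_join(const int pred[SIMD_WIDTH])
+    {
+        return !take_backward(pred);
+    }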
+
+The complete encoding is done in `src/backend/gen_insn_selection.cpp`. Forward
+branches are handled by `SimpleSelection::emitForwardBranch`. Backward branches
+are handled by `SimpleSelection::emitBackwardBranch`. Finally, since `JOIN`
+points are at the top of each basic block, they are handled by
+`SimpleSelection::emitLabelInstruction`.
+
+### Computing `JOIN` points
+
+The last problem is to compute the `JOIN` points, i.e. at the beginning of each
+block we need to know whether we need to jump and, if we do, what the target of
+the branch is. The code is relatively straightforward and can be found in
+`src/backend/context.cpp`; the function is `Context::buildJIPs`.
+<br/>
+Actually, the current implementation is not that elegant. A colleague, Thomas
+Raoux, has a simpler and better idea to handle it.
+
+### Advantages and drawbacks of the method
+
+- The method has one decisive advantage: it is simple and extremely robust. It
+can handle any kind of CFG (reducible or not) and does not require any
+transformation. The use of shorts is also not accidental: a 16-wide short
+compare is issued in 2 cycles (so it is twice as fast as a 16-wide 32-bit
+compare).
+- The main drawback is performance. Even if it is not so bad, we still need
+more instructions than if we used structured branches. Mostly:
+  * one or two instructions for `JOIN` points
+  * three instructions for backward and forward jumps (two more than structured
+    branches, which just require the branch instruction itself)
+
+Note that all extra instructions are 16-bit instructions (i.e. they use
+shorts), so they only cost 2 cycles anyway.
+
+The last point is that the Gen encoding restricts the conditional modifier and
+the predicate to be the same within an instruction. This requires copying or
+recomputing the flag register for compares and selects, so one more instruction
+is needed for these two instructions. Once again, this only costs 2 cycles.
+
+Remarks on `ret` instructions
+-----------------------------
+
+Since we can handle any kind of CFG, handling return statements is relatively
+straightforward. We first create one return block at the end of the program.
+Then we replace all other returns by an unconditional jump to this block. The
+CFG linearization takes care of the rest. We then simply encode the (only one)
+return instruction as an End-Of-Thread (EOT) message.
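+
+For illustration (a made-up kernel, not taken from the tests listed below),
+consider:
+
+    __kernel void early_exit(__global int *dst, __global const int *src)
+    {
+        int id = get_global_id(0);
+        if (src[id] < 0) {
+            dst[id] = 0;
+            return;   /* rewritten as a jump to the single return block */
+        }
+        dst[id] = src[id] * 2;
+    }
+
+Both the early return and the implicit return at the end of the kernel end up
+in the same return block, and only that block emits the EOT message.
+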
+Code examples
+-------------
+
+Some tests were written to assert the correctness of the CFG linearization and the
+code generation. They can be found in the _run-time_ code base here:
+
+`utest/compiler_if_else.cpp`
+
+`utest/compiler_lower_return0.cpp`
+
+`utest/compiler_lower_return1.cpp`
+
+`utest/compiler_lower_return2.cpp`
+
+`utest/compiler_short_scatter.cpp`
+
+`utest/compiler_unstructured_branch0.cpp`
+
+`utest/compiler_unstructured_branch1.cpp`
+
+`utest/compiler_unstructured_branch2.cpp`
+
+`utest/compiler_unstructured_branch3.cpp`
+
diff --git a/docs/NEWS.mdwn b/docs/NEWS.mdwn
new file mode 100644
index 0000000..1adb48a
--- /dev/null
+++ b/docs/NEWS.mdwn
@@ -0,0 +1,16 @@
+# News
+
+## Sep 15, 2014
+[Beignet 0.9.3](https://01.org/zh/beignet/downloads/beignet-0.9.3-2014-09-15) is released. This is a bug-fix release.
+
+## July 17, 2014
+[Beignet 0.9.2](https://01.org/zh/beignet/downloads/beignet-0.9.2-2014-07-17) is released. This is a bug-fix release.
+
+## July 4, 2014
+[Beignet 0.9.1](https://01.org/zh/beignet/downloads/beignet-0.9.1-2014-07-04) is released. This is a bug-fix release.
+
+## June 26, 2014
+[Beignet 0.9.0](https://01.org/zh/beignet/downloads/beignet-0.9-2014-06-26) is released. This is a major release. Please see the release notes for more information.
+
+## Feb 12, 2014
+[Beignet 0.8.0](https://01.org/zh/beignet/downloads/2014/beignet-0.8.0-2014-02-12) is released. This is a major release. Please see the release notes for more information.
diff --git a/docs/howto/cross-compiler-howto.mdwn b/docs/howto/cross-compiler-howto.mdwn
new file mode 100644
index 0000000..535cd9a
--- /dev/null
+++ b/docs/howto/cross-compiler-howto.mdwn
@@ -0,0 +1,60 @@
+Cross Compiler HowTo
+====================
+
+Beignet supports both PC devices with the full profile and embedded/handheld
+devices with the embedded profile. This document describes how to build Beignet
+and OpenCL kernels for a target machine (embedded/handheld devices) on a
+host machine with the help of a cross compiler, and also how to produce a
+size-reduced Beignet driver package for the target machine.
+
+Build Beignet with a cross compiler
+-----------------------------------
+
+Besides the general cross compile setup, the following options are relevant
+when configuring Beignet with cmake (an example invocation is given after the
+list).
+
+- LLVM_INSTALL_DIR
+  Beignet depends on llvm+clang; this option points to the location of
+  llvm-config, llvm-as, llvm-link and clang in the cross compiler environment.
+
+- CMAKE_SKIP_RPATH
+  Some cross compile systems forbid the use of rpath in binaries/libraries;
+  set this option to TRUE in that case.
+
+- GEN_PCI_ID
+  The GPU pci_id of the target machine; for example, 0x0162 is the pci_id of
+  the Intel Ivybridge GPU and 0x0f31 that of the Intel Baytrail GPU. This
+  information can be queried with the command 'lspci -n'.
+
+- CMAKE_INSTALL_PREFIX
+  This option controls the installation path prefix.
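+
+For example, a configure step could look like this (the toolchain file and the
+paths below are placeholders for your own cross build environment):
+
+    cmake /path/to/beignet/source \
+          -DCMAKE_TOOLCHAIN_FILE=/path/to/your-toolchain.cmake \
+          -DLLVM_INSTALL_DIR=/path/to/cross/llvm/bin \
+          -DCMAKE_SKIP_RPATH=TRUE \
+          -DGEN_PCI_ID=0x0162 \
+          -DCMAKE_INSTALL_PREFIX=/usr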
+
+Distribution of large-size-reduced Beignet driver package
+---------------------------------------------------------
+
+On embedded/handheld devices, storage and memory are scarce, so it is necessary
+to provide only the OpenCL runtime library without the OpenCL compiler; only
+executable binary kernels are supported on such devices.
+
+This means that distributing just libcl.so and libgbeinterp.so (~320k in total
+after stripping) is enough for the OpenCL embedded profile on the target
+machine.
+
+Build OpenCL kernels with OpenCL offline compiler
+-------------------------------------------------
+
+Since the target machine does not contain the OpenCL compiler, the OpenCL
+source kernel needs to be compiled into a binary kernel on the host machine
+with the OpenCL offline compiler (gbe_bin_generater); the OpenCL application
+can then load the binary kernel with the function clCreateProgramWithBinary.
+
+The OpenCL offline compiler gbe_bin_generater is built as part of Beignet and
+is located at .../your_path_to_build/backend/src/gbe_bin_generater; see below
+for the command options.
+
+gbe_bin_generater INFILE [-pbuild_parameter] -oOUTFILE -tGEN_PCI_ID
+
+For example, the following command builds the OpenCL source kernel from the
+file 'mykernel.cl' for Ivybridge with pci_id 0x0162, and writes the result (an
+executable binary kernel) into the file 'mykernel.bin':
+
+gbe_bin_generater mykernel.cl -omykernel.bin -t0x0162
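+
+On the target machine, the application then loads this binary kernel. A minimal
+host-side sketch (error handling omitted and the file reading kept trivial; not
+code shipped with Beignet) could look like:
+
+    #include <CL/cl.h>
+    #include <stdio.h>
+    #include <stdlib.h>
+
+    cl_program load_binary_program(cl_context ctx, cl_device_id dev,
+                                   const char *path)
+    {
+        /* read the whole binary kernel produced by gbe_bin_generater */
+        FILE *f = fopen(path, "rb");
+        fseek(f, 0, SEEK_END);
+        size_t size = (size_t)ftell(f);
+        rewind(f);
+        unsigned char *bin = malloc(size);
+        fread(bin, 1, size, f);
+        fclose(f);
+
+        cl_int binary_status, err;
+        cl_program prog = clCreateProgramWithBinary(ctx, 1, &dev, &size,
+                              (const unsigned char **)&bin,
+                              &binary_status, &err);
+        /* a build step is still required to make the program executable */
+        clBuildProgram(prog, 1, &dev, NULL, NULL, NULL);
+        free(bin);
+        return prog;
+    }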
diff --git a/docs/optimization-guide.mdwn b/docs/optimization-guide.mdwn
new file mode 100644
index 0000000..8fb29a6
--- /dev/null
+++ b/docs/optimization-guide.mdwn
@@ -0,0 +1,28 @@
+Optimization Guide
+====================
+
+All the usual SIMD optimization principles also apply to Beignet optimization.  
+Furthermore, there are some special tips for Beignet optimization.
+
+1. It is recommended to choose a work group size that is a multiple of 16. Too much SLM usage may reduce parallelism at the group level.  
+   If the kernel uses a large amount of SLM, it is better to choose a large work group size. Please refer to the following table for
+   recommended work group sizes at various SLM usages.
+
+   | Amount of SLM  | 0  | 4K | 8K  | 16K | 32K |
+   |----------------|----|----|-----|-----|-----|
+   | WorkGroup size | 16 | 64 | 128 | 256 | 512 |
+
+2. GEN7's reads/writes on global memory with DWORD and DWORD4 are significantly faster than reads/writes on BYTE/WORD.  
+   Use DWORD or DWORD4 to access data in global memory if possible. If you cannot avoid byte/word accesses, try to do them on SLM.
+
+3. Use float data type as much as possible.
+
+4. Avoid using long. GEN7's performance for long integers is poor.
+
+5. If there is a small constant buffer, define it in the kernel instead of passing it as a constant buffer argument if possible.  
+   The compiler may optimize it if the buffer is defined inside the kernel.
+
+6. Avoid unnecessary synchronization, both in the runtime and in the kernel. For example, clFinish and clWaitForEvents in the runtime  
+   and barrier() in the kernel.
+
+7. Consider the native versions of math built-ins, such as native\_sin and native\_cos, if your kernel is not precision sensitive.
+
+8. Try to eliminate branching as much as possible, for example by using the min, max, clamp or select built-ins instead of if/else where possible (see the example below).
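+
+As an illustration of point 8, a branchy kernel such as the following made-up
+example (not shipped with Beignet)
+
+    __kernel void relu_branch(__global float *dst, __global const float *src)
+    {
+        int id = get_global_id(0);
+        if (src[id] > 0.0f)
+            dst[id] = src[id];
+        else
+            dst[id] = 0.0f;
+    }
+
+can be rewritten without any branch by using a built-in:
+
+    __kernel void relu_builtin(__global float *dst, __global const float *src)
+    {
+        int id = get_global_id(0);
+        dst[id] = max(src[id], 0.0f);  /* select() or clamp() would also work */
+    }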
diff --git a/include/CL/cl.h b/include/CL/cl.h
new file mode 100644
index 0000000..316565d
--- /dev/null
+++ b/include/CL/cl.h
@@ -0,0 +1,1214 @@
+/*******************************************************************************
+ * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_H
+#define __OPENCL_CL_H
+
+#ifdef __APPLE__
+#include <OpenCL/cl_platform.h>
+#else
+#include <CL/cl_platform.h>
+#endif	
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************/
+
+typedef struct _cl_platform_id *    cl_platform_id;
+typedef struct _cl_device_id *      cl_device_id;
+typedef struct _cl_context *        cl_context;
+typedef struct _cl_command_queue *  cl_command_queue;
+typedef struct _cl_mem *            cl_mem;
+typedef struct _cl_program *        cl_program;
+typedef struct _cl_kernel *         cl_kernel;
+typedef struct _cl_event *          cl_event;
+typedef struct _cl_sampler *        cl_sampler;
+
+typedef cl_uint             cl_bool;                     /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */ 
+typedef cl_ulong            cl_bitfield;
+typedef cl_bitfield         cl_device_type;
+typedef cl_uint             cl_platform_info;
+typedef cl_uint             cl_device_info;
+typedef cl_bitfield         cl_device_fp_config;
+typedef cl_uint             cl_device_mem_cache_type;
+typedef cl_uint             cl_device_local_mem_type;
+typedef cl_bitfield         cl_device_exec_capabilities;
+typedef cl_bitfield         cl_command_queue_properties;
+typedef intptr_t            cl_device_partition_property;
+typedef cl_bitfield         cl_device_affinity_domain;
+
+typedef intptr_t            cl_context_properties;
+typedef cl_uint             cl_context_info;
+typedef cl_uint             cl_command_queue_info;
+typedef cl_uint             cl_channel_order;
+typedef cl_uint             cl_channel_type;
+typedef cl_bitfield         cl_mem_flags;
+typedef cl_uint             cl_mem_object_type;
+typedef cl_uint             cl_mem_info;
+typedef cl_bitfield         cl_mem_migration_flags;
+typedef cl_uint             cl_image_info;
+typedef cl_uint             cl_buffer_create_type;
+typedef cl_uint             cl_addressing_mode;
+typedef cl_uint             cl_filter_mode;
+typedef cl_uint             cl_sampler_info;
+typedef cl_bitfield         cl_map_flags;
+typedef cl_uint             cl_program_info;
+typedef cl_uint             cl_program_build_info;
+typedef cl_uint             cl_program_binary_type;
+typedef cl_int              cl_build_status;
+typedef cl_uint             cl_kernel_info;
+typedef cl_uint             cl_kernel_arg_info;
+typedef cl_uint             cl_kernel_arg_address_qualifier;
+typedef cl_uint             cl_kernel_arg_access_qualifier;
+typedef cl_bitfield         cl_kernel_arg_type_qualifier;
+typedef cl_uint             cl_kernel_work_group_info;
+typedef cl_uint             cl_event_info;
+typedef cl_uint             cl_command_type;
+typedef cl_uint             cl_profiling_info;
+
+
+typedef struct _cl_image_format {
+    cl_channel_order        image_channel_order;
+    cl_channel_type         image_channel_data_type;
+} cl_image_format;
+
+typedef struct _cl_image_desc {
+    cl_mem_object_type      image_type;
+    size_t                  image_width;
+    size_t                  image_height;
+    size_t                  image_depth;
+    size_t                  image_array_size;
+    size_t                  image_row_pitch;
+    size_t                  image_slice_pitch;
+    cl_uint                 num_mip_levels;
+    cl_uint                 num_samples;
+    cl_mem                  buffer;
+} cl_image_desc;
+
+typedef struct _cl_buffer_region {
+    size_t                  origin;
+    size_t                  size;
+} cl_buffer_region;
+
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_SUCCESS                                  0
+#define CL_DEVICE_NOT_FOUND                         -1
+#define CL_DEVICE_NOT_AVAILABLE                     -2
+#define CL_COMPILER_NOT_AVAILABLE                   -3
+#define CL_MEM_OBJECT_ALLOCATION_FAILURE            -4
+#define CL_OUT_OF_RESOURCES                         -5
+#define CL_OUT_OF_HOST_MEMORY                       -6
+#define CL_PROFILING_INFO_NOT_AVAILABLE             -7
+#define CL_MEM_COPY_OVERLAP                         -8
+#define CL_IMAGE_FORMAT_MISMATCH                    -9
+#define CL_IMAGE_FORMAT_NOT_SUPPORTED               -10
+#define CL_BUILD_PROGRAM_FAILURE                    -11
+#define CL_MAP_FAILURE                              -12
+#define CL_MISALIGNED_SUB_BUFFER_OFFSET             -13
+#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
+#define CL_COMPILE_PROGRAM_FAILURE                  -15
+#define CL_LINKER_NOT_AVAILABLE                     -16
+#define CL_LINK_PROGRAM_FAILURE                     -17
+#define CL_DEVICE_PARTITION_FAILED                  -18
+#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE            -19
+
+#define CL_INVALID_VALUE                            -30
+#define CL_INVALID_DEVICE_TYPE                      -31
+#define CL_INVALID_PLATFORM                         -32
+#define CL_INVALID_DEVICE                           -33
+#define CL_INVALID_CONTEXT                          -34
+#define CL_INVALID_QUEUE_PROPERTIES                 -35
+#define CL_INVALID_COMMAND_QUEUE                    -36
+#define CL_INVALID_HOST_PTR                         -37
+#define CL_INVALID_MEM_OBJECT                       -38
+#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR          -39
+#define CL_INVALID_IMAGE_SIZE                       -40
+#define CL_INVALID_SAMPLER                          -41
+#define CL_INVALID_BINARY                           -42
+#define CL_INVALID_BUILD_OPTIONS                    -43
+#define CL_INVALID_PROGRAM                          -44
+#define CL_INVALID_PROGRAM_EXECUTABLE               -45
+#define CL_INVALID_KERNEL_NAME                      -46
+#define CL_INVALID_KERNEL_DEFINITION                -47
+#define CL_INVALID_KERNEL                           -48
+#define CL_INVALID_ARG_INDEX                        -49
+#define CL_INVALID_ARG_VALUE                        -50
+#define CL_INVALID_ARG_SIZE                         -51
+#define CL_INVALID_KERNEL_ARGS                      -52
+#define CL_INVALID_WORK_DIMENSION                   -53
+#define CL_INVALID_WORK_GROUP_SIZE                  -54
+#define CL_INVALID_WORK_ITEM_SIZE                   -55
+#define CL_INVALID_GLOBAL_OFFSET                    -56
+#define CL_INVALID_EVENT_WAIT_LIST                  -57
+#define CL_INVALID_EVENT                            -58
+#define CL_INVALID_OPERATION                        -59
+#define CL_INVALID_GL_OBJECT                        -60
+#define CL_INVALID_BUFFER_SIZE                      -61
+#define CL_INVALID_MIP_LEVEL                        -62
+#define CL_INVALID_GLOBAL_WORK_SIZE                 -63
+#define CL_INVALID_PROPERTY                         -64
+#define CL_INVALID_IMAGE_DESCRIPTOR                 -65
+#define CL_INVALID_COMPILER_OPTIONS                 -66
+#define CL_INVALID_LINKER_OPTIONS                   -67
+#define CL_INVALID_DEVICE_PARTITION_COUNT           -68
+
+/* OpenCL Version */
+#define CL_VERSION_1_0                              1
+#define CL_VERSION_1_1                              1
+#define CL_VERSION_1_2                              1
+
+/* cl_bool */
+#define CL_FALSE                                    0
+#define CL_TRUE                                     1
+#define CL_BLOCKING                                 CL_TRUE
+#define CL_NON_BLOCKING                             CL_FALSE
+
+/* cl_platform_info */
+#define CL_PLATFORM_PROFILE                         0x0900
+#define CL_PLATFORM_VERSION                         0x0901
+#define CL_PLATFORM_NAME                            0x0902
+#define CL_PLATFORM_VENDOR                          0x0903
+#define CL_PLATFORM_EXTENSIONS                      0x0904
+
+/* cl_device_type - bitfield */
+#define CL_DEVICE_TYPE_DEFAULT                      (1 << 0)
+#define CL_DEVICE_TYPE_CPU                          (1 << 1)
+#define CL_DEVICE_TYPE_GPU                          (1 << 2)
+#define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
+#define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)
+#define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
+
+/* cl_device_info */
+#define CL_DEVICE_TYPE                              0x1000
+#define CL_DEVICE_VENDOR_ID                         0x1001
+#define CL_DEVICE_MAX_COMPUTE_UNITS                 0x1002
+#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS          0x1003
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE               0x1004
+#define CL_DEVICE_MAX_WORK_ITEM_SIZES               0x1005
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR       0x1006
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT      0x1007
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT        0x1008
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG       0x1009
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT      0x100A
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE     0x100B
+#define CL_DEVICE_MAX_CLOCK_FREQUENCY               0x100C
+#define CL_DEVICE_ADDRESS_BITS                      0x100D
+#define CL_DEVICE_MAX_READ_IMAGE_ARGS               0x100E
+#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS              0x100F
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE                0x1010
+#define CL_DEVICE_IMAGE2D_MAX_WIDTH                 0x1011
+#define CL_DEVICE_IMAGE2D_MAX_HEIGHT                0x1012
+#define CL_DEVICE_IMAGE3D_MAX_WIDTH                 0x1013
+#define CL_DEVICE_IMAGE3D_MAX_HEIGHT                0x1014
+#define CL_DEVICE_IMAGE3D_MAX_DEPTH                 0x1015
+#define CL_DEVICE_IMAGE_SUPPORT                     0x1016
+#define CL_DEVICE_MAX_PARAMETER_SIZE                0x1017
+#define CL_DEVICE_MAX_SAMPLERS                      0x1018
+#define CL_DEVICE_MEM_BASE_ADDR_ALIGN               0x1019
+#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE          0x101A
+#define CL_DEVICE_SINGLE_FP_CONFIG                  0x101B
+#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE             0x101C
+#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE         0x101D
+#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE             0x101E
+#define CL_DEVICE_GLOBAL_MEM_SIZE                   0x101F
+#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE          0x1020
+#define CL_DEVICE_MAX_CONSTANT_ARGS                 0x1021
+#define CL_DEVICE_LOCAL_MEM_TYPE                    0x1022
+#define CL_DEVICE_LOCAL_MEM_SIZE                    0x1023
+#define CL_DEVICE_ERROR_CORRECTION_SUPPORT          0x1024
+#define CL_DEVICE_PROFILING_TIMER_RESOLUTION        0x1025
+#define CL_DEVICE_ENDIAN_LITTLE                     0x1026
+#define CL_DEVICE_AVAILABLE                         0x1027
+#define CL_DEVICE_COMPILER_AVAILABLE                0x1028
+#define CL_DEVICE_EXECUTION_CAPABILITIES            0x1029
+#define CL_DEVICE_QUEUE_PROPERTIES                  0x102A
+#define CL_DEVICE_NAME                              0x102B
+#define CL_DEVICE_VENDOR                            0x102C
+#define CL_DRIVER_VERSION                           0x102D
+#define CL_DEVICE_PROFILE                           0x102E
+#define CL_DEVICE_VERSION                           0x102F
+#define CL_DEVICE_EXTENSIONS                        0x1030
+#define CL_DEVICE_PLATFORM                          0x1031
+#define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
+/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG */
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF       0x1034
+#define CL_DEVICE_HOST_UNIFIED_MEMORY               0x1035
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR          0x1036
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT         0x1037
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT           0x1038
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG          0x1039
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT         0x103A
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE        0x103B
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF          0x103C
+#define CL_DEVICE_OPENCL_C_VERSION                  0x103D
+#define CL_DEVICE_LINKER_AVAILABLE                  0x103E
+#define CL_DEVICE_BUILT_IN_KERNELS                  0x103F
+#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE             0x1040
+#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE              0x1041
+#define CL_DEVICE_PARENT_DEVICE                     0x1042
+#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES         0x1043
+#define CL_DEVICE_PARTITION_PROPERTIES              0x1044
+#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN         0x1045
+#define CL_DEVICE_PARTITION_TYPE                    0x1046
+#define CL_DEVICE_REFERENCE_COUNT                   0x1047
+#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC       0x1048
+#define CL_DEVICE_PRINTF_BUFFER_SIZE                0x1049
+#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT             0x104A
+#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT      0x104B
+
+/* cl_device_fp_config - bitfield */
+#define CL_FP_DENORM                                (1 << 0)
+#define CL_FP_INF_NAN                               (1 << 1)
+#define CL_FP_ROUND_TO_NEAREST                      (1 << 2)
+#define CL_FP_ROUND_TO_ZERO                         (1 << 3)
+#define CL_FP_ROUND_TO_INF                          (1 << 4)
+#define CL_FP_FMA                                   (1 << 5)
+#define CL_FP_SOFT_FLOAT                            (1 << 6)
+#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT         (1 << 7)
+
+/* cl_device_mem_cache_type */
+#define CL_NONE                                     0x0
+#define CL_READ_ONLY_CACHE                          0x1
+#define CL_READ_WRITE_CACHE                         0x2
+
+/* cl_device_local_mem_type */
+#define CL_LOCAL                                    0x1
+#define CL_GLOBAL                                   0x2
+
+/* cl_device_exec_capabilities - bitfield */
+#define CL_EXEC_KERNEL                              (1 << 0)
+#define CL_EXEC_NATIVE_KERNEL                       (1 << 1)
+
+/* cl_command_queue_properties - bitfield */
+#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE      (1 << 0)
+#define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
+
+/* cl_context_info  */
+#define CL_CONTEXT_REFERENCE_COUNT                  0x1080
+#define CL_CONTEXT_DEVICES                          0x1081
+#define CL_CONTEXT_PROPERTIES                       0x1082
+#define CL_CONTEXT_NUM_DEVICES                      0x1083
+
+/* cl_context_properties */
+#define CL_CONTEXT_PLATFORM                         0x1084
+#define CL_CONTEXT_INTEROP_USER_SYNC                0x1085
+    
+/* cl_device_partition_property */
+#define CL_DEVICE_PARTITION_EQUALLY                 0x1086
+#define CL_DEVICE_PARTITION_BY_COUNTS               0x1087
+#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END      0x0
+#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN      0x1088
+    
+/* cl_device_affinity_domain */
+#define CL_DEVICE_AFFINITY_DOMAIN_NUMA                     (1 << 0)
+#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE                 (1 << 1)
+#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE                 (1 << 2)
+#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE                 (1 << 3)
+#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE                 (1 << 4)
+#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE       (1 << 5)
+
+/* cl_command_queue_info */
+#define CL_QUEUE_CONTEXT                            0x1090
+#define CL_QUEUE_DEVICE                             0x1091
+#define CL_QUEUE_REFERENCE_COUNT                    0x1092
+#define CL_QUEUE_PROPERTIES                         0x1093
+
+/* cl_mem_flags - bitfield */
+#define CL_MEM_READ_WRITE                           (1 << 0)
+#define CL_MEM_WRITE_ONLY                           (1 << 1)
+#define CL_MEM_READ_ONLY                            (1 << 2)
+#define CL_MEM_USE_HOST_PTR                         (1 << 3)
+#define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
+#define CL_MEM_COPY_HOST_PTR                        (1 << 5)
+/* reserved                                         (1 << 6)    */
+#define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)
+#define CL_MEM_HOST_READ_ONLY                       (1 << 8)
+#define CL_MEM_HOST_NO_ACCESS                       (1 << 9)
+
+/* cl_mem_migration_flags - bitfield */
+#define CL_MIGRATE_MEM_OBJECT_HOST                  (1 << 0)
+#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED     (1 << 1)
+
+/* cl_channel_order */
+#define CL_R                                        0x10B0
+#define CL_A                                        0x10B1
+#define CL_RG                                       0x10B2
+#define CL_RA                                       0x10B3
+#define CL_RGB                                      0x10B4
+#define CL_RGBA                                     0x10B5
+#define CL_BGRA                                     0x10B6
+#define CL_ARGB                                     0x10B7
+#define CL_INTENSITY                                0x10B8
+#define CL_LUMINANCE                                0x10B9
+#define CL_Rx                                       0x10BA
+#define CL_RGx                                      0x10BB
+#define CL_RGBx                                     0x10BC
+#define CL_DEPTH                                    0x10BD
+#define CL_DEPTH_STENCIL                            0x10BE
+
+/* cl_channel_type */
+#define CL_SNORM_INT8                               0x10D0
+#define CL_SNORM_INT16                              0x10D1
+#define CL_UNORM_INT8                               0x10D2
+#define CL_UNORM_INT16                              0x10D3
+#define CL_UNORM_SHORT_565                          0x10D4
+#define CL_UNORM_SHORT_555                          0x10D5
+#define CL_UNORM_INT_101010                         0x10D6
+#define CL_SIGNED_INT8                              0x10D7
+#define CL_SIGNED_INT16                             0x10D8
+#define CL_SIGNED_INT32                             0x10D9
+#define CL_UNSIGNED_INT8                            0x10DA
+#define CL_UNSIGNED_INT16                           0x10DB
+#define CL_UNSIGNED_INT32                           0x10DC
+#define CL_HALF_FLOAT                               0x10DD
+#define CL_FLOAT                                    0x10DE
+#define CL_UNORM_INT24                              0x10DF
+
+/* cl_mem_object_type */
+#define CL_MEM_OBJECT_BUFFER                        0x10F0
+#define CL_MEM_OBJECT_IMAGE2D                       0x10F1
+#define CL_MEM_OBJECT_IMAGE3D                       0x10F2
+#define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3
+#define CL_MEM_OBJECT_IMAGE1D                       0x10F4
+#define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
+#define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
+
+/* cl_mem_info */
+#define CL_MEM_TYPE                                 0x1100
+#define CL_MEM_FLAGS                                0x1101
+#define CL_MEM_SIZE                                 0x1102
+#define CL_MEM_HOST_PTR                             0x1103
+#define CL_MEM_MAP_COUNT                            0x1104
+#define CL_MEM_REFERENCE_COUNT                      0x1105
+#define CL_MEM_CONTEXT                              0x1106
+#define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107
+#define CL_MEM_OFFSET                               0x1108
+
+/* cl_image_info */
+#define CL_IMAGE_FORMAT                             0x1110
+#define CL_IMAGE_ELEMENT_SIZE                       0x1111
+#define CL_IMAGE_ROW_PITCH                          0x1112
+#define CL_IMAGE_SLICE_PITCH                        0x1113
+#define CL_IMAGE_WIDTH                              0x1114
+#define CL_IMAGE_HEIGHT                             0x1115
+#define CL_IMAGE_DEPTH                              0x1116
+#define CL_IMAGE_ARRAY_SIZE                         0x1117
+#define CL_IMAGE_BUFFER                             0x1118
+#define CL_IMAGE_NUM_MIP_LEVELS                     0x1119
+#define CL_IMAGE_NUM_SAMPLES                        0x111A
+
+/* cl_addressing_mode */
+#define CL_ADDRESS_NONE                             0x1130
+#define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131
+#define CL_ADDRESS_CLAMP                            0x1132
+#define CL_ADDRESS_REPEAT                           0x1133
+#define CL_ADDRESS_MIRRORED_REPEAT                  0x1134
+
+/* cl_filter_mode */
+#define CL_FILTER_NEAREST                           0x1140
+#define CL_FILTER_LINEAR                            0x1141
+
+/* cl_sampler_info */
+#define CL_SAMPLER_REFERENCE_COUNT                  0x1150
+#define CL_SAMPLER_CONTEXT                          0x1151
+#define CL_SAMPLER_NORMALIZED_COORDS                0x1152
+#define CL_SAMPLER_ADDRESSING_MODE                  0x1153
+#define CL_SAMPLER_FILTER_MODE                      0x1154
+
+/* cl_map_flags - bitfield */
+#define CL_MAP_READ                                 (1 << 0)
+#define CL_MAP_WRITE                                (1 << 1)
+#define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)
+
+/* cl_program_info */
+#define CL_PROGRAM_REFERENCE_COUNT                  0x1160
+#define CL_PROGRAM_CONTEXT                          0x1161
+#define CL_PROGRAM_NUM_DEVICES                      0x1162
+#define CL_PROGRAM_DEVICES                          0x1163
+#define CL_PROGRAM_SOURCE                           0x1164
+#define CL_PROGRAM_BINARY_SIZES                     0x1165
+#define CL_PROGRAM_BINARIES                         0x1166
+#define CL_PROGRAM_NUM_KERNELS                      0x1167
+#define CL_PROGRAM_KERNEL_NAMES                     0x1168
+
+/* cl_program_build_info */
+#define CL_PROGRAM_BUILD_STATUS                     0x1181
+#define CL_PROGRAM_BUILD_OPTIONS                    0x1182
+#define CL_PROGRAM_BUILD_LOG                        0x1183
+#define CL_PROGRAM_BINARY_TYPE                      0x1184
+    
+/* cl_program_binary_type */
+#define CL_PROGRAM_BINARY_TYPE_NONE                 0x0
+#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT      0x1
+#define CL_PROGRAM_BINARY_TYPE_LIBRARY              0x2
+#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE           0x4
+
+/* cl_build_status */
+#define CL_BUILD_SUCCESS                            0
+#define CL_BUILD_NONE                               -1
+#define CL_BUILD_ERROR                              -2
+#define CL_BUILD_IN_PROGRESS                        -3
+
+/* cl_kernel_info */
+#define CL_KERNEL_FUNCTION_NAME                     0x1190
+#define CL_KERNEL_NUM_ARGS                          0x1191
+#define CL_KERNEL_REFERENCE_COUNT                   0x1192
+#define CL_KERNEL_CONTEXT                           0x1193
+#define CL_KERNEL_PROGRAM                           0x1194
+#define CL_KERNEL_ATTRIBUTES                        0x1195
+
+/* cl_kernel_arg_info */
+#define CL_KERNEL_ARG_ADDRESS_QUALIFIER             0x1196
+#define CL_KERNEL_ARG_ACCESS_QUALIFIER              0x1197
+#define CL_KERNEL_ARG_TYPE_NAME                     0x1198
+#define CL_KERNEL_ARG_TYPE_QUALIFIER                0x1199
+#define CL_KERNEL_ARG_NAME                          0x119A
+
+/* cl_kernel_arg_address_qualifier */
+#define CL_KERNEL_ARG_ADDRESS_GLOBAL                0x119B
+#define CL_KERNEL_ARG_ADDRESS_LOCAL                 0x119C
+#define CL_KERNEL_ARG_ADDRESS_CONSTANT              0x119D
+#define CL_KERNEL_ARG_ADDRESS_PRIVATE               0x119E
+
+/* cl_kernel_arg_access_qualifier */
+#define CL_KERNEL_ARG_ACCESS_READ_ONLY              0x11A0
+#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY             0x11A1
+#define CL_KERNEL_ARG_ACCESS_READ_WRITE             0x11A2
+#define CL_KERNEL_ARG_ACCESS_NONE                   0x11A3
+    
+/* cl_kernel_arg_type_qualifer */
+#define CL_KERNEL_ARG_TYPE_NONE                     0
+#define CL_KERNEL_ARG_TYPE_CONST                    (1 << 0)
+#define CL_KERNEL_ARG_TYPE_RESTRICT                 (1 << 1)
+#define CL_KERNEL_ARG_TYPE_VOLATILE                 (1 << 2)
+
+/* cl_kernel_work_group_info */
+#define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
+#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE           0x11B1
+#define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2
+#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
+#define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
+#define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5
+
+/* cl_event_info  */
+#define CL_EVENT_COMMAND_QUEUE                      0x11D0
+#define CL_EVENT_COMMAND_TYPE                       0x11D1
+#define CL_EVENT_REFERENCE_COUNT                    0x11D2
+#define CL_EVENT_COMMAND_EXECUTION_STATUS           0x11D3
+#define CL_EVENT_CONTEXT                            0x11D4
+
+/* cl_command_type */
+#define CL_COMMAND_NDRANGE_KERNEL                   0x11F0
+#define CL_COMMAND_TASK                             0x11F1
+#define CL_COMMAND_NATIVE_KERNEL                    0x11F2
+#define CL_COMMAND_READ_BUFFER                      0x11F3
+#define CL_COMMAND_WRITE_BUFFER                     0x11F4
+#define CL_COMMAND_COPY_BUFFER                      0x11F5
+#define CL_COMMAND_READ_IMAGE                       0x11F6
+#define CL_COMMAND_WRITE_IMAGE                      0x11F7
+#define CL_COMMAND_COPY_IMAGE                       0x11F8
+#define CL_COMMAND_COPY_IMAGE_TO_BUFFER             0x11F9
+#define CL_COMMAND_COPY_BUFFER_TO_IMAGE             0x11FA
+#define CL_COMMAND_MAP_BUFFER                       0x11FB
+#define CL_COMMAND_MAP_IMAGE                        0x11FC
+#define CL_COMMAND_UNMAP_MEM_OBJECT                 0x11FD
+#define CL_COMMAND_MARKER                           0x11FE
+#define CL_COMMAND_ACQUIRE_GL_OBJECTS               0x11FF
+#define CL_COMMAND_RELEASE_GL_OBJECTS               0x1200
+#define CL_COMMAND_READ_BUFFER_RECT                 0x1201
+#define CL_COMMAND_WRITE_BUFFER_RECT                0x1202
+#define CL_COMMAND_COPY_BUFFER_RECT                 0x1203
+#define CL_COMMAND_USER                             0x1204
+#define CL_COMMAND_BARRIER                          0x1205
+#define CL_COMMAND_MIGRATE_MEM_OBJECTS              0x1206
+#define CL_COMMAND_FILL_BUFFER                      0x1207
+#define CL_COMMAND_FILL_IMAGE                       0x1208
+
+/* command execution status */
+#define CL_COMPLETE                                 0x0
+#define CL_RUNNING                                  0x1
+#define CL_SUBMITTED                                0x2
+#define CL_QUEUED                                   0x3
+
+/* cl_buffer_create_type  */
+#define CL_BUFFER_CREATE_TYPE_REGION                0x1220
+
+/* cl_profiling_info  */
+#define CL_PROFILING_COMMAND_QUEUED                 0x1280
+#define CL_PROFILING_COMMAND_SUBMIT                 0x1281
+#define CL_PROFILING_COMMAND_START                  0x1282
+#define CL_PROFILING_COMMAND_END                    0x1283
+
+/********************************************************************************************************/
+
+/* Platform API */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformIDs(cl_uint          /* num_entries */,
+                 cl_platform_id * /* platforms */,
+                 cl_uint *        /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL 
+clGetPlatformInfo(cl_platform_id   /* platform */, 
+                  cl_platform_info /* param_name */,
+                  size_t           /* param_value_size */, 
+                  void *           /* param_value */,
+                  size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Device APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDs(cl_platform_id   /* platform */,
+               cl_device_type   /* device_type */, 
+               cl_uint          /* num_entries */, 
+               cl_device_id *   /* devices */, 
+               cl_uint *        /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceInfo(cl_device_id    /* device */,
+                cl_device_info  /* param_name */, 
+                size_t          /* param_value_size */, 
+                void *          /* param_value */,
+                size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+    
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateSubDevices(cl_device_id                         /* in_device */,
+                   const cl_device_partition_property * /* properties */,
+                   cl_uint                              /* num_devices */,
+                   cl_device_id *                       /* out_devices */,
+                   cl_uint *                            /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
+    
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2;
+    
+/* Context APIs  */
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContext(const cl_context_properties * /* properties */,
+                cl_uint                 /* num_devices */,
+                const cl_device_id *    /* devices */,
+                void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
+                void *                  /* user_data */,
+                cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContextFromType(const cl_context_properties * /* properties */,
+                        cl_device_type          /* device_type */,
+                        void (CL_CALLBACK *     /* pfn_notify*/ )(const char *, const void *, size_t, void *),
+                        void *                  /* user_data */,
+                        cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetContextInfo(cl_context         /* context */, 
+                 cl_context_info    /* param_name */, 
+                 size_t             /* param_value_size */, 
+                 void *             /* param_value */, 
+                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Command Queue APIs */
+extern CL_API_ENTRY cl_command_queue CL_API_CALL
+clCreateCommandQueue(cl_context                     /* context */, 
+                     cl_device_id                   /* device */, 
+                     cl_command_queue_properties    /* properties */,
+                     cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetCommandQueueInfo(cl_command_queue      /* command_queue */,
+                      cl_command_queue_info /* param_name */,
+                      size_t                /* param_value_size */,
+                      void *                /* param_value */,
+                      size_t *              /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Memory Object APIs */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBuffer(cl_context   /* context */,
+               cl_mem_flags /* flags */,
+               size_t       /* size */,
+               void *       /* host_ptr */,
+               cl_int *     /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateSubBuffer(cl_mem                   /* buffer */,
+                  cl_mem_flags             /* flags */,
+                  cl_buffer_create_type    /* buffer_create_type */,
+                  const void *             /* buffer_create_info */,
+                  cl_int *                 /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImage(cl_context              /* context */,
+              cl_mem_flags            /* flags */,
+              const cl_image_format * /* image_format */,
+              const cl_image_desc *   /* image_desc */, 
+              void *                  /* host_ptr */,
+              cl_int *                /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+                        
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSupportedImageFormats(cl_context           /* context */,
+                           cl_mem_flags         /* flags */,
+                           cl_mem_object_type   /* image_type */,
+                           cl_uint              /* num_entries */,
+                           cl_image_format *    /* image_formats */,
+                           cl_uint *            /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
+                                    
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetMemObjectInfo(cl_mem           /* memobj */,
+                   cl_mem_info      /* param_name */, 
+                   size_t           /* param_value_size */,
+                   void *           /* param_value */,
+                   size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetImageInfo(cl_mem           /* image */,
+               cl_image_info    /* param_name */, 
+               size_t           /* param_value_size */,
+               void *           /* param_value */,
+               size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetMemObjectDestructorCallback(  cl_mem /* memobj */, 
+                                    void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), 
+                                    void * /*user_data */ )             CL_API_SUFFIX__VERSION_1_1;  
+
+/* Sampler APIs */
+extern CL_API_ENTRY cl_sampler CL_API_CALL
+clCreateSampler(cl_context          /* context */,
+                cl_bool             /* normalized_coords */, 
+                cl_addressing_mode  /* addressing_mode */, 
+                cl_filter_mode      /* filter_mode */,
+                cl_int *            /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSamplerInfo(cl_sampler         /* sampler */,
+                 cl_sampler_info    /* param_name */,
+                 size_t             /* param_value_size */,
+                 void *             /* param_value */,
+                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+                            
+/* Program Object APIs  */
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithSource(cl_context        /* context */,
+                          cl_uint           /* count */,
+                          const char **     /* strings */,
+                          const size_t *    /* lengths */,
+                          cl_int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBinary(cl_context                     /* context */,
+                          cl_uint                        /* num_devices */,
+                          const cl_device_id *           /* device_list */,
+                          const size_t *                 /* lengths */,
+                          const unsigned char **         /* binaries */,
+                          cl_int *                       /* binary_status */,
+                          cl_int *                       /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBuiltInKernels(cl_context            /* context */,
+                                  cl_uint               /* num_devices */,
+                                  const cl_device_id *  /* device_list */,
+                                  const char *          /* kernel_names */,
+                                  cl_int *              /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clBuildProgram(cl_program           /* program */,
+               cl_uint              /* num_devices */,
+               const cl_device_id * /* device_list */,
+               const char *         /* options */, 
+               void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+               void *               /* user_data */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCompileProgram(cl_program           /* program */,
+                 cl_uint              /* num_devices */,
+                 const cl_device_id * /* device_list */,
+                 const char *         /* options */, 
+                 cl_uint              /* num_input_headers */,
+                 const cl_program *   /* input_headers */,
+                 const char **        /* header_include_names */,
+                 void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+                 void *               /* user_data */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clLinkProgram(cl_context           /* context */,
+              cl_uint              /* num_devices */,
+              const cl_device_id * /* device_list */,
+              const char *         /* options */, 
+              cl_uint              /* num_input_programs */,
+              const cl_program *   /* input_programs */,
+              void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+              void *               /* user_data */,
+              cl_int *             /* errcode_ret */ ) CL_API_SUFFIX__VERSION_1_2;
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clUnloadPlatformCompiler(cl_platform_id /* platform */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramInfo(cl_program         /* program */,
+                 cl_program_info    /* param_name */,
+                 size_t             /* param_value_size */,
+                 void *             /* param_value */,
+                 size_t *           /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramBuildInfo(cl_program            /* program */,
+                      cl_device_id          /* device */,
+                      cl_program_build_info /* param_name */,
+                      size_t                /* param_value_size */,
+                      void *                /* param_value */,
+                      size_t *              /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+                            
+/* Kernel Object APIs */
+extern CL_API_ENTRY cl_kernel CL_API_CALL
+clCreateKernel(cl_program      /* program */,
+               const char *    /* kernel_name */,
+               cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateKernelsInProgram(cl_program     /* program */,
+                         cl_uint        /* num_kernels */,
+                         cl_kernel *    /* kernels */,
+                         cl_uint *      /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainKernel(cl_kernel    /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseKernel(cl_kernel   /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArg(cl_kernel    /* kernel */,
+               cl_uint      /* arg_index */,
+               size_t       /* arg_size */,
+               const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelInfo(cl_kernel       /* kernel */,
+                cl_kernel_info  /* param_name */,
+                size_t          /* param_value_size */,
+                void *          /* param_value */,
+                size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelArgInfo(cl_kernel       /* kernel */,
+                   cl_uint         /* arg_indx */,
+                   cl_kernel_arg_info  /* param_name */,
+                   size_t          /* param_value_size */,
+                   void *          /* param_value */,
+                   size_t *        /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelWorkGroupInfo(cl_kernel                  /* kernel */,
+                         cl_device_id               /* device */,
+                         cl_kernel_work_group_info  /* param_name */,
+                         size_t                     /* param_value_size */,
+                         void *                     /* param_value */,
+                         size_t *                   /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Event Object APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clWaitForEvents(cl_uint             /* num_events */,
+                const cl_event *    /* event_list */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventInfo(cl_event         /* event */,
+               cl_event_info    /* param_name */,
+               size_t           /* param_value_size */,
+               void *           /* param_value */,
+               size_t *         /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+                            
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateUserEvent(cl_context    /* context */,
+                  cl_int *      /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;               
+                            
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetUserEventStatus(cl_event   /* event */,
+                     cl_int     /* execution_status */) CL_API_SUFFIX__VERSION_1_1;
+                     
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetEventCallback( cl_event    /* event */,
+                    cl_int      /* command_exec_callback_type */,
+                    void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
+                    void *      /* user_data */) CL_API_SUFFIX__VERSION_1_1;
+
+/* Profiling APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventProfilingInfo(cl_event            /* event */,
+                        cl_profiling_info   /* param_name */,
+                        size_t              /* param_value_size */,
+                        void *              /* param_value */,
+                        size_t *            /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+                                
+/* Flush and Finish APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0;
+
+/* Enqueued Commands APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBuffer(cl_command_queue    /* command_queue */,
+                    cl_mem              /* buffer */,
+                    cl_bool             /* blocking_read */,
+                    size_t              /* offset */,
+                    size_t              /* size */, 
+                    void *              /* ptr */,
+                    cl_uint             /* num_events_in_wait_list */,
+                    const cl_event *    /* event_wait_list */,
+                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
+                            
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBufferRect(cl_command_queue    /* command_queue */,
+                        cl_mem              /* buffer */,
+                        cl_bool             /* blocking_read */,
+                        const size_t *      /* buffer_offset */,
+                        const size_t *      /* host_offset */, 
+                        const size_t *      /* region */,
+                        size_t              /* buffer_row_pitch */,
+                        size_t              /* buffer_slice_pitch */,
+                        size_t              /* host_row_pitch */,
+                        size_t              /* host_slice_pitch */,                        
+                        void *              /* ptr */,
+                        cl_uint             /* num_events_in_wait_list */,
+                        const cl_event *    /* event_wait_list */,
+                        cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
+                            
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBuffer(cl_command_queue   /* command_queue */, 
+                     cl_mem             /* buffer */, 
+                     cl_bool            /* blocking_write */, 
+                     size_t             /* offset */, 
+                     size_t             /* size */, 
+                     const void *       /* ptr */, 
+                     cl_uint            /* num_events_in_wait_list */, 
+                     const cl_event *   /* event_wait_list */, 
+                     cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_0;
+                            
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBufferRect(cl_command_queue    /* command_queue */,
+                         cl_mem              /* buffer */,
+                         cl_bool             /* blocking_write */,
+                         const size_t *      /* buffer_offset */,
+                         const size_t *      /* host_offset */, 
+                         const size_t *      /* region */,
+                         size_t              /* buffer_row_pitch */,
+                         size_t              /* buffer_slice_pitch */,
+                         size_t              /* host_row_pitch */,
+                         size_t              /* host_slice_pitch */,                        
+                         const void *        /* ptr */,
+                         cl_uint             /* num_events_in_wait_list */,
+                         const cl_event *    /* event_wait_list */,
+                         cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
+                            
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillBuffer(cl_command_queue   /* command_queue */,
+                    cl_mem             /* buffer */, 
+                    const void *       /* pattern */, 
+                    size_t             /* pattern_size */, 
+                    size_t             /* offset */, 
+                    size_t             /* size */, 
+                    cl_uint            /* num_events_in_wait_list */, 
+                    const cl_event *   /* event_wait_list */, 
+                    cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_2;
+                            
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBuffer(cl_command_queue    /* command_queue */, 
+                    cl_mem              /* src_buffer */,
+                    cl_mem              /* dst_buffer */, 
+                    size_t              /* src_offset */,
+                    size_t              /* dst_offset */,
+                    size_t              /* size */, 
+                    cl_uint             /* num_events_in_wait_list */,
+                    const cl_event *    /* event_wait_list */,
+                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
+                            
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferRect(cl_command_queue    /* command_queue */, 
+                        cl_mem              /* src_buffer */,
+                        cl_mem              /* dst_buffer */, 
+                        const size_t *      /* src_origin */,
+                        const size_t *      /* dst_origin */,
+                        const size_t *      /* region */, 
+                        size_t              /* src_row_pitch */,
+                        size_t              /* src_slice_pitch */,
+                        size_t              /* dst_row_pitch */,
+                        size_t              /* dst_slice_pitch */,
+                        cl_uint             /* num_events_in_wait_list */,
+                        const cl_event *    /* event_wait_list */,
+                        cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_1;
+                            
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadImage(cl_command_queue     /* command_queue */,
+                   cl_mem               /* image */,
+                   cl_bool              /* blocking_read */, 
+                   const size_t *       /* origin[3] */,
+                   const size_t *       /* region[3] */,
+                   size_t               /* row_pitch */,
+                   size_t               /* slice_pitch */, 
+                   void *               /* ptr */,
+                   cl_uint              /* num_events_in_wait_list */,
+                   const cl_event *     /* event_wait_list */,
+                   cl_event *           /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteImage(cl_command_queue    /* command_queue */,
+                    cl_mem              /* image */,
+                    cl_bool             /* blocking_write */, 
+                    const size_t *      /* origin[3] */,
+                    const size_t *      /* region[3] */,
+                    size_t              /* input_row_pitch */,
+                    size_t              /* input_slice_pitch */, 
+                    const void *        /* ptr */,
+                    cl_uint             /* num_events_in_wait_list */,
+                    const cl_event *    /* event_wait_list */,
+                    cl_event *          /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillImage(cl_command_queue   /* command_queue */,
+                   cl_mem             /* image */, 
+                   const void *       /* fill_color */, 
+                   const size_t *     /* origin[3] */, 
+                   const size_t *     /* region[3] */, 
+                   cl_uint            /* num_events_in_wait_list */, 
+                   const cl_event *   /* event_wait_list */, 
+                   cl_event *         /* event */) CL_API_SUFFIX__VERSION_1_2;
+                            
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImage(cl_command_queue     /* command_queue */,
+                   cl_mem               /* src_image */,
+                   cl_mem               /* dst_image */, 
+                   const size_t *       /* src_origin[3] */,
+                   const size_t *       /* dst_origin[3] */,
+                   const size_t *       /* region[3] */, 
+                   cl_uint              /* num_events_in_wait_list */,
+                   const cl_event *     /* event_wait_list */,
+                   cl_event *           /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */,
+                           cl_mem           /* src_image */,
+                           cl_mem           /* dst_buffer */, 
+                           const size_t *   /* src_origin[3] */,
+                           const size_t *   /* region[3] */, 
+                           size_t           /* dst_offset */,
+                           cl_uint          /* num_events_in_wait_list */,
+                           const cl_event * /* event_wait_list */,
+                           cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */,
+                           cl_mem           /* src_buffer */,
+                           cl_mem           /* dst_image */, 
+                           size_t           /* src_offset */,
+                           const size_t *   /* dst_origin[3] */,
+                           const size_t *   /* region[3] */, 
+                           cl_uint          /* num_events_in_wait_list */,
+                           const cl_event * /* event_wait_list */,
+                           cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapBuffer(cl_command_queue /* command_queue */,
+                   cl_mem           /* buffer */,
+                   cl_bool          /* blocking_map */, 
+                   cl_map_flags     /* map_flags */,
+                   size_t           /* offset */,
+                   size_t           /* size */,
+                   cl_uint          /* num_events_in_wait_list */,
+                   const cl_event * /* event_wait_list */,
+                   cl_event *       /* event */,
+                   cl_int *         /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapImage(cl_command_queue  /* command_queue */,
+                  cl_mem            /* image */, 
+                  cl_bool           /* blocking_map */, 
+                  cl_map_flags      /* map_flags */, 
+                  const size_t *    /* origin[3] */,
+                  const size_t *    /* region[3] */,
+                  size_t *          /* image_row_pitch */,
+                  size_t *          /* image_slice_pitch */,
+                  cl_uint           /* num_events_in_wait_list */,
+                  const cl_event *  /* event_wait_list */,
+                  cl_event *        /* event */,
+                  cl_int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueUnmapMemObject(cl_command_queue /* command_queue */,
+                        cl_mem           /* memobj */,
+                        void *           /* mapped_ptr */,
+                        cl_uint          /* num_events_in_wait_list */,
+                        const cl_event *  /* event_wait_list */,
+                        cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMigrateMemObjects(cl_command_queue       /* command_queue */,
+                           cl_uint                /* num_mem_objects */,
+                           const cl_mem *         /* mem_objects */,
+                           cl_mem_migration_flags /* flags */,
+                           cl_uint                /* num_events_in_wait_list */,
+                           const cl_event *       /* event_wait_list */,
+                           cl_event *             /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
+                       cl_kernel        /* kernel */,
+                       cl_uint          /* work_dim */,
+                       const size_t *   /* global_work_offset */,
+                       const size_t *   /* global_work_size */,
+                       const size_t *   /* local_work_size */,
+                       cl_uint          /* num_events_in_wait_list */,
+                       const cl_event * /* event_wait_list */,
+                       cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueTask(cl_command_queue  /* command_queue */,
+              cl_kernel         /* kernel */,
+              cl_uint           /* num_events_in_wait_list */,
+              const cl_event *  /* event_wait_list */,
+              cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNativeKernel(cl_command_queue  /* command_queue */,
+                      void (CL_CALLBACK * /* user_func */)(void *),
+                      void *            /* args */,
+                      size_t            /* cb_args */, 
+                      cl_uint           /* num_mem_objects */,
+                      const cl_mem *    /* mem_list */,
+                      const void **     /* args_mem_loc */,
+                      cl_uint           /* num_events_in_wait_list */,
+                      const cl_event *  /* event_wait_list */,
+                      cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */,
+                            cl_uint           /* num_events_in_wait_list */,
+                            const cl_event *  /* event_wait_list */,
+                            cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */,
+                             cl_uint           /* num_events_in_wait_list */,
+                             const cl_event *  /* event_wait_list */,
+                             cl_event *        /* event */) CL_API_SUFFIX__VERSION_1_2;
+
+
+/* Extension function access
+ *
+ * Returns the extension function address for the given function name,
+ * or NULL if a valid function cannot be found.  The client must
+ * check that the address is not NULL before using or
+ * calling the returned function address.
+ */
+extern CL_API_ENTRY void * CL_API_CALL 
+clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */,
+                                         const char *   /* func_name */) CL_API_SUFFIX__VERSION_1_2;
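+
+/* A minimal usage sketch; the platform variable and the queried extension
+ * name are assumptions for illustration only, not taken from this header:
+ *
+ *   void *addr = clGetExtensionFunctionAddressForPlatform(platform,
+ *                                                         "clSomeExtensionKHR");
+ *   if (addr != NULL) {
+ *       // Cast addr to the prototype documented by that extension before
+ *       // calling it; a NULL return means the extension function is not
+ *       // available on this platform.
+ *   }
+ */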
+    
+
+/* Deprecated OpenCL 1.1 APIs */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateImage2D(cl_context              /* context */,
+                cl_mem_flags            /* flags */,
+                const cl_image_format * /* image_format */,
+                size_t                  /* image_width */,
+                size_t                  /* image_height */,
+                size_t                  /* image_row_pitch */, 
+                void *                  /* host_ptr */,
+                cl_int *                /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+    
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateImage3D(cl_context              /* context */,
+                cl_mem_flags            /* flags */,
+                const cl_image_format * /* image_format */,
+                size_t                  /* image_width */, 
+                size_t                  /* image_height */,
+                size_t                  /* image_depth */, 
+                size_t                  /* image_row_pitch */, 
+                size_t                  /* image_slice_pitch */, 
+                void *                  /* host_ptr */,
+                cl_int *                /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+    
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueMarker(cl_command_queue    /* command_queue */,
+                cl_event *          /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+    
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueWaitForEvents(cl_command_queue /* command_queue */,
+                        cl_uint          /* num_events */,
+                        const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+    
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+    
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL
+clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_H */
+
diff --git a/include/CL/cl.hpp b/include/CL/cl.hpp
new file mode 100644
index 0000000..38fac19
--- /dev/null
+++ b/include/CL/cl.hpp
@@ -0,0 +1,12452 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2013 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/*! \file
+ *
+ *   \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33) and 
+ *       OpenCL 1.2 (rev 15)    
+ *   \author Benedict R. Gaster, Laurent Morichetti and Lee Howes
+ *   
+ *   Additions and fixes from:
+ *       Brian Cole, March 3rd 2010 and April 2012 
+ *       Matt Gruenke, April 2012.
+ *       Bruce Merry, February 2013.
+ *       Tom Deakin and Simon McIntosh-Smith, July 2013
+ *   
+ *   \version 1.2.6
+ *   \date August 2013
+ *
+ *   Optional extension support
+ *
+ *         cl
+ *         cl_ext_device_fission
+ *				#define USE_CL_DEVICE_FISSION
+ */
+
+/*! \mainpage
+ * \section intro Introduction
+ * For many large applications C++ is the language of choice and so it seems
+ * reasonable to define C++ bindings for OpenCL.
+ *
+ *
+ * The interface is contained within a single C++ header file \em cl.hpp and all
+ * definitions are contained within the namespace \em cl. There is no additional
+ * requirement to include \em cl.h; to use either the C++ or original C
+ * bindings it is enough to simply include \em cl.hpp.
+ *
+ * The bindings themselves are lightweight and correspond closely to the
+ * underlying C API. Using the C++ bindings introduces no additional execution
+ * overhead.
+ *
+ * For detailed documentation on the bindings see:
+ *
+ * The OpenCL C++ Wrapper API 1.2 (revision 09)
+ *  http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.2.pdf
+ *
+ * \section example Example
+ *
+ * The following example shows a general use case for the C++
+ * bindings, including support for the optional exception feature and
+ * also the supplied vector and string classes; see the following sections
+ * for descriptions of these features.
+ *
+ * \code
+ * #define __CL_ENABLE_EXCEPTIONS
+ * 
+ * #if defined(__APPLE__) || defined(__MACOSX)
+ * #include <OpenCL/cl.hpp>
+ * #else
+ * #include <CL/cl.hpp>
+ * #endif
+ * #include <cstdio>
+ * #include <cstdlib>
+ * #include <iostream>
+ * 
+ *  const char * helloStr  = "__kernel void "
+ *                           "hello(void) "
+ *                           "{ "
+ *                           "  "
+ *                           "} ";
+ * 
+ *  int
+ *  main(void)
+ *  {
+ *     cl_int err = CL_SUCCESS;
+ *     try {
+ *
+ *       std::vector<cl::Platform> platforms;
+ *       cl::Platform::get(&platforms);
+ *       if (platforms.size() == 0) {
+ *           std::cout << "Platform size 0\n";
+ *           return -1;
+ *       }
+ *
+ *       cl_context_properties properties[] = 
+ *          { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
+ *       cl::Context context(CL_DEVICE_TYPE_CPU, properties); 
+ * 
+ *       std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
+ * 
+ *       cl::Program::Sources source(1,
+ *           std::make_pair(helloStr,strlen(helloStr)));
+ *       cl::Program program_ = cl::Program(context, source);
+ *       program_.build(devices);
+ * 
+ *       cl::Kernel kernel(program_, "hello", &err);
+ * 
+ *       cl::Event event;
+ *       cl::CommandQueue queue(context, devices[0], 0, &err);
+ *       queue.enqueueNDRangeKernel(
+ *           kernel, 
+ *           cl::NullRange, 
+ *           cl::NDRange(4,4),
+ *           cl::NullRange,
+ *           NULL,
+ *           &event); 
+ * 
+ *       event.wait();
+ *     }
+ *     catch (cl::Error err) {
+ *        std::cerr 
+ *           << "ERROR: "
+ *           << err.what()
+ *           << "("
+ *           << err.err()
+ *           << ")"
+ *           << std::endl;
+ *     }
+ * 
+ *    return EXIT_SUCCESS;
+ *  }
+ * 
+ * \endcode
+ *
+ */
+#ifndef CL_HPP_
+#define CL_HPP_
+
+#ifdef _WIN32
+
+#include <windows.h>
+#include <malloc.h>
+#include <iterator>
+#include <intrin.h>
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+#include <exception>
+#endif // #if defined(__CL_ENABLE_EXCEPTIONS)
+
+#pragma push_macro("max")
+#undef max
+#if defined(USE_DX_INTEROP)
+#include <CL/cl_d3d10.h>
+#include <CL/cl_dx9_media_sharing.h>
+#endif
+#endif // _WIN32
+
+// 
+#if defined(USE_CL_DEVICE_FISSION)
+#include <CL/cl_ext.h>
+#endif
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenGL/OpenGL.h>
+#include <OpenCL/opencl.h>
+#include <libkern/OSAtomic.h>
+#else
+#include <GL/gl.h>
+#include <CL/opencl.h>
+#endif // !__APPLE__
+
+// To avoid accidentally taking ownership of core OpenCL types
+// such as cl_kernel constructors are made explicit
+// under OpenCL 1.2
+#if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define __CL_EXPLICIT_CONSTRUCTORS explicit
+#else // #if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define __CL_EXPLICIT_CONSTRUCTORS
+#endif // #if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+
+// Define deprecated prefixes and suffixes to ensure compilation
+// in case they are not pre-defined
+#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED  
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
+#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
+
+#if !defined(CL_CALLBACK)
+#define CL_CALLBACK
+#endif //CL_CALLBACK
+
+#include <utility>
+#include <limits>
+
+#if !defined(__NO_STD_VECTOR)
+#include <vector>
+#endif
+
+#if !defined(__NO_STD_STRING)
+#include <string>
+#endif 
+
+#if defined(linux) || defined(__APPLE__) || defined(__MACOSX)
+#include <alloca.h>
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#endif // linux
+
+#include <cstring>
+
+
+/*! \namespace cl
+ *
+ * \brief The OpenCL C++ bindings are defined within this namespace.
+ *
+ */
+namespace cl {
+
+class Memory;
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
+#define __INIT_CL_EXT_FCN_PTR(name) \
+    if(!pfn_##name) { \
+        pfn_##name = (PFN_##name) \
+            clGetExtensionFunctionAddress(#name); \
+        if(!pfn_##name) { \
+        } \
+    }
+#endif // #if defined(CL_VERSION_1_1)
+
+#if defined(CL_VERSION_1_2)
+#define __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, name) \
+    if(!pfn_##name) { \
+        pfn_##name = (PFN_##name) \
+            clGetExtensionFunctionAddressForPlatform(platform, #name); \
+        if(!pfn_##name) { \
+        } \
+    }
+#endif // #if defined(CL_VERSION_1_2)
+
+class Program;
+class Device;
+class Context;
+class CommandQueue;
+class Memory;
+class Buffer;
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+/*! \brief Exception class 
+ * 
+ *  This may be thrown by API functions when __CL_ENABLE_EXCEPTIONS is defined.
+ */
+class Error : public std::exception
+{
+private:
+    cl_int err_;
+    const char * errStr_;
+public:
+    /*! \brief Create a new CL error exception for a given error code
+     *  and corresponding message.
+     * 
+     *  \param err error code value.
+     *
+     *  \param errStr a descriptive string that must remain in scope until
+     *                handling of the exception has concluded.  If set, it
+     *                will be returned by what().
+     */
+    Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr)
+    {}
+
+    ~Error() throw() {}
+
+    /*! \brief Get error string associated with exception
+     *
+     * \return A memory pointer to the error message string.
+     */
+    virtual const char * what() const throw ()
+    {
+        if (errStr_ == NULL) {
+            return "empty";
+        }
+        else {
+            return errStr_;
+        }
+    }
+
+    /*! \brief Get error code associated with exception
+     *
+     *  \return The error code.
+     */
+    cl_int err(void) const { return err_; }
+};
+
+#define __ERR_STR(x) #x
+#else
+#define __ERR_STR(x) NULL
+#endif // __CL_ENABLE_EXCEPTIONS
+
+
+namespace detail
+{
+#if defined(__CL_ENABLE_EXCEPTIONS)
+static inline cl_int errHandler (
+    cl_int err,
+    const char * errStr = NULL)
+{
+    if (err != CL_SUCCESS) {
+        throw Error(err, errStr);
+    }
+    return err;
+}
+#else
+static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
+{
+    (void) errStr; // suppress unused variable warning
+    return err;
+}
+#endif // __CL_ENABLE_EXCEPTIONS
+}
+
+
+
+//! \cond DOXYGEN_DETAIL
+#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
+#define __GET_DEVICE_INFO_ERR               __ERR_STR(clGetDeviceInfo)
+#define __GET_PLATFORM_INFO_ERR             __ERR_STR(clGetPlatformInfo)
+#define __GET_DEVICE_IDS_ERR                __ERR_STR(clGetDeviceIDs)
+#define __GET_PLATFORM_IDS_ERR              __ERR_STR(clGetPlatformIDs)
+#define __GET_CONTEXT_INFO_ERR              __ERR_STR(clGetContextInfo)
+#define __GET_EVENT_INFO_ERR                __ERR_STR(clGetEventInfo)
+#define __GET_EVENT_PROFILE_INFO_ERR        __ERR_STR(clGetEventProfilingInfo)
+#define __GET_MEM_OBJECT_INFO_ERR           __ERR_STR(clGetMemObjectInfo)
+#define __GET_IMAGE_INFO_ERR                __ERR_STR(clGetImageInfo)
+#define __GET_SAMPLER_INFO_ERR              __ERR_STR(clGetSamplerInfo)
+#define __GET_KERNEL_INFO_ERR               __ERR_STR(clGetKernelInfo)
+#if defined(CL_VERSION_1_2)
+#define __GET_KERNEL_ARG_INFO_ERR               __ERR_STR(clGetKernelArgInfo)
+#endif // #if defined(CL_VERSION_1_2)
+#define __GET_KERNEL_WORK_GROUP_INFO_ERR    __ERR_STR(clGetKernelWorkGroupInfo)
+#define __GET_PROGRAM_INFO_ERR              __ERR_STR(clGetProgramInfo)
+#define __GET_PROGRAM_BUILD_INFO_ERR        __ERR_STR(clGetProgramBuildInfo)
+#define __GET_COMMAND_QUEUE_INFO_ERR        __ERR_STR(clGetCommandQueueInfo)
+
+#define __CREATE_CONTEXT_ERR                __ERR_STR(clCreateContext)
+#define __CREATE_CONTEXT_FROM_TYPE_ERR      __ERR_STR(clCreateContextFromType)
+#define __GET_SUPPORTED_IMAGE_FORMATS_ERR   __ERR_STR(clGetSupportedImageFormats)
+
+#define __CREATE_BUFFER_ERR                 __ERR_STR(clCreateBuffer)
+#define __COPY_ERR                          __ERR_STR(cl::copy)
+#define __CREATE_SUBBUFFER_ERR              __ERR_STR(clCreateSubBuffer)
+#define __CREATE_GL_BUFFER_ERR              __ERR_STR(clCreateFromGLBuffer)
+#define __CREATE_GL_RENDER_BUFFER_ERR       __ERR_STR(clCreateFromGLRenderbuffer)
+#define __GET_GL_OBJECT_INFO_ERR            __ERR_STR(clGetGLObjectInfo)
+#if defined(CL_VERSION_1_2)
+#define __CREATE_IMAGE_ERR                  __ERR_STR(clCreateImage)
+#define __CREATE_GL_TEXTURE_ERR             __ERR_STR(clCreateFromGLTexture)
+#define __IMAGE_DIMENSION_ERR               __ERR_STR(Incorrect image dimensions)
+#endif // #if defined(CL_VERSION_1_2)
+#define __CREATE_SAMPLER_ERR                __ERR_STR(clCreateSampler)
+#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback)
+
+#define __CREATE_USER_EVENT_ERR             __ERR_STR(clCreateUserEvent)
+#define __SET_USER_EVENT_STATUS_ERR         __ERR_STR(clSetUserEventStatus)
+#define __SET_EVENT_CALLBACK_ERR            __ERR_STR(clSetEventCallback)
+#define __WAIT_FOR_EVENTS_ERR               __ERR_STR(clWaitForEvents)
+
+#define __CREATE_KERNEL_ERR                 __ERR_STR(clCreateKernel)
+#define __SET_KERNEL_ARGS_ERR               __ERR_STR(clSetKernelArg)
+#define __CREATE_PROGRAM_WITH_SOURCE_ERR    __ERR_STR(clCreateProgramWithSource)
+#define __CREATE_PROGRAM_WITH_BINARY_ERR    __ERR_STR(clCreateProgramWithBinary)
+#if defined(CL_VERSION_1_2)
+#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR    __ERR_STR(clCreateProgramWithBuiltInKernels)
+#endif // #if defined(CL_VERSION_1_2)
+#define __BUILD_PROGRAM_ERR                 __ERR_STR(clBuildProgram)
+#if defined(CL_VERSION_1_2)
+#define __COMPILE_PROGRAM_ERR                  __ERR_STR(clCompileProgram)
+
+#endif // #if defined(CL_VERSION_1_2)
+#define __CREATE_KERNELS_IN_PROGRAM_ERR     __ERR_STR(clCreateKernelsInProgram)
+
+#define __CREATE_COMMAND_QUEUE_ERR          __ERR_STR(clCreateCommandQueue)
+#define __SET_COMMAND_QUEUE_PROPERTY_ERR    __ERR_STR(clSetCommandQueueProperty)
+#define __ENQUEUE_READ_BUFFER_ERR           __ERR_STR(clEnqueueReadBuffer)
+#define __ENQUEUE_READ_BUFFER_RECT_ERR      __ERR_STR(clEnqueueReadBufferRect)
+#define __ENQUEUE_WRITE_BUFFER_ERR          __ERR_STR(clEnqueueWriteBuffer)
+#define __ENQUEUE_WRITE_BUFFER_RECT_ERR     __ERR_STR(clEnqueueWriteBufferRect)
+#define __ENQEUE_COPY_BUFFER_ERR            __ERR_STR(clEnqueueCopyBuffer)
+#define __ENQEUE_COPY_BUFFER_RECT_ERR       __ERR_STR(clEnqueueCopyBufferRect)
+#define __ENQUEUE_FILL_BUFFER_ERR           __ERR_STR(clEnqueueFillBuffer)
+#define __ENQUEUE_READ_IMAGE_ERR            __ERR_STR(clEnqueueReadImage)
+#define __ENQUEUE_WRITE_IMAGE_ERR           __ERR_STR(clEnqueueWriteImage)
+#define __ENQUEUE_COPY_IMAGE_ERR            __ERR_STR(clEnqueueCopyImage)
+#define __ENQUEUE_FILL_IMAGE_ERR            __ERR_STR(clEnqueueFillImage)
+#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR  __ERR_STR(clEnqueueCopyImageToBuffer)
+#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR  __ERR_STR(clEnqueueCopyBufferToImage)
+#define __ENQUEUE_MAP_BUFFER_ERR            __ERR_STR(clEnqueueMapBuffer)
+#define __ENQUEUE_MAP_IMAGE_ERR             __ERR_STR(clEnqueueMapImage)
+#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR      __ERR_STR(clEnqueueUnmapMemObject)
+#define __ENQUEUE_NDRANGE_KERNEL_ERR        __ERR_STR(clEnqueueNDRangeKernel)
+#define __ENQUEUE_TASK_ERR                  __ERR_STR(clEnqueueTask)
+#define __ENQUEUE_NATIVE_KERNEL             __ERR_STR(clEnqueueNativeKernel)
+#if defined(CL_VERSION_1_2)
+#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR   __ERR_STR(clEnqueueMigrateMemObjects)
+#endif // #if defined(CL_VERSION_1_2)
+
+#define __ENQUEUE_ACQUIRE_GL_ERR            __ERR_STR(clEnqueueAcquireGLObjects)
+#define __ENQUEUE_RELEASE_GL_ERR            __ERR_STR(clEnqueueReleaseGLObjects)
+
+
+#define __RETAIN_ERR                        __ERR_STR(Retain Object)
+#define __RELEASE_ERR                       __ERR_STR(Release Object)
+#define __FLUSH_ERR                         __ERR_STR(clFlush)
+#define __FINISH_ERR                        __ERR_STR(clFinish)
+#define __VECTOR_CAPACITY_ERR               __ERR_STR(Vector capacity error)
+
+/**
+ * CL 1.2 version that uses device fission.
+ */
+#if defined(CL_VERSION_1_2)
+#define __CREATE_SUB_DEVICES                __ERR_STR(clCreateSubDevices)
+#else
+#define __CREATE_SUB_DEVICES                __ERR_STR(clCreateSubDevicesEXT)
+#endif // #if defined(CL_VERSION_1_2)
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
+#define __ENQUEUE_MARKER_ERR                __ERR_STR(clEnqueueMarker)
+#define __ENQUEUE_WAIT_FOR_EVENTS_ERR       __ERR_STR(clEnqueueWaitForEvents)
+#define __ENQUEUE_BARRIER_ERR               __ERR_STR(clEnqueueBarrier)
+#define __UNLOAD_COMPILER_ERR               __ERR_STR(clUnloadCompiler)
+#define __CREATE_GL_TEXTURE_2D_ERR          __ERR_STR(clCreateFromGLTexture2D)
+#define __CREATE_GL_TEXTURE_3D_ERR          __ERR_STR(clCreateFromGLTexture3D)
+#define __CREATE_IMAGE2D_ERR                __ERR_STR(clCreateImage2D)
+#define __CREATE_IMAGE3D_ERR                __ERR_STR(clCreateImage3D)
+#endif // #if defined(CL_VERSION_1_1)
+
+#endif // __CL_USER_OVERRIDE_ERROR_STRINGS
+//! \endcond
+
+/**
+ * CL 1.2 marker and barrier commands
+ */
+#if defined(CL_VERSION_1_2)
+#define __ENQUEUE_MARKER_WAIT_LIST_ERR                __ERR_STR(clEnqueueMarkerWithWaitList)
+#define __ENQUEUE_BARRIER_WAIT_LIST_ERR               __ERR_STR(clEnqueueBarrierWithWaitList)
+#endif // #if defined(CL_VERSION_1_2)
+
+#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING)
+typedef std::string STRING_CLASS;
+#elif !defined(__USE_DEV_STRING) 
+
+/*! \class string
+ * \brief Simple string class that provides a limited subset of std::string
+ * functionality but avoids many of the issues that come with that class.
+ *
+ *  \note Deprecated. Please use std::string as default or
+ *  re-define the string class to match the std::string
+ *  interface by defining STRING_CLASS
+ */
+class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED string CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+{
+private:
+    ::size_t size_;
+    char * str_;
+public:
+    //! \brief Constructs an empty string, allocating no memory.
+    string(void) : size_(0), str_(NULL)
+    {
+    }
+
+    /*! \brief Constructs a string populated from an arbitrary value of
+     *  specified size.
+     * 
+     *  An extra '\0' is added, in case none was contained in str.
+     *
+     *  \param str the initial value of the string instance.  Note that '\0'     
+     *             characters receive no special treatment.  If NULL,
+     *             the string is left empty, with a size of 0.
+     *
+     *  \param size the number of characters to copy from str.
+     */
+    string(const char * str, ::size_t size) :
+        size_(size),
+        str_(NULL)
+    {
+        if( size > 0 ) {
+            str_ = new char[size_+1];
+            if (str_ != NULL) {
+                memcpy(str_, str, size_  * sizeof(char));
+                str_[size_] = '\0';
+            }
+            else {
+                size_ = 0;
+            }
+        }
+    }
+
+    /*! \brief Constructs a string populated from a null-terminated value.
+     *
+     *  \param str the null-terminated initial value of the string instance.
+     *             If NULL, the string is left empty, with a size of 0.
+     */
+    string(const char * str) :
+        size_(0),
+        str_(NULL)
+    {
+        if( str ) {
+            size_= ::strlen(str);
+        }
+        if( size_ > 0 ) {
+            str_ = new char[size_ + 1];
+            if (str_ != NULL) {
+                memcpy(str_, str, (size_ + 1) * sizeof(char));
+            }
+        }
+    }
+
+    void resize( ::size_t n )
+    {
+        if( size_ == n ) {
+            return;
+        }
+        if (n == 0) {
+            if( str_ ) {
+                delete [] str_;
+            }
+            str_ = NULL;
+            size_ = 0;
+        } 
+        else {
+            char *newString = new char[n + 1];
+            int copySize = n;
+            if( size_ < n ) {
+                copySize = size_;
+            }
+            size_ = n;
+            
+            if(str_) {
+                memcpy(newString, str_, (copySize + 1) * sizeof(char));
+            }
+            if( copySize < size_ ) {
+                memset(newString + copySize, 0, size_ - copySize);
+            }
+            newString[size_] = '\0';
+
+            delete [] str_;
+            str_ = newString;
+        }
+    }
+
+    const char& operator[] ( ::size_t pos ) const
+    {
+        return str_[pos];
+    }
+
+    char& operator[] ( ::size_t pos )
+    {
+        return str_[pos];
+    }
+
+    /*! \brief Copies the value of another string to this one.
+     *
+     *  \param rhs the string to copy.
+     *
+     *  \returns a reference to the modified instance.
+     */
+    string& operator=(const string& rhs)
+    {
+        if (this == &rhs) {
+            return *this;
+        }
+
+        if( str_ != NULL ) {
+            delete [] str_;
+            str_ = NULL;
+            size_ = 0;
+        }
+
+        if (rhs.size_ == 0 || rhs.str_ == NULL) {
+            str_ = NULL;
+            size_ = 0;
+        } 
+        else {
+            str_ = new char[rhs.size_ + 1];
+            size_ = rhs.size_;
+            
+            if (str_ != NULL) {
+                memcpy(str_, rhs.str_, (size_ + 1) * sizeof(char));
+            }
+            else {
+                size_ = 0;
+            }
+        }
+
+        return *this;
+    }
+
+    /*! \brief Constructs a string by copying the value of another instance.
+     *
+     *  \param rhs the string to copy.
+     */
+    string(const string& rhs) :
+        size_(0),
+        str_(NULL)
+    {
+        *this = rhs;
+    }
+
+    //! \brief Destructor - frees memory used to hold the current value.
+    ~string()
+    {
+        delete[] str_;
+        str_ = NULL;
+    }
+    
+    //! \brief Queries the length of the string, excluding any added '\0's.
+    ::size_t size(void) const   { return size_; }
+
+    //! \brief Queries the length of the string, excluding any added '\0's.
+    ::size_t length(void) const { return size(); }
+
+    /*! \brief Returns a pointer to the private copy held by this instance,
+     *  or "" if empty/unset.
+     */
+    const char * c_str(void) const { return (str_) ? str_ : "";}
+};
+typedef cl::string STRING_CLASS;
+#endif // #elif !defined(__USE_DEV_STRING) 
+
+#if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
+#define VECTOR_CLASS std::vector
+#elif !defined(__USE_DEV_VECTOR) 
+#define VECTOR_CLASS cl::vector 
+
+#if !defined(__MAX_DEFAULT_VECTOR_SIZE)
+#define __MAX_DEFAULT_VECTOR_SIZE 10
+#endif
+
+/*! \class vector
+ * \brief Fixed-sized vector that mirrors a subset of std::vector
+ *  functionality and is compatible with std::vector.
+ *
+ *  \note Deprecated. Please use std::vector as default or
+ *  re-define the vector class to match the std::vector
+ *  interface by defining VECTOR_CLASS
+ *
+ *  \note Not recommended for use with custom objects as
+ *  the current implementation will construct N elements
+ *
+ *  \note
+ *  This differs from std::vector<> not just in memory allocation,
+ *  but also in terms of when members are constructed, destroyed,
+ *  and assigned instead of being copy constructed.
+ *
+ *  \param T type of element contained in the vector.
+ *
+ *  \param N maximum size of the vector.
+ */
+template <typename T, unsigned int N = __MAX_DEFAULT_VECTOR_SIZE>
+class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED vector CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+{
+private:
+    T data_[N];
+    unsigned int size_;
+
+public:
+    //! \brief Constructs an empty vector with no memory allocated.
+    vector() :  
+        size_(static_cast<unsigned int>(0))
+    {}
+
+    //! \brief Deallocates the vector's memory and destroys all of its elements.
+    ~vector() 
+    {
+        clear();
+    }
+
+    //! \brief Returns the number of elements currently contained.
+    unsigned int size(void) const
+    {
+        return size_;
+    }
+    
+    /*! \brief Empties the vector of all elements.
+     *  \note
+     *  This does not deallocate memory but will invoke destructors
+     *  on contained elements.
+     */
+    void clear()
+    {
+        while(!empty()) {
+            pop_back();
+        }
+    }
+
+    /*! \brief Appends an element after the last valid element.
+     * Calling this on a vector that has reached capacity will throw an 
+     * exception if exceptions are enabled.
+     */
+    void push_back (const T& x)
+    { 
+        if (size() < N) {    
+            new (&data_[size_]) T(x);
+            size_++;
+        } else {
+            detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
+        }
+    }
+
+    /*! \brief Removes the last valid element from the vector.
+     * Calling this on an empty vector will throw an exception
+     * if exceptions are enabled.
+     */
+    void pop_back(void)
+    {
+        if (size_ != 0) {
+            --size_;
+            data_[size_].~T();
+        } else {
+            detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
+        }
+    }
+  
+    /*! \brief Constructs with a value copied from another.
+     *
+     *  \param vec the vector to copy.
+     */
+    vector(const vector<T, N>& vec) : 
+        size_(vec.size_)
+    {
+        if (size_ != 0) {	
+            assign(vec.begin(), vec.end());
+        }
+    } 
+
+    /*! \brief Constructs with a specified number of initial elements.
+     *
+     *  \param size number of initial elements.
+     *
+     *  \param val value of initial elements.
+     */
+    vector(unsigned int size, const T& val = T()) :
+        size_(0)
+    {
+        for (unsigned int i = 0; i < size; i++) {
+            push_back(val);
+        }
+    }
+
+    /*! \brief Overwrites the current content with that copied from another
+     *         instance.
+     *
+     *  \param rhs vector to copy.
+     *
+     *  \returns a reference to this.
+     */
+    vector<T, N>& operator=(const vector<T, N>& rhs)
+    {
+        if (this == &rhs) {
+            return *this;
+        }
+
+        if (rhs.size_ != 0) {	
+            assign(rhs.begin(), rhs.end());
+        } else {
+            clear();
+        }
+    
+        return *this;
+    }
+
+    /*! \brief Tests equality against another instance.
+     *
+     *  \param vec the vector against which to compare.
+     */
+    bool operator==(vector<T,N> &vec)
+    {
+        if (size() != vec.size()) {
+            return false;
+        }
+
+        for( unsigned int i = 0; i < size(); ++i ) {
+            if( operator[](i) != vec[i] ) {
+                return false;
+            }
+        }
+        return true;
+    }
+  
+    //! \brief Conversion operator to T*.
+    operator T* ()             { return data_; }
+
+    //! \brief Conversion operator to const T*.
+    operator const T* () const { return data_; }
+   
+    //! \brief Tests whether this instance has any elements.
+    bool empty (void) const
+    {
+        return size_==0;
+    }
+  
+    //! \brief Returns the maximum number of elements this instance can hold.
+    unsigned int max_size (void) const
+    {
+        return N;
+    }
+
+    //! \brief Returns the maximum number of elements this instance can hold.
+    unsigned int capacity () const
+    {
+        return N;
+    }
+
+    /*! \brief Returns a reference to a given element.
+     *
+     *  \param index which element to access.
+     *
+     *  \note
+     *  The caller is responsible for ensuring index is >= 0 and < size().
+     */
+    T& operator[](int index)
+    {
+        return data_[index];
+    }
+  
+    /*! \brief Returns a const reference to a given element.
+     *
+     *  \param index which element to access.
+     *
+     *  \note
+     *  The caller is responsible for ensuring index is >= 0 and < size().
+     */
+    const T& operator[](int index) const
+    {
+        return data_[index];
+    }
+  
+    /*! \brief Assigns elements of the vector based on a source iterator range.
+     *
+     *  \param start Beginning iterator of source range
+     *  \param end End iterator of source range
+     *
+     *  \note
+     *  Will throw an exception if exceptions are enabled and size exceeded.
+     */
+    template<class I>
+    void assign(I start, I end)
+    {
+        clear();   
+        while(start != end) {
+            push_back(*start);
+            start++;
+        }
+    }
+
+    /*! \class iterator
+     * \brief Const iterator class for vectors
+     */
+    class iterator
+    {
+    private:
+        const vector<T,N> *vec_;
+        int index_;
+
+        /**
+         * Internal iterator constructor to capture reference
+         * to the vector it iterates over rather than taking 
+         * the vector by copy.
+         */
+        iterator (const vector<T,N> &vec, int index) :
+            vec_(&vec)
+        {            
+            if( !vec.empty() ) {
+                index_ = index;
+            } else {
+                index_ = -1;
+            }
+        }
+
+    public:
+        iterator(void) : 
+            vec_(NULL),
+            index_(-1)
+        {
+        }
+
+        iterator(const iterator& rhs) :
+            vec_(rhs.vec_),
+            index_(rhs.index_)
+        {
+        }
+
+        ~iterator(void) {}
+
+        static iterator begin(const cl::vector<T,N> &vec)
+        {
+            iterator i(vec, 0);
+
+            return i;
+        }
+
+        static iterator end(const cl::vector<T,N> &vec)
+        {
+            iterator i(vec, vec.size());
+
+            return i;
+        }
+    
+        bool operator==(iterator i)
+        {
+            return ((vec_ == i.vec_) && 
+                    (index_ == i.index_));
+        }
+
+        bool operator!=(iterator i)
+        {
+            return (!(*this==i));
+        }
+
+        iterator& operator++()
+        {
+            ++index_;
+            return *this;
+        }
+
+        iterator operator++(int)
+        {
+            iterator retVal(*this);
+            ++index_;
+            return retVal;
+        }
+
+        iterator& operator--()
+        {
+            --index_;
+            return *this;
+        }
+
+        iterator operator--(int)
+        {
+            iterator retVal(*this);
+            --index_;
+            return retVal;
+        }
+
+        const T& operator *() const
+        {
+            return (*vec_)[index_];
+        }
+    };
+
+    iterator begin(void)
+    {
+        return iterator::begin(*this);
+    }
+
+    iterator begin(void) const
+    {
+        return iterator::begin(*this);
+    }
+
+    iterator end(void)
+    {
+        return iterator::end(*this);
+    }
+
+    iterator end(void) const
+    {
+        return iterator::end(*this);
+    }
+
+    T& front(void)
+    {
+        return data_[0];
+    }
+
+    T& back(void)
+    {
+        return data_[size_-1];
+    }
+
+    const T& front(void) const
+    {
+        return data_[0];
+    }
+
+    const T& back(void) const
+    {
+        return data_[size_-1];
+    }
+};  
+#endif // #if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
+
+
+
+
+
+namespace detail {
+#define __DEFAULT_NOT_INITIALIZED 1 
+#define __DEFAULT_BEING_INITIALIZED 2
+#define __DEFAULT_INITIALIZED 4
+
+    /*
+     * Compare and exchange primitives are needed for handling of defaults
+    */
+    inline int compare_exchange(volatile int * dest, int exchange, int comparand)
+    {
+#ifdef _WIN32
+        return (int)(InterlockedCompareExchange(
+           (volatile long*)dest, 
+           (long)exchange, 
+           (long)comparand));
+#elif defined(__APPLE__) || defined(__MACOSX)
+        return OSAtomicOr32Orig((uint32_t)exchange, (volatile uint32_t*)dest);
+#else // !_WIN32 || defined(__APPLE__) || defined(__MACOSX)
+        return (__sync_val_compare_and_swap(
+            dest, 
+            comparand, 
+            exchange));
+#endif // !_WIN32
+    }
+
+    inline void fence() { _mm_mfence(); }
+}; // namespace detail
+
+    
+/*! \brief Class used to interface between C++ and
+ *  OpenCL C calls that require arrays of size_t values, whose
+ *  size is known statically.
+ */
+template <int N>
+class size_t
+{ 
+private:
+    ::size_t data_[N];
+
+public:
+    //! \brief Initialize size_t to all 0s
+    size_t()
+    {
+        for( int i = 0; i < N; ++i ) {
+            data_[i] = 0;
+        }
+    }
+
+    ::size_t& operator[](int index)
+    {
+        return data_[index];
+    }
+
+    const ::size_t& operator[](int index) const
+    {
+        return data_[index];
+    }
+
+    //! \brief Conversion operator to T*.
+    operator ::size_t* ()             { return data_; }
+
+    //! \brief Conversion operator to const T*.
+    operator const ::size_t* () const { return data_; }
+};
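+
+/* A minimal usage sketch, assuming an existing cl::CommandQueue queue, a
+ * cl::Image2D image of the given width and height, and a suitably sized
+ * host buffer hostPtr (all of these names are assumptions for illustration):
+ *
+ *   cl::size_t<3> origin;              // default-initialized to {0, 0, 0}
+ *   cl::size_t<3> region;
+ *   region[0] = width;
+ *   region[1] = height;
+ *   region[2] = 1;
+ *   queue.enqueueReadImage(image, CL_TRUE, origin, region, 0, 0, hostPtr);
+ */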
+
+namespace detail {
+
+// Generic getInfoHelper. The final parameter is used to guide overload
+// resolution: call sites pass an int argument, so a specialization that
+// declares this parameter as int is an exact match and is preferred,
+// while this generic version, which declares it as long, is only a
+// fallback reached through an int-to-long conversion.
+template<typename Functor, typename T>
+inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long)
+{
+    return f(name, sizeof(T), param, NULL);
+}
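+
+/* A minimal sketch of the int/long overload-ranking idiom used above,
+ * with the hypothetical names pick() and demo() (they are not part of
+ * these bindings): call sites pass the literal 0, an int, so an overload
+ * taking int is an exact match and wins whenever it is viable, while the
+ * generic overload taking long is only selected as a fallback.
+ *
+ *   #include <string>
+ *
+ *   template <typename T>
+ *   int pick(T*, long) { return 1; }           // generic fallback (int -> long)
+ *
+ *   int pick(std::string*, int) { return 2; }  // exact match on the int tag
+ *
+ *   int demo() {
+ *       std::string s;
+ *       double d = 0.0;
+ *       return pick(&s, 0) * 10 + pick(&d, 0); // 21: string picks 2, double picks 1
+ *   }
+ */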
+
+// Specialized getInfoHelper for VECTOR_CLASS params
+template <typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
+{
+    ::size_t required;
+    cl_int err = f(name, 0, NULL, &required);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    T* value = (T*) alloca(required);
+    err = f(name, required, value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    param->assign(&value[0], &value[required/sizeof(T)]);
+    return CL_SUCCESS;
+}
+
+/* Specialization for reference-counted types. This depends on the
+ * existence of Wrapper<T>::cl_type, and none of the other types having the
+ * cl_type member. Note that simply specifying the parameter as Wrapper<T>
+ * does not work, because when using a derived type (e.g. Context) the generic
+ * template will provide a better match.
+ */
+template <typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int, typename T::cl_type = 0)
+{
+    ::size_t required;
+    cl_int err = f(name, 0, NULL, &required);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    typename T::cl_type * value = (typename T::cl_type *) alloca(required);
+    err = f(name, required, value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    ::size_t elements = required / sizeof(typename T::cl_type);
+    param->assign(&value[0], &value[elements]);
+    for (::size_t i = 0; i < elements; i++)
+    {
+        if (value[i] != NULL)
+        {
+            err = (*param)[i].retain();
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+        }
+    }
+    return CL_SUCCESS;
+}
+
+// Specialized for getInfo<CL_PROGRAM_BINARIES>
+template <typename Func>
+inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<char *>* param, int)
+{
+    cl_int err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL);
+
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    return CL_SUCCESS;
+}
+
+// Specialized GetInfoHelper for STRING_CLASS params
+template <typename Func>
+inline cl_int getInfoHelper(Func f, cl_uint name, STRING_CLASS* param, long)
+{
+    ::size_t required;
+    cl_int err = f(name, 0, NULL, &required);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    char* value = (char*) alloca(required);
+    err = f(name, required, value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    *param = value;
+    return CL_SUCCESS;
+}
+
+// Specialized GetInfoHelper for cl::size_t params
+template <typename Func, ::size_t N>
+inline cl_int getInfoHelper(Func f, cl_uint name, size_t<N>* param, long)
+{
+    ::size_t required;
+    cl_int err = f(name, 0, NULL, &required);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    ::size_t* value = (::size_t*) alloca(required);
+    err = f(name, required, value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    for(int i = 0; i < N; ++i) {
+        (*param)[i] = value[i];
+    }
+
+    return CL_SUCCESS;
+}
+
+template<typename T> struct ReferenceHandler;
+
+/* Specialization for reference-counted types. This depends on the
+ * existence of Wrapper<T>::cl_type, and none of the other types having the
+ * cl_type member. Note that simply specifying the parameter as Wrapper<T>
+ * does not work, because when using a derived type (e.g. Context) the generic
+ * template will provide a better match.
+ */
+template<typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0)
+{
+    typename T::cl_type value;
+    cl_int err = f(name, sizeof(value), &value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+    *param = value;
+    if (value != NULL)
+    {
+        err = param->retain();
+        if (err != CL_SUCCESS) {
+            return err;
+        }
+    }
+    return CL_SUCCESS;
+}
+
+#define __PARAM_NAME_INFO_1_0(F) \
+    F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_VERSION, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_NAME, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_VENDOR, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_EXTENSIONS, STRING_CLASS) \
+    \
+    F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \
+    F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, ::size_t) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, VECTOR_CLASS< ::size_t>) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \
+    F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \
+    F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \
+    F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \
+    F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \
+    F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \
+    F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, ::size_t) \
+    F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \
+    F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) \
+    F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \
+    F(cl_device_info, CL_DEVICE_NAME, STRING_CLASS) \
+    F(cl_device_info, CL_DEVICE_VENDOR, STRING_CLASS) \
+    F(cl_device_info, CL_DRIVER_VERSION, STRING_CLASS) \
+    F(cl_device_info, CL_DEVICE_PROFILE, STRING_CLASS) \
+    F(cl_device_info, CL_DEVICE_VERSION, STRING_CLASS) \
+    F(cl_device_info, CL_DEVICE_EXTENSIONS, STRING_CLASS) \
+    \
+    F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \
+    F(cl_context_info, CL_CONTEXT_DEVICES, VECTOR_CLASS<Device>) \
+    F(cl_context_info, CL_CONTEXT_PROPERTIES, VECTOR_CLASS<cl_context_properties>) \
+    \
+    F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \
+    F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \
+    F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \
+    F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_int) \
+    \
+    F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \
+    F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \
+    F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \
+    F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \
+    \
+    F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \
+    F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \
+    F(cl_mem_info, CL_MEM_SIZE, ::size_t) \
+    F(cl_mem_info, CL_MEM_HOST_PTR, void*) \
+    F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \
+    F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \
+    F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \
+    \
+    F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \
+    F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, ::size_t) \
+    F(cl_image_info, CL_IMAGE_ROW_PITCH, ::size_t) \
+    F(cl_image_info, CL_IMAGE_SLICE_PITCH, ::size_t) \
+    F(cl_image_info, CL_IMAGE_WIDTH, ::size_t) \
+    F(cl_image_info, CL_IMAGE_HEIGHT, ::size_t) \
+    F(cl_image_info, CL_IMAGE_DEPTH, ::size_t) \
+    \
+    F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \
+    F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \
+    F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_bool) \
+    F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_addressing_mode) \
+    F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_filter_mode) \
+    \
+    F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \
+    F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \
+    F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \
+    F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS<Device>) \
+    F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \
+    F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \
+    F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS<char *>) \
+    \
+    F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \
+    F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, STRING_CLASS) \
+    F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, STRING_CLASS) \
+    \
+    F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, STRING_CLASS) \
+    F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \
+    F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \
+    F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \
+    F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \
+    \
+    F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, ::size_t) \
+    F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::size_t<3>) \
+    F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \
+    \
+    F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \
+    F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \
+    F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \
+    F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties)
+
+#if defined(CL_VERSION_1_1)
+#define __PARAM_NAME_INFO_1_1(F) \
+    F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \
+    F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \
+    F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \
+    F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \
+    F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, STRING_CLASS) \
+    \
+    F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \
+    F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \
+    \
+    F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, ::size_t) \
+    F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \
+    \
+    F(cl_event_info, CL_EVENT_CONTEXT, cl::Context)
+#endif // CL_VERSION_1_1
+
+    
+#if defined(CL_VERSION_1_2)
+#define __PARAM_NAME_INFO_1_2(F) \
+    F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer) \
+    \
+    F(cl_program_info, CL_PROGRAM_NUM_KERNELS, ::size_t) \
+    F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, STRING_CLASS) \
+    \
+    F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \
+    \
+    F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, STRING_CLASS) \
+    \
+    F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \
+    F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \
+    F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, STRING_CLASS) \
+    F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, STRING_CLASS) \
+    \
+    F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl_device_id) \
+    F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, VECTOR_CLASS<cl_device_partition_property>) \
+    F(cl_device_info, CL_DEVICE_PARTITION_TYPE, VECTOR_CLASS<cl_device_partition_property>)  \
+    F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, cl_bool) \
+    F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \
+    F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, STRING_CLASS)
+#endif // #if defined(CL_VERSION_1_2)
+
+#if defined(USE_CL_DEVICE_FISSION)
+#define __PARAM_NAME_DEVICE_FISSION(F) \
+    F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \
+    F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
+    F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
+    F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \
+    F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS<cl_device_partition_property_ext>)
+#endif // USE_CL_DEVICE_FISSION
+
+template <typename enum_type, cl_int Name>
+struct param_traits {};
+
+#define __CL_DECLARE_PARAM_TRAITS(token, param_name, T) \
+struct token;                                        \
+template<>                                           \
+struct param_traits<detail:: token,param_name>       \
+{                                                    \
+    enum { value = param_name };                     \
+    typedef T param_type;                            \
+};
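+/* Illustrative note (added commentary, not upstream cl.hpp text): feeding
+ * __CL_DECLARE_PARAM_TRAITS into one of the __PARAM_NAME_INFO_* tables above
+ * produces one param_traits specialization per (enum type, token, value type)
+ * row.  The CL_DEVICE_NAME row, for instance, expands roughly to:
+ *
+ * \code
+ * struct cl_device_info;
+ * template<>
+ * struct param_traits<detail::cl_device_info, CL_DEVICE_NAME>
+ * {
+ *     enum { value = CL_DEVICE_NAME };
+ *     typedef STRING_CLASS param_type;
+ * };
+ * \endcode
+ *
+ * which is how Device::getInfo<CL_DEVICE_NAME>() deduces its return type.
+ */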
+
+__PARAM_NAME_INFO_1_0(__CL_DECLARE_PARAM_TRAITS)
+#if defined(CL_VERSION_1_1)
+__PARAM_NAME_INFO_1_1(__CL_DECLARE_PARAM_TRAITS)
+#endif // CL_VERSION_1_1
+#if defined(CL_VERSION_1_2)
+__PARAM_NAME_INFO_1_2(__CL_DECLARE_PARAM_TRAITS)
+#endif // CL_VERSION_1_2
+
+#if defined(USE_CL_DEVICE_FISSION)
+__PARAM_NAME_DEVICE_FISSION(__CL_DECLARE_PARAM_TRAITS);
+#endif // USE_CL_DEVICE_FISSION
+
+#ifdef CL_PLATFORM_ICD_SUFFIX_KHR
+__CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_CLASS)
+#endif
+
+#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong)
+#endif
+
+#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, VECTOR_CLASS< ::size_t>)
+#endif
+#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_SIMD_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint)
+#endif
+
+#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_WARP_SIZE_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_GPU_OVERLAP_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool)
+#endif
+#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, cl_bool)
+#endif
+#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool)
+#endif
+
+// Convenience functions
+
+template <typename Func, typename T>
+inline cl_int
+getInfo(Func f, cl_uint name, T* param)
+{
+    return getInfoHelper(f, name, param, 0);
+}
+
+template <typename Func, typename Arg0>
+struct GetInfoFunctor0
+{
+    Func f_; const Arg0& arg0_;
+    cl_int operator ()(
+        cl_uint param, ::size_t size, void* value, ::size_t* size_ret)
+    { return f_(arg0_, param, size, value, size_ret); }
+};
+
+template <typename Func, typename Arg0, typename Arg1>
+struct GetInfoFunctor1
+{
+    Func f_; const Arg0& arg0_; const Arg1& arg1_;
+    cl_int operator ()(
+        cl_uint param, ::size_t size, void* value, ::size_t* size_ret)
+    { return f_(arg0_, arg1_, param, size, value, size_ret); }
+};
+
+template <typename Func, typename Arg0, typename T>
+inline cl_int
+getInfo(Func f, const Arg0& arg0, cl_uint name, T* param)
+{
+    GetInfoFunctor0<Func, Arg0> f0 = { f, arg0 };
+    return getInfoHelper(f0, name, param, 0);
+}
+
+template <typename Func, typename Arg0, typename Arg1, typename T>
+inline cl_int
+getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param)
+{
+    GetInfoFunctor1<Func, Arg0, Arg1> f0 = { f, arg0, arg1 };
+    return getInfoHelper(f0, name, param, 0);
+}
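+/* Illustrative note (added commentary, not upstream cl.hpp text): the functor
+ * wrappers bind the object handle as the leading argument so that
+ * getInfoHelper can keep calling f(name, size, value, size_ret).  Device::getInfo()
+ * further below, for example, boils down to something like:
+ *
+ * \code
+ * cl_device_id dev;      // assumed to be a valid device handle
+ * STRING_CLASS name;
+ * detail::getInfo(&::clGetDeviceInfo, dev, CL_DEVICE_NAME, &name);
+ * // -> GetInfoFunctor0 routes to clGetDeviceInfo(dev, CL_DEVICE_NAME, ...)
+ * \endcode
+ */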
+
+template<typename T>
+struct ReferenceHandler
+{ };
+
+#if defined(CL_VERSION_1_2)
+/**
+ * OpenCL 1.2 devices do have retain/release.
+ */
+template <>
+struct ReferenceHandler<cl_device_id>
+{
+    /**
+     * Retain the device.
+     * \param device A valid device created using createSubDevices
+     * \return 
+     *   CL_SUCCESS if the function executed successfully.
+     *   CL_INVALID_DEVICE if device was not a valid subdevice
+     *   CL_OUT_OF_RESOURCES
+     *   CL_OUT_OF_HOST_MEMORY
+     */
+    static cl_int retain(cl_device_id device)
+    { return ::clRetainDevice(device); }
+    /**
+     * Release the device.
+     * \param device A valid device created using createSubDevices
+     * \return 
+     *   CL_SUCCESS if the function executed successfully.
+     *   CL_INVALID_DEVICE if device was not a valid subdevice
+     *   CL_OUT_OF_RESOURCES
+     *   CL_OUT_OF_HOST_MEMORY
+     */
+    static cl_int release(cl_device_id device)
+    { return ::clReleaseDevice(device); }
+};
+#else // #if defined(CL_VERSION_1_2)
+/**
+ * OpenCL 1.1 devices do not have retain/release.
+ */
+template <>
+struct ReferenceHandler<cl_device_id>
+{
+    // cl_device_id does not have retain().
+    static cl_int retain(cl_device_id)
+    { return CL_SUCCESS; }
+    // cl_device_id does not have release().
+    static cl_int release(cl_device_id)
+    { return CL_SUCCESS; }
+};
+#endif // #if defined(CL_VERSION_1_2)
+
+template <>
+struct ReferenceHandler<cl_platform_id>
+{
+    // cl_platform_id does not have retain().
+    static cl_int retain(cl_platform_id)
+    { return CL_SUCCESS; }
+    // cl_platform_id does not have release().
+    static cl_int release(cl_platform_id)
+    { return CL_SUCCESS; }
+};
+
+template <>
+struct ReferenceHandler<cl_context>
+{
+    static cl_int retain(cl_context context)
+    { return ::clRetainContext(context); }
+    static cl_int release(cl_context context)
+    { return ::clReleaseContext(context); }
+};
+
+template <>
+struct ReferenceHandler<cl_command_queue>
+{
+    static cl_int retain(cl_command_queue queue)
+    { return ::clRetainCommandQueue(queue); }
+    static cl_int release(cl_command_queue queue)
+    { return ::clReleaseCommandQueue(queue); }
+};
+
+template <>
+struct ReferenceHandler<cl_mem>
+{
+    static cl_int retain(cl_mem memory)
+    { return ::clRetainMemObject(memory); }
+    static cl_int release(cl_mem memory)
+    { return ::clReleaseMemObject(memory); }
+};
+
+template <>
+struct ReferenceHandler<cl_sampler>
+{
+    static cl_int retain(cl_sampler sampler)
+    { return ::clRetainSampler(sampler); }
+    static cl_int release(cl_sampler sampler)
+    { return ::clReleaseSampler(sampler); }
+};
+
+template <>
+struct ReferenceHandler<cl_program>
+{
+    static cl_int retain(cl_program program)
+    { return ::clRetainProgram(program); }
+    static cl_int release(cl_program program)
+    { return ::clReleaseProgram(program); }
+};
+
+template <>
+struct ReferenceHandler<cl_kernel>
+{
+    static cl_int retain(cl_kernel kernel)
+    { return ::clRetainKernel(kernel); }
+    static cl_int release(cl_kernel kernel)
+    { return ::clReleaseKernel(kernel); }
+};
+
+template <>
+struct ReferenceHandler<cl_event>
+{
+    static cl_int retain(cl_event event)
+    { return ::clRetainEvent(event); }
+    static cl_int release(cl_event event)
+    { return ::clReleaseEvent(event); }
+};
+
+
+// Extracts version number with major in the upper 16 bits, minor in the lower 16
+static cl_uint getVersion(const char *versionInfo)
+{
+    int highVersion = 0;
+    int lowVersion = 0;
+    int index = 7;
+    while(versionInfo[index] != '.' ) {
+        highVersion *= 10;
+        highVersion += versionInfo[index]-'0';
+        ++index;
+    }
+    ++index;
+    while(versionInfo[index] != ' ' ) {
+        lowVersion *= 10;
+        lowVersion += versionInfo[index]-'0';
+        ++index;
+    }
+    return (highVersion << 16) | lowVersion;
+}
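+/* Illustrative note (added commentary, not upstream cl.hpp text): getVersion()
+ * relies on the mandated "OpenCL <major>.<minor> " prefix of version strings,
+ * which is why parsing starts at index 7 (just past "OpenCL ").  For the
+ * string "OpenCL 1.2 beignet" it reads major = 1 and minor = 2 and returns
+ * (1 << 16) | 2 == 0x00010002.
+ */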
+
+static cl_uint getPlatformVersion(cl_platform_id platform)
+{
+    ::size_t size = 0;
+    clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size);
+    char *versionInfo = (char *) alloca(size);
+    clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, &versionInfo[0], &size);
+    return getVersion(versionInfo);
+}
+
+static cl_uint getDevicePlatformVersion(cl_device_id device)
+{
+    cl_platform_id platform;
+    clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL);
+    return getPlatformVersion(platform);
+}
+
+#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+static cl_uint getContextPlatformVersion(cl_context context)
+{
+    // The platform cannot be queried directly, so we first have to grab a
+    // device from the context and query that device's platform version
+    ::size_t size = 0;
+    clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size);
+    if (size == 0)
+        return 0;
+    cl_device_id *devices = (cl_device_id *) alloca(size);
+    clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices, NULL);
+    return getDevicePlatformVersion(devices[0]);
+}
+#endif // #if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+
+template <typename T>
+class Wrapper
+{
+public:
+    typedef T cl_type;
+
+protected:
+    cl_type object_;
+
+public:
+    Wrapper() : object_(NULL) { }
+
+    Wrapper(const cl_type &obj) : object_(obj) { }
+
+    ~Wrapper()
+    {
+        if (object_ != NULL) { release(); }
+    }
+
+    Wrapper(const Wrapper<cl_type>& rhs)
+    {
+        object_ = rhs.object_;
+        if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+    }
+
+    Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
+    {
+        if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+        object_ = rhs.object_;
+        if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+        return *this;
+    }
+
+    Wrapper<cl_type>& operator = (const cl_type &rhs)
+    {
+        if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+        object_ = rhs;
+        return *this;
+    }
+
+    cl_type operator ()() const { return object_; }
+
+    cl_type& operator ()() { return object_; }
+
+protected:
+    template<typename Func, typename U>
+    friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
+
+    cl_int retain() const
+    {
+        return ReferenceHandler<cl_type>::retain(object_);
+    }
+
+    cl_int release() const
+    {
+        return ReferenceHandler<cl_type>::release(object_);
+    }
+};
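+/* Illustrative note (added commentary, not upstream cl.hpp text): Wrapper is
+ * what gives the public classes below their shallow, reference-counted copy
+ * semantics.  Copying a cl::Context, for instance, retains the underlying
+ * cl_context via ReferenceHandler<cl_context>, and each destructor releases
+ * it again.
+ *
+ * \code
+ * {
+ *     cl::Context a = cl::Context::getDefault();
+ *     cl::Context b = a;  // clRetainContext(): both refer to the same cl_context
+ * }                       // two clReleaseContext() calls as a and b go out of scope
+ * \endcode
+ */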
+
+template <>
+class Wrapper<cl_device_id>
+{
+public:
+    typedef cl_device_id cl_type;
+
+protected:
+    cl_type object_;
+    bool referenceCountable_;
+
+    static bool isReferenceCountable(cl_device_id device)
+    {
+        bool retVal = false;
+        if (device != NULL) {
+            int version = getDevicePlatformVersion(device);
+            if(version > ((1 << 16) + 1)) {
+                retVal = true;
+            }
+        }
+        return retVal;
+    }
+
+public:
+    Wrapper() : object_(NULL), referenceCountable_(false) 
+    { 
+    }
+    
+    Wrapper(const cl_type &obj) : object_(obj), referenceCountable_(false) 
+    {
+        referenceCountable_ = isReferenceCountable(obj); 
+    }
+
+    ~Wrapper()
+    {
+        if (object_ != NULL) { release(); }
+    }
+    
+    Wrapper(const Wrapper<cl_type>& rhs)
+    {
+        object_ = rhs.object_;
+        referenceCountable_ = isReferenceCountable(object_); 
+        if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+    }
+
+    Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
+    {
+        if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+        object_ = rhs.object_;
+        referenceCountable_ = rhs.referenceCountable_;
+        if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+        return *this;
+    }
+
+    Wrapper<cl_type>& operator = (const cl_type &rhs)
+    {
+        if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+        object_ = rhs;
+        referenceCountable_ = isReferenceCountable(object_); 
+        return *this;
+    }
+
+    cl_type operator ()() const { return object_; }
+
+    cl_type& operator ()() { return object_; }
+
+protected:
+    template<typename Func, typename U>
+    friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
+
+    template<typename Func, typename U>
+    friend inline cl_int getInfoHelper(Func, cl_uint, VECTOR_CLASS<U>*, int, typename U::cl_type);
+
+    cl_int retain() const
+    {
+        if( referenceCountable_ ) {
+            return ReferenceHandler<cl_type>::retain(object_);
+        }
+        else {
+            return CL_SUCCESS;
+        }
+    }
+
+    cl_int release() const
+    {
+        if( referenceCountable_ ) {
+            return ReferenceHandler<cl_type>::release(object_);
+        }
+        else {
+            return CL_SUCCESS;
+        }
+    }
+};
+
+} // namespace detail
+//! \endcond
+
+/*! \struct ImageFormat
+ *  \brief Adds constructors and member functions for cl_image_format.
+ *
+ *  \see cl_image_format
+ */
+struct ImageFormat : public cl_image_format
+{
+    //! \brief Default constructor - performs no initialization.
+    ImageFormat(){}
+
+    //! \brief Initializing constructor.
+    ImageFormat(cl_channel_order order, cl_channel_type type)
+    {
+        image_channel_order = order;
+        image_channel_data_type = type;
+    }
+
+    //! \brief Assignment operator.
+    ImageFormat& operator = (const ImageFormat& rhs)
+    {
+        if (this != &rhs) {
+            this->image_channel_data_type = rhs.image_channel_data_type;
+            this->image_channel_order     = rhs.image_channel_order;
+        }
+        return *this;
+    }
+};
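+/* Usage sketch (added commentary, not upstream cl.hpp text): ImageFormat is a
+ * plain extension of cl_image_format, so it can be passed wherever a
+ * cl_image_format is expected.
+ *
+ * \code
+ * cl::ImageFormat fmt(CL_RGBA, CL_UNORM_INT8);  // 8-bit normalized RGBA
+ * // fmt.image_channel_order     == CL_RGBA
+ * // fmt.image_channel_data_type == CL_UNORM_INT8
+ * \endcode
+ */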
+
+/*! \brief Class interface for cl_device_id.
+ *
+ *  \note Copies of these objects are inexpensive, since they don't 'own'
+ *        any underlying resources or data structures.
+ *
+ *  \see cl_device_id
+ */
+class Device : public detail::Wrapper<cl_device_id>
+{
+public:
+    //! \brief Default constructor - initializes to NULL.
+    Device() : detail::Wrapper<cl_type>() { }
+
+    /*! \brief Copy constructor.
+     * 
+     *  This simply copies the device ID value, which is an inexpensive operation.
+     */
+    Device(const Device& device) : detail::Wrapper<cl_type>(device) { }
+
+    /*! \brief Constructor from cl_device_id.
+     * 
+     *  This simply copies the device ID value, which is an inexpensive operation.
+     */
+    Device(const cl_device_id &device) : detail::Wrapper<cl_type>(device) { }
+
+    /*! \brief Returns the first device on the default context.
+     *
+     *  \see Context::getDefault()
+     */
+    static Device getDefault(cl_int * err = NULL);
+
+    /*! \brief Assignment operator from Device.
+     * 
+     *  This simply copies the device ID value, which is an inexpensive operation.
+     */
+    Device& operator = (const Device& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment operator from cl_device_id.
+     * 
+     *  This simply copies the device ID value, which is an inexpensive operation.
+     */
+    Device& operator = (const cl_device_id& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetDeviceInfo().
+    template <typename T>
+    cl_int getInfo(cl_device_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetDeviceInfo, object_, name, param),
+            __GET_DEVICE_INFO_ERR);
+    }
+
+    //! \brief Wrapper for clGetDeviceInfo() that returns by value.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_device_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_device_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    /**
+     * CL 1.2 version
+     */
+#if defined(CL_VERSION_1_2)
+    //! \brief Wrapper for clCreateSubDevices().
+    cl_int createSubDevices(
+        const cl_device_partition_property * properties,
+        VECTOR_CLASS<Device>* devices)
+    {
+        cl_uint n = 0;
+        cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_SUB_DEVICES);
+        }
+
+        cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+        err = clCreateSubDevices(object_, properties, n, ids, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_SUB_DEVICES);
+        }
+
+        devices->assign(&ids[0], &ids[n]);
+        return CL_SUCCESS;
+    }
+#endif // #if defined(CL_VERSION_1_2)
+
+/**
+ * CL 1.1 version that uses device fission.
+ */
+#if defined(CL_VERSION_1_1)
+#if defined(USE_CL_DEVICE_FISSION)
+    cl_int createSubDevices(
+        const cl_device_partition_property_ext * properties,
+        VECTOR_CLASS<Device>* devices)
+    {
+        typedef CL_API_ENTRY cl_int 
+            ( CL_API_CALL * PFN_clCreateSubDevicesEXT)(
+                cl_device_id /*in_device*/,
+                const cl_device_partition_property_ext * /* properties */,
+                cl_uint /*num_entries*/,
+                cl_device_id * /*out_devices*/,
+                cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+        static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL;
+        __INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT);
+
+        cl_uint n = 0;
+        cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_SUB_DEVICES);
+        }
+
+        cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+        err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_SUB_DEVICES);
+        }
+
+        devices->assign(&ids[0], &ids[n]);
+        return CL_SUCCESS;
+    }
+#endif // #if defined(USE_CL_DEVICE_FISSION)
+#endif // #if defined(CL_VERSION_1_1)
+};
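+/* Usage sketch (added commentary, not upstream cl.hpp text): typical Device
+ * queries via the typed getInfo wrapper; `platform` is assumed to come from
+ * Platform::get(), declared further below.
+ *
+ * \code
+ * VECTOR_CLASS<cl::Device> devices;
+ * platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
+ * STRING_CLASS name = devices[0].getInfo<CL_DEVICE_NAME>();
+ * cl_uint units     = devices[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
+ * \endcode
+ */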
+
+/*! \brief Class interface for cl_platform_id.
+ *
+ *  \note Copies of these objects are inexpensive, since they don't 'own'
+ *        any underlying resources or data structures.
+ *
+ *  \see cl_platform_id
+ */
+class Platform : public detail::Wrapper<cl_platform_id>
+{
+public:
+    //! \brief Default constructor - initializes to NULL.
+    Platform() : detail::Wrapper<cl_type>()  { }
+
+    /*! \brief Copy constructor.
+     * 
+     *  This simply copies the platform ID value, which is an inexpensive operation.
+     */
+    Platform(const Platform& platform) : detail::Wrapper<cl_type>(platform) { }
+
+    /*! \brief Constructor from cl_platform_id.
+     * 
+     *  This simply copies the platform ID value, which is an inexpensive operation.
+     */
+    Platform(const cl_platform_id &platform) : detail::Wrapper<cl_type>(platform) { }
+
+    /*! \brief Assignment operator from Platform.
+     * 
+     *  This simply copies the platform ID value, which is an inexpensive operation.
+     */
+    Platform& operator = (const Platform& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment operator from cl_platform_id.
+     * 
+     *  This simply copies the platform ID value, which is an inexpensive operation.
+     */
+    Platform& operator = (const cl_platform_id& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetPlatformInfo().
+    cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetPlatformInfo, object_, name, param),
+            __GET_PLATFORM_INFO_ERR);
+    }
+
+    //! \brief Wrapper for clGetPlatformInfo() that returns by value.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_platform_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_platform_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    /*! \brief Gets a list of devices for this platform.
+     * 
+     *  Wraps clGetDeviceIDs().
+     */
+    cl_int getDevices(
+        cl_device_type type,
+        VECTOR_CLASS<Device>* devices) const
+    {
+        cl_uint n = 0;
+        if( devices == NULL ) {
+            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
+        }
+        cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+        }
+
+        cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+        err = ::clGetDeviceIDs(object_, type, n, ids, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+        }
+
+        devices->assign(&ids[0], &ids[n]);
+        return CL_SUCCESS;
+    }
+
+#if defined(USE_DX_INTEROP)
+   /*! \brief Get the list of available D3D10 devices.
+     *
+     *  \param d3d_device_source The source of the D3D10 devices (a D3D10
+     *  device or a DXGI adapter).
+     *
+     *  \param d3d_object A pointer to the D3D10 object named by
+     *  \a d3d_device_source.
+     *
+     *  \param d3d_device_set The set of devices to return (the preferred
+     *  devices for the object, or all associated devices).
+     *
+     *  \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device
+     *  values returned in devices can be used to identify a specific OpenCL
+     *  device. If the \a devices argument is NULL, this argument is ignored.
+     *
+     *  \return One of the following values:
+     *    - CL_SUCCESS if the function is executed successfully.
+     *
+     *  The application can query specific capabilities of the OpenCL device(s)
+     *  returned by getDevices() to determine which device(s) to use.
+     *
+     * \note If exceptions are enabled and a return value other than CL_SUCCESS
+     * is generated, a cl::Error exception is thrown.
+     */
+    cl_int getDevices(
+        cl_d3d10_device_source_khr d3d_device_source,
+        void *                     d3d_object,
+        cl_d3d10_device_set_khr    d3d_device_set,
+        VECTOR_CLASS<Device>* devices) const
+    {
+        typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)(
+            cl_platform_id platform, 
+            cl_d3d10_device_source_khr d3d_device_source, 
+            void * d3d_object,
+            cl_d3d10_device_set_khr d3d_device_set,
+            cl_uint num_entries,
+            cl_device_id * devices,
+            cl_uint* num_devices);
+
+        if( devices == NULL ) {
+            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
+        }
+
+        static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL;
+        __INIT_CL_EXT_FCN_PTR_PLATFORM(object_, clGetDeviceIDsFromD3D10KHR);
+
+        cl_uint n = 0;
+        cl_int err = pfn_clGetDeviceIDsFromD3D10KHR(
+            object_, 
+            d3d_device_source, 
+            d3d_object,
+            d3d_device_set, 
+            0, 
+            NULL, 
+            &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+        }
+
+        cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+        err = pfn_clGetDeviceIDsFromD3D10KHR(
+            object_, 
+            d3d_device_source, 
+            d3d_object,
+            d3d_device_set,
+            n, 
+            ids, 
+            NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+        }
+
+        devices->assign(&ids[0], &ids[n]);
+        return CL_SUCCESS;
+    }
+#endif
+
+    /*! \brief Gets a list of available platforms.
+     * 
+     *  Wraps clGetPlatformIDs().
+     */
+    static cl_int get(
+        VECTOR_CLASS<Platform>* platforms)
+    {
+        cl_uint n = 0;
+
+        if( platforms == NULL ) {
+            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);
+        }
+
+        cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+        }
+
+        cl_platform_id* ids = (cl_platform_id*) alloca(
+            n * sizeof(cl_platform_id));
+        err = ::clGetPlatformIDs(n, ids, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+        }
+
+        platforms->assign(&ids[0], &ids[n]);
+        return CL_SUCCESS;
+    }
+
+    /*! \brief Gets the first available platform.
+     * 
+     *  Wraps clGetPlatformIDs(), returning the first result.
+     */
+    static cl_int get(
+        Platform * platform)
+    {
+        cl_uint n = 0;
+
+        if( platform == NULL ) {
+            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);
+        }
+
+        cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+        }
+
+        cl_platform_id* ids = (cl_platform_id*) alloca(
+            n * sizeof(cl_platform_id));
+        err = ::clGetPlatformIDs(n, ids, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+        }
+
+        *platform = ids[0];
+        return CL_SUCCESS;
+    }
+
+    /*! \brief Gets the first available platform, returning it by value.
+     * 
+     *  Wraps clGetPlatformIDs(), returning the first result.
+     */
+    static Platform get(
+        cl_int * errResult = NULL)
+    {
+        Platform platform;
+        cl_uint n = 0;
+        cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+        if (err != CL_SUCCESS) {
+            detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+            if (errResult != NULL) {
+                *errResult = err;
+            }
+        }
+
+        cl_platform_id* ids = (cl_platform_id*) alloca(
+            n * sizeof(cl_platform_id));
+        err = ::clGetPlatformIDs(n, ids, NULL);
+
+        if (err != CL_SUCCESS) {
+            detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+        }
+
+        if (errResult != NULL) {
+            *errResult = err;
+        }
+        
+        return ids[0];
+    }
+
+    static Platform getDefault( 
+        cl_int *errResult = NULL )
+    {
+        return get(errResult);
+    }
+
+    
+#if defined(CL_VERSION_1_2)
+    //! \brief Wrapper for clUnloadPlatformCompiler().
+    cl_int
+    unloadCompiler()
+    {
+        return ::clUnloadPlatformCompiler(object_);
+    }
+#endif // #if defined(CL_VERSION_1_2)
+}; // class Platform
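+/* Usage sketch (added commentary, not upstream cl.hpp text): enumerating
+ * platforms and reading a string property.
+ *
+ * \code
+ * VECTOR_CLASS<cl::Platform> platforms;
+ * if (cl::Platform::get(&platforms) == CL_SUCCESS && !platforms.empty()) {
+ *     STRING_CLASS vendor = platforms[0].getInfo<CL_PLATFORM_VENDOR>();
+ * }
+ * \endcode
+ */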
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+/**
+ * Unload the OpenCL compiler.
+ * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead.
+ */
+inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int
+UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+inline cl_int
+UnloadCompiler()
+{
+    return ::clUnloadCompiler();
+}
+#endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS || (CL_VERSION_1_1 && !CL_VERSION_1_2)
+
+/*! \brief Class interface for cl_context.
+ *
+ *  \note Copies of these objects are shallow, meaning that the copy will refer
+ *        to the same underlying cl_context as the original.  For details, see
+ *        clRetainContext() and clReleaseContext().
+ *
+ *  \see cl_context
+ */
+class Context 
+    : public detail::Wrapper<cl_context>
+{
+private:
+    static volatile int default_initialized_;
+    static Context default_;
+    static volatile cl_int default_error_;
+public:
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseContext() on the value held by this instance.
+     */
+    ~Context() { }
+
+    /*! \brief Constructs a context including a list of specified devices.
+     *
+     *  Wraps clCreateContext().
+     */
+    Context(
+        const VECTOR_CLASS<Device>& devices,
+        cl_context_properties* properties = NULL,
+        void (CL_CALLBACK * notifyFptr)(
+            const char *,
+            const void *,
+            ::size_t,
+            void *) = NULL,
+        void* data = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        ::size_t numDevices = devices.size();
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+
+        object_ = ::clCreateContext(
+            properties, (cl_uint) numDevices,
+            deviceIDs,
+            notifyFptr, data, &error);
+
+        detail::errHandler(error, __CREATE_CONTEXT_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Context(
+        const Device& device,
+        cl_context_properties* properties = NULL,
+        void (CL_CALLBACK * notifyFptr)(
+            const char *,
+            const void *,
+            ::size_t,
+            void *) = NULL,
+        void* data = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        cl_device_id deviceID = device();
+
+        object_ = ::clCreateContext(
+            properties, 1,
+            &deviceID,
+            notifyFptr, data, &error);
+
+        detail::errHandler(error, __CREATE_CONTEXT_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /*! \brief Constructs a context including all or a subset of devices of a specified type.
+     *
+     *  Wraps clCreateContextFromType().
+     */
+    Context(
+        cl_device_type type,
+        cl_context_properties* properties = NULL,
+        void (CL_CALLBACK * notifyFptr)(
+            const char *,
+            const void *,
+            ::size_t,
+            void *) = NULL,
+        void* data = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+#if !defined(__APPLE__) || !defined(__MACOS)
+        cl_context_properties prop[4] = {CL_CONTEXT_PLATFORM, 0, 0, 0 };
+
+        if (properties == NULL) {
+            // Get a valid platform ID as we cannot send in a blank one
+            VECTOR_CLASS<Platform> platforms;
+            error = Platform::get(&platforms);
+            if (error != CL_SUCCESS) {
+                detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+                if (err != NULL) {
+                    *err = error;
+                }
+                return;
+            }
+
+            // Check the platforms we found for a device of our specified type
+            cl_context_properties platform_id = 0;
+            for (unsigned int i = 0; i < platforms.size(); i++) {
+
+                VECTOR_CLASS<Device> devices;
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+                try {
+#endif
+
+                    error = platforms[i].getDevices(type, &devices);
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+                } catch (Error) {}
+                // Swallow the exception if exceptions are enabled: we don't want
+                // to bail out just because the first platform has no devices of
+                // the requested type. Error checking follows below and can throw
+                // there if needed.
+#endif
+
+                // Only squash CL_SUCCESS and CL_DEVICE_NOT_FOUND
+                if (error != CL_SUCCESS && error != CL_DEVICE_NOT_FOUND) {
+                    detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+                    if (err != NULL) {
+                        *err = error;
+                    }
+                }
+
+                if (devices.size() > 0) {
+                    platform_id = (cl_context_properties)platforms[i]();
+                    break;
+                }
+            }
+
+            if (platform_id == 0) {
+                detail::errHandler(CL_DEVICE_NOT_FOUND, __CREATE_CONTEXT_FROM_TYPE_ERR);
+                if (err != NULL) {
+                    *err = CL_DEVICE_NOT_FOUND;
+                }
+                return;
+            }
+
+            prop[1] = platform_id;
+            properties = &prop[0];
+        }
+#endif
+        object_ = ::clCreateContextFromType(
+            properties, type, notifyFptr, data, &error);
+
+        detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT.
+     *
+     *  \note All calls to this function return the same cl_context as the first.
+     */
+    static Context getDefault(cl_int * err = NULL) 
+    {
+        int state = detail::compare_exchange(
+            &default_initialized_, 
+            __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED);
+        
+        if (state & __DEFAULT_INITIALIZED) {
+            if (err != NULL) {
+                *err = default_error_;
+            }
+            return default_;
+        }
+
+        if (state & __DEFAULT_BEING_INITIALIZED) {
+              // Assume writes will propagate eventually...
+              while(default_initialized_ != __DEFAULT_INITIALIZED) {
+                  detail::fence();
+              }
+
+            if (err != NULL) {
+                *err = default_error_;
+            }
+            return default_;
+        }
+
+        cl_int error;
+        default_ = Context(
+            CL_DEVICE_TYPE_DEFAULT,
+            NULL,
+            NULL,
+            NULL,
+            &error);
+
+        detail::fence();
+
+        default_error_ = error;
+        // Assume writes will propagate eventually...
+        default_initialized_ = __DEFAULT_INITIALIZED;
+
+        detail::fence();
+
+        if (err != NULL) {
+            *err = default_error_;
+        }
+        return default_;
+
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    Context() : detail::Wrapper<cl_type>() { }
+
+    /*! \brief Copy constructor.
+     * 
+     *  This calls clRetainContext() on the parameter's cl_context.
+     */
+    Context(const Context& context) : detail::Wrapper<cl_type>(context) { }
+
+    /*! \brief Constructor from cl_context - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the cl_context
+     *  into the new Context object.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Context(const cl_context& context) : detail::Wrapper<cl_type>(context) { }
+
+    /*! \brief Assignment operator from Context.
+     * 
+     *  This calls clRetainContext() on the parameter and clReleaseContext() on
+     *  the previous value held by this instance.
+     */
+    Context& operator = (const Context& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment operator from cl_context - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseContext() on the value previously held by this instance.
+     */
+    Context& operator = (const cl_context& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetContextInfo().
+    template <typename T>
+    cl_int getInfo(cl_context_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetContextInfo, object_, name, param),
+            __GET_CONTEXT_INFO_ERR);
+    }
+
+    //! \brief Wrapper for clGetContextInfo() that returns by value.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_context_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_context_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    /*! \brief Gets a list of supported image formats.
+     *  
+     *  Wraps clGetSupportedImageFormats().
+     */
+    cl_int getSupportedImageFormats(
+        cl_mem_flags flags,
+        cl_mem_object_type type,
+        VECTOR_CLASS<ImageFormat>* formats) const
+    {
+        cl_uint numEntries;
+        cl_int err = ::clGetSupportedImageFormats(
+           object_, 
+           flags,
+           type, 
+           0, 
+           NULL, 
+           &numEntries);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
+        }
+
+        ImageFormat* value = (ImageFormat*)
+            alloca(numEntries * sizeof(ImageFormat));
+        err = ::clGetSupportedImageFormats(
+            object_, 
+            flags, 
+            type, 
+            numEntries,
+            (cl_image_format*) value, 
+            NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
+        }
+
+        formats->assign(&value[0], &value[numEntries]);
+        return CL_SUCCESS;
+    }
+};
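+/* Usage sketch (added commentary, not upstream cl.hpp text): creating a
+ * context for all GPU devices of the first suitable platform and listing its
+ * devices.  Checking of the err out-parameter is kept minimal here.
+ *
+ * \code
+ * cl_int err;
+ * cl::Context context(CL_DEVICE_TYPE_GPU, NULL, NULL, NULL, &err);
+ * if (err == CL_SUCCESS) {
+ *     VECTOR_CLASS<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
+ * }
+ * \endcode
+ */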
+
+inline Device Device::getDefault(cl_int * err)
+{
+    cl_int error;
+    Device device;
+
+    Context context = Context::getDefault(&error);
+    detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+    if (error != CL_SUCCESS) {
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+    else {
+        device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+        if (err != NULL) {
+            *err = CL_SUCCESS;
+        }
+    }
+
+    return device;
+}
+
+
+#ifdef _WIN32
+__declspec(selectany) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__declspec(selectany) Context Context::default_;
+__declspec(selectany) volatile cl_int Context::default_error_ = CL_SUCCESS;
+#else
+__attribute__((weak)) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__attribute__((weak)) Context Context::default_;
+__attribute__((weak)) volatile cl_int Context::default_error_ = CL_SUCCESS;
+#endif
+
+/*! \brief Class interface for cl_event.
+ *
+ *  \note Copies of these objects are shallow, meaning that the copy will refer
+ *        to the same underlying cl_event as the original.  For details, see
+ *        clRetainEvent() and clReleaseEvent().
+ *
+ *  \see cl_event
+ */
+class Event : public detail::Wrapper<cl_event>
+{
+public:
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseEvent() on the value held by this instance.
+     */
+    ~Event() { }
+ 
+    //! \brief Default constructor - initializes to NULL.
+    Event() : detail::Wrapper<cl_type>() { }
+
+    /*! \brief Copy constructor.
+     * 
+     *  This calls clRetainEvent() on the parameter's cl_event.
+     */
+    Event(const Event& event) : detail::Wrapper<cl_type>(event) { }
+
+    /*! \brief Constructor from cl_event - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the cl_event
+     *  into the new Event object.
+     */
+    Event(const cl_event& event) : detail::Wrapper<cl_type>(event) { }
+
+    /*! \brief Assignment operator from Event.
+     *
+     *  This calls clRetainEvent() on the parameter's cl_event and
+     *  clReleaseEvent() on the value previously held by this instance.
+     */
+    Event& operator = (const Event& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment operator from cl_event - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseEvent() on the value previously held by this instance.
+     */
+    Event& operator = (const cl_event& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetEventInfo().
+    template <typename T>
+    cl_int getInfo(cl_event_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetEventInfo, object_, name, param),
+            __GET_EVENT_INFO_ERR);
+    }
+
+    //! \brief Wrapper for clGetEventInfo() that returns by value.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_event_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_event_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    //! \brief Wrapper for clGetEventProfilingInfo().
+    template <typename T>
+    cl_int getProfilingInfo(cl_profiling_info name, T* param) const
+    {
+        return detail::errHandler(detail::getInfo(
+            &::clGetEventProfilingInfo, object_, name, param),
+            __GET_EVENT_PROFILE_INFO_ERR);
+    }
+
+    //! \brief Wrapper for clGetEventProfilingInfo() that returns by value.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_profiling_info, name>::param_type
+    getProfilingInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_profiling_info, name>::param_type param;
+        cl_int result = getProfilingInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    /*! \brief Blocks the calling thread until this event completes.
+     * 
+     *  Wraps clWaitForEvents().
+     */
+    cl_int wait() const
+    {
+        return detail::errHandler(
+            ::clWaitForEvents(1, &object_),
+            __WAIT_FOR_EVENTS_ERR);
+    }
+
+#if defined(CL_VERSION_1_1)
+    /*! \brief Registers a user callback function for a specific command execution status.
+     *
+     *  Wraps clSetEventCallback().
+     */
+    cl_int setCallback(
+        cl_int type,
+        void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *),		
+        void * user_data = NULL)
+    {
+        return detail::errHandler(
+            ::clSetEventCallback(
+                object_,
+                type,
+                pfn_notify,
+                user_data), 
+            __SET_EVENT_CALLBACK_ERR);
+    }
+#endif
+
+    /*! \brief Blocks the calling thread until every event specified is complete.
+     * 
+     *  Wraps clWaitForEvents().
+     */
+    static cl_int
+    waitForEvents(const VECTOR_CLASS<Event>& events)
+    {
+        return detail::errHandler(
+            ::clWaitForEvents(
+                (cl_uint) events.size(), (cl_event*)&events.front()),
+            __WAIT_FOR_EVENTS_ERR);
+    }
+};
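+/* Usage sketch (added commentary, not upstream cl.hpp text): waiting on an
+ * event and reading its profiling timestamps.  This assumes the event was
+ * produced by a cl::CommandQueue (defined later in this header) created with
+ * CL_QUEUE_PROFILING_ENABLE.
+ *
+ * \code
+ * cl::Event ev;                 // filled in by an enqueue call
+ * ev.wait();
+ * cl_ulong start = ev.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+ * cl_ulong end   = ev.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+ * cl_ulong elapsed_ns = end - start;
+ * \endcode
+ */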
+
+#if defined(CL_VERSION_1_1)
+/*! \brief Class interface for user events (a subset of cl_event objects).
+ * 
+ *  See Event for details about copy semantics, etc.
+ */
+class UserEvent : public Event
+{
+public:
+    /*! \brief Constructs a user event on a given context.
+     *
+     *  Wraps clCreateUserEvent().
+     */
+    UserEvent(
+        const Context& context,
+        cl_int * err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateUserEvent(
+            context(),
+            &error);
+
+        detail::errHandler(error, __CREATE_USER_EVENT_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    UserEvent() : Event() { }
+
+    //! \brief Copy constructor - performs shallow copy.
+    UserEvent(const UserEvent& event) : Event(event) { }
+
+    //! \brief Assignment Operator - performs shallow copy.
+    UserEvent& operator = (const UserEvent& rhs)
+    {
+        if (this != &rhs) {
+            Event::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Sets the execution status of a user event object.
+     *
+     *  Wraps clSetUserEventStatus().
+     */
+    cl_int setStatus(cl_int status)
+    {
+        return detail::errHandler(
+            ::clSetUserEventStatus(object_,status), 
+            __SET_USER_EVENT_STATUS_ERR);
+    }
+};
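+/* Usage sketch (added commentary, not upstream cl.hpp text): a user event can
+ * gate enqueued commands until the host marks it complete; `context` is
+ * assumed to be a valid cl::Context.
+ *
+ * \code
+ * cl::UserEvent gate(context);
+ * // ... enqueue commands that list `gate` in their wait list ...
+ * gate.setStatus(CL_COMPLETE);  // lets the gated commands run
+ * \endcode
+ */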
+#endif
+
+/*! \brief Blocks the calling thread until every event specified is complete.
+ * 
+ *  Wraps clWaitForEvents().
+ */
+inline static cl_int
+WaitForEvents(const VECTOR_CLASS<Event>& events)
+{
+    return detail::errHandler(
+        ::clWaitForEvents(
+            (cl_uint) events.size(), (cl_event*)&events.front()),
+        __WAIT_FOR_EVENTS_ERR);
+}
+
+/*! \brief Class interface for cl_mem.
+ *
+ *  \note Copies of these objects are shallow, meaning that the copy will refer
+ *        to the same underlying cl_mem as the original.  For details, see
+ *        clRetainMemObject() and clReleaseMemObject().
+ *
+ *  \see cl_mem
+ */
+class Memory : public detail::Wrapper<cl_mem>
+{
+public:
+ 
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseMemObject() on the value held by this instance.
+     */
+    ~Memory() {}
+
+    //! \brief Default constructor - initializes to NULL.
+    Memory() : detail::Wrapper<cl_type>() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     * 
+     *  This calls clRetainMemObject() on the parameter's cl_mem.
+     */
+    Memory(const Memory& memory) : detail::Wrapper<cl_type>(memory) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the cl_mem
+     *  into the new Memory object.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Memory(const cl_mem& memory) : detail::Wrapper<cl_type>(memory) { }
+
+    /*! \brief Assignment operator from Memory.
+     * 
+     *  This calls clRetainMemObject() on the parameter and clReleaseMemObject()
+     *  on the previous value held by this instance.
+     */
+    Memory& operator = (const Memory& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment operator from cl_mem - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseMemObject() on the value previously held by this instance.
+     */
+    Memory& operator = (const cl_mem& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetMemObjectInfo().
+    template <typename T>
+    cl_int getInfo(cl_mem_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetMemObjectInfo, object_, name, param),
+            __GET_MEM_OBJECT_INFO_ERR);
+    }
+
+    //! \brief Wrapper for clGetMemObjectInfo() that returns by value.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_mem_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_mem_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+#if defined(CL_VERSION_1_1)
+    /*! \brief Registers a callback function to be called when the memory object
+     *         is no longer needed.
+     *
+     *  Wraps clSetMemObjectDestructorCallback().
+     *
+     *  Repeated calls to this function, for a given cl_mem value, will append
+     *  to the list of functions called (in reverse order) when the memory
+     *  object's resources are freed and the memory object is deleted.
+     *
+     *  \note
+     *  The registered callbacks are associated with the underlying cl_mem
+     *  value - not the Memory class instance.
+     */
+    cl_int setDestructorCallback(
+        void (CL_CALLBACK * pfn_notify)(cl_mem, void *),		
+        void * user_data = NULL)
+    {
+        return detail::errHandler(
+            ::clSetMemObjectDestructorCallback(
+                object_,
+                pfn_notify,
+                user_data), 
+            __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR);
+    }
+#endif
+
+};
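+
+/*! \par Usage sketch (illustrative)
+ *  Any Memory-derived object exposes the clGetMemObjectInfo() wrappers above.
+ *  A minimal sketch, assuming the default context is usable:
+ *  \code
+ *  cl::Buffer buf(CL_MEM_READ_WRITE, 1024);        // 1 KiB in the default context
+ *  ::size_t bytes = buf.getInfo<CL_MEM_SIZE>();    // query the allocated size
+ *  \endcode
+ */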
+
+// Pre-declare copy functions
+class Buffer;
+template< typename IteratorType >
+cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer );
+template< typename IteratorType >
+cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator );
+template< typename IteratorType >
+cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer );
+template< typename IteratorType >
+cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator );
+
+
+/*! \brief Class interface for Buffer Memory Objects.
+ * 
+ *  See Memory for details about copy semantics, etc.
+ *
+ *  \see Memory
+ */
+class Buffer : public Memory
+{
+public:
+
+    /*! \brief Constructs a Buffer in a specified context.
+     *
+     *  Wraps clCreateBuffer().
+     *
+     *  \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
+     *                  specified.  Note alignment & exclusivity requirements.
+     */
+    Buffer(
+        const Context& context,
+        cl_mem_flags flags,
+        ::size_t size,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
+
+        detail::errHandler(error, __CREATE_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /*! \brief Constructs a Buffer in the default context.
+     *
+     *  Wraps clCreateBuffer().
+     *
+     *  \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
+     *                  specified.  Note alignment & exclusivity requirements.
+     *
+     *  \see Context::getDefault()
+     */
+    Buffer(
+        cl_mem_flags flags,
+        ::size_t size,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        Context context = Context::getDefault(err);
+
+        object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
+
+        detail::errHandler(error, __CREATE_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /*!
+     * \brief Construct a Buffer from a host container via iterators.
+     * IteratorType must be random access.
+     * If useHostPtr is specified, the iterators must refer to contiguous host data.
+     */
+    template< typename IteratorType >
+    Buffer(
+        IteratorType startIterator,
+        IteratorType endIterator,
+        bool readOnly,
+        bool useHostPtr = false,
+        cl_int* err = NULL)
+    {
+        typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+        cl_int error;
+
+        cl_mem_flags flags = 0;
+        if( readOnly ) {
+            flags |= CL_MEM_READ_ONLY;
+        }
+        else {
+            flags |= CL_MEM_READ_WRITE;
+        }
+        if( useHostPtr ) {
+            flags |= CL_MEM_USE_HOST_PTR;
+        }
+        
+        ::size_t size = sizeof(DataType)*(endIterator - startIterator);
+
+        Context context = Context::getDefault(err);
+
+        if( useHostPtr ) {
+            object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
+        } else {
+            object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
+        }
+
+        detail::errHandler(error, __CREATE_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+
+        if( !useHostPtr ) {
+            error = cl::copy(startIterator, endIterator, *this);
+            detail::errHandler(error, __CREATE_BUFFER_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+    }
+
+    /*!
+     * \brief Construct a Buffer from a host container via iterators using a specified context.
+     * IteratorType must be random access.
+     * If useHostPtr is specified, the iterators must refer to contiguous host data.
+     */
+    template< typename IteratorType >
+    Buffer(const Context &context, IteratorType startIterator, IteratorType endIterator,
+        bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
+
+    //! \brief Default constructor - initializes to NULL.
+    Buffer() : Memory() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Buffer(const Buffer& buffer) : Memory(buffer) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Buffer(const cl_mem& buffer) : Memory(buffer) { }
+
+    /*! \brief Assignment from Buffer - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Buffer& operator = (const Buffer& rhs)
+    {
+        if (this != &rhs) {
+            Memory::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Buffer& operator = (const cl_mem& rhs)
+    {
+        Memory::operator=(rhs);
+        return *this;
+    }
+
+#if defined(CL_VERSION_1_1)
+    /*! \brief Creates a new buffer object from this.
+     *
+     *  Wraps clCreateSubBuffer().
+     */
+    Buffer createSubBuffer(
+        cl_mem_flags flags,
+        cl_buffer_create_type buffer_create_type,
+        const void * buffer_create_info,
+        cl_int * err = NULL)
+    {
+        Buffer result;
+        cl_int error;
+        result.object_ = ::clCreateSubBuffer(
+            object_, 
+            flags, 
+            buffer_create_type, 
+            buffer_create_info, 
+            &error);
+
+        detail::errHandler(error, __CREATE_SUBBUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+
+        return result;
+    }		
+#endif
+};
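+
+/*! \par Usage sketch (illustrative)
+ *  The iterator constructor above sizes a buffer from a host container in the
+ *  default context and, when useHostPtr is false, copies the data in via the
+ *  cl::copy() helpers declared above. A minimal sketch:
+ *  \code
+ *  std::vector<float> host(256, 1.0f);
+ *  cl_int err;
+ *  cl::Buffer buf(host.begin(), host.end(), true, false, &err);  // readOnly = true
+ *  \endcode
+ */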
+
+#if defined (USE_DX_INTEROP)
+/*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's.
+ *
+ *  This is provided to facilitate interoperability with Direct3D.
+ * 
+ *  See Memory for details about copy semantics, etc.
+ *
+ *  \see Memory
+ */
+class BufferD3D10 : public Buffer
+{
+public:
+    typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)(
+    cl_context context, cl_mem_flags flags, ID3D10Buffer*  buffer,
+    cl_int* errcode_ret);
+
+    /*! \brief Constructs a BufferD3D10, in a specified context, from a
+     *         given ID3D10Buffer.
+     *
+     *  Wraps clCreateFromD3D10BufferKHR().
+     */
+    BufferD3D10(
+        const Context& context,
+        cl_mem_flags flags,
+        ID3D10Buffer* bufobj,
+        cl_int * err = NULL)
+    {
+        static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL;
+
+#if defined(CL_VERSION_1_2)
+        // Locate the platform associated with this context so the extension
+        // function pointer can be queried per-platform (OpenCL 1.2).
+        VECTOR_CLASS<cl_context_properties> props = context.getInfo<CL_CONTEXT_PROPERTIES>();
+        cl_platform_id platform = NULL;
+        for( ::size_t i = 0; i < props.size(); ++i ) {
+            if( props[i] == CL_CONTEXT_PLATFORM ) {
+                platform = (cl_platform_id) props[i+1];
+            }
+        }
+        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clCreateFromD3D10BufferKHR);
+#endif
+#if defined(CL_VERSION_1_1)
+        __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR);
+#endif
+
+        cl_int error;
+        object_ = pfn_clCreateFromD3D10BufferKHR(
+            context(),
+            flags,
+            bufobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    BufferD3D10() : Buffer() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS BufferD3D10(const cl_mem& buffer) : Buffer(buffer) { }
+
+    /*! \brief Assignment from BufferD3D10 - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferD3D10& operator = (const BufferD3D10& rhs)
+    {
+        if (this != &rhs) {
+            Buffer::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferD3D10& operator = (const cl_mem& rhs)
+    {
+        Buffer::operator=(rhs);
+        return *this;
+    }
+};
+#endif
+
+/*! \brief Class interface for GL Buffer Memory Objects.
+ *
+ *  This is provided to facilitate interoperability with OpenGL.
+ * 
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
+ */
+class BufferGL : public Buffer
+{
+public:
+    /*! \brief Constructs a BufferGL in a specified context, from a given
+     *         GL buffer.
+     *
+     *  Wraps clCreateFromGLBuffer().
+     */
+    BufferGL(
+        const Context& context,
+        cl_mem_flags flags,
+        GLuint bufobj,
+        cl_int * err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateFromGLBuffer(
+            context(),
+            flags,
+            bufobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    BufferGL() : Buffer() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferGL(const BufferGL& buffer) : Buffer(buffer) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS BufferGL(const cl_mem& buffer) : Buffer(buffer) { }
+
+    /*! \brief Assignment from BufferGL - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferGL& operator = (const BufferGL& rhs)
+    {
+        if (this != &rhs) {
+            Buffer::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferGL& operator = (const cl_mem& rhs)
+    {
+        Buffer::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetGLObjectInfo().
+    cl_int getObjectInfo(
+        cl_gl_object_type *type,
+        GLuint * gl_object_name)
+    {
+        return detail::errHandler(
+            ::clGetGLObjectInfo(object_,type,gl_object_name),
+            __GET_GL_OBJECT_INFO_ERR);
+    }
+};
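+
+/*! \par Usage sketch (illustrative)
+ *  GL interop requires a context created with GL-sharing properties; the
+ *  names sharedContext and vbo below are assumed to exist in the application:
+ *  \code
+ *  cl_int err;
+ *  cl::BufferGL clVbo(sharedContext, CL_MEM_READ_WRITE, vbo, &err);
+ *  \endcode
+ *  The wrapped object typically still has to be acquired (and later released)
+ *  via the GL-sharing enqueue calls before kernels may use it.
+ */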
+
+/*! \brief Class interface for GL Render Buffer Memory Objects.
+ *
+ *  This is provided to facilitate interoperability with OpenGL.
+ * 
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
+ */
+class BufferRenderGL : public Buffer
+{
+public:
+    /*! \brief Constructs a BufferRenderGL in a specified context, from a given
+     *         GL Renderbuffer.
+     *
+     *  Wraps clCreateFromGLRenderbuffer().
+     */
+    BufferRenderGL(
+        const Context& context,
+        cl_mem_flags flags,
+        GLuint bufobj,
+        cl_int * err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateFromGLRenderbuffer(
+            context(),
+            flags,
+            bufobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    BufferRenderGL() : Buffer() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferRenderGL(const BufferRenderGL& buffer) : Buffer(buffer) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS BufferRenderGL(const cl_mem& buffer) : Buffer(buffer) { }
+
+    /*! \brief Assignment from BufferRenderGL - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferRenderGL& operator = (const BufferRenderGL& rhs)
+    {
+        if (this != &rhs) {
+            Buffer::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferRenderGL& operator = (const cl_mem& rhs)
+    {
+        Buffer::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetGLObjectInfo().
+    cl_int getObjectInfo(
+        cl_gl_object_type *type,
+        GLuint * gl_object_name)
+    {
+        return detail::errHandler(
+            ::clGetGLObjectInfo(object_,type,gl_object_name),
+            __GET_GL_OBJECT_INFO_ERR);
+    }
+};
+
+/*! \brief C++ base class for Image Memory objects.
+ *
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
+ */
+class Image : public Memory
+{
+protected:
+    //! \brief Default constructor - initializes to NULL.
+    Image() : Memory() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image(const Image& image) : Memory(image) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Image(const cl_mem& image) : Memory(image) { }
+
+    /*! \brief Assignment from Image - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image& operator = (const Image& rhs)
+    {
+        if (this != &rhs) {
+            Memory::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image& operator = (const cl_mem& rhs)
+    {
+        Memory::operator=(rhs);
+        return *this;
+    }
+
+public:
+    //! \brief Wrapper for clGetImageInfo().
+    template <typename T>
+    cl_int getImageInfo(cl_image_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetImageInfo, object_, name, param),
+            __GET_IMAGE_INFO_ERR);
+    }
+    
+    //! \brief Wrapper for clGetImageInfo() that returns by value.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_image_info, name>::param_type
+    getImageInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_image_info, name>::param_type param;
+        cl_int result = getImageInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+};
+
+#if defined(CL_VERSION_1_2)
+/*! \brief Class interface for 1D Image Memory objects.
+ *
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
+ */
+class Image1D : public Image
+{
+public:
+    /*! \brief Constructs a 1D Image in a specified context.
+     *
+     *  Wraps clCreateImage().
+     */
+    Image1D(
+        const Context& context,
+        cl_mem_flags flags,
+        ImageFormat format,
+        ::size_t width,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        cl_image_desc desc =
+        {
+            CL_MEM_OBJECT_IMAGE1D,
+            width,
+            0, 0, 0, 0, 0, 0, 0, 0
+        };
+        object_ = ::clCreateImage(
+            context(), 
+            flags, 
+            &format, 
+            &desc, 
+            host_ptr, 
+            &error);
+
+        detail::errHandler(error, __CREATE_IMAGE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    Image1D() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image1D(const Image1D& image1D) : Image(image1D) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Image1D(const cl_mem& image1D) : Image(image1D) { }
+
+    /*! \brief Assignment from Image1D - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image1D& operator = (const Image1D& rhs)
+    {
+        if (this != &rhs) {
+            Image::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image1D& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
+    }
+};
+
+/*! \class Image1DBuffer
+ * \brief Image interface for 1D buffer images.
+ */
+class Image1DBuffer : public Image
+{
+public:
+    Image1DBuffer(
+        const Context& context,
+        cl_mem_flags flags,
+        ImageFormat format,
+        ::size_t width,
+        const Buffer &buffer,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        cl_image_desc desc =
+        {
+            CL_MEM_OBJECT_IMAGE1D_BUFFER,
+            width,
+            0, 0, 0, 0, 0, 0, 0,
+            buffer()
+        };
+        object_ = ::clCreateImage(
+            context(), 
+            flags, 
+            &format, 
+            &desc, 
+            NULL, 
+            &error);
+
+        detail::errHandler(error, __CREATE_IMAGE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Image1DBuffer() { }
+
+    Image1DBuffer(const Image1DBuffer& image1D) : Image(image1D) { }
+
+    __CL_EXPLICIT_CONSTRUCTORS Image1DBuffer(const cl_mem& image1D) : Image(image1D) { }
+
+    Image1DBuffer& operator = (const Image1DBuffer& rhs)
+    {
+        if (this != &rhs) {
+            Image::operator=(rhs);
+        }
+        return *this;
+    }
+
+    Image1DBuffer& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
+    }
+};
+
+/*! \class Image1DArray
+ * \brief Image interface for arrays of 1D images.
+ */
+class Image1DArray : public Image
+{
+public:
+    Image1DArray(
+        const Context& context,
+        cl_mem_flags flags,
+        ImageFormat format,
+        ::size_t arraySize,
+        ::size_t width,
+        ::size_t rowPitch,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        cl_image_desc desc =
+        {
+            CL_MEM_OBJECT_IMAGE1D_ARRAY,
+            width,
+            0, 0,  // height, depth (unused)
+            arraySize,
+            rowPitch,
+            0, 0, 0, 0
+        };
+        object_ = ::clCreateImage(
+            context(), 
+            flags, 
+            &format, 
+            &desc, 
+            host_ptr, 
+            &error);
+
+        detail::errHandler(error, __CREATE_IMAGE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Image1DArray() { }
+
+    Image1DArray(const Image1DArray& imageArray) : Image(imageArray) { }
+
+    __CL_EXPLICIT_CONSTRUCTORS Image1DArray(const cl_mem& imageArray) : Image(imageArray) { }
+
+    Image1DArray& operator = (const Image1DArray& rhs)
+    {
+        if (this != &rhs) {
+            Image::operator=(rhs);
+        }
+        return *this;
+    }
+
+    Image1DArray& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
+    }
+};
+#endif // #if defined(CL_VERSION_1_2)
+
+
+/*! \brief Class interface for 2D Image Memory objects.
+ *
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
+ */
+class Image2D : public Image
+{
+public:
+    /*! \brief Constructs a 2D Image in a specified context.
+     *
+     *  Wraps clCreateImage().
+     */
+    Image2D(
+        const Context& context,
+        cl_mem_flags flags,
+        ImageFormat format,
+        ::size_t width,
+        ::size_t height,
+        ::size_t row_pitch = 0,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        bool useCreateImage;
+
+#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+        // Run-time decision based on the actual platform
+        {
+            cl_uint version = detail::getContextPlatformVersion(context());
+            useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
+        }
+#elif defined(CL_VERSION_1_2)
+        useCreateImage = true;
+#else
+        useCreateImage = false;
+#endif
+
+#if defined(CL_VERSION_1_2)
+        if (useCreateImage)
+        {
+            cl_image_desc desc =
+            {
+                CL_MEM_OBJECT_IMAGE2D,
+                width,
+                height,
+                0, 0, // depth, array size (unused)
+                row_pitch,
+                0, 0, 0, 0
+            };
+            object_ = ::clCreateImage(
+                context(),
+                flags,
+                &format,
+                &desc,
+                host_ptr,
+                &error);
+
+            detail::errHandler(error, __CREATE_IMAGE_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+#endif // #if defined(CL_VERSION_1_2)
+#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+        if (!useCreateImage)
+        {
+            object_ = ::clCreateImage2D(
+                context(), flags, &format, width, height, row_pitch, host_ptr, &error);
+
+            detail::errHandler(error, __CREATE_IMAGE2D_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    Image2D() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image2D(const Image2D& image2D) : Image(image2D) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Image2D(const cl_mem& image2D) : Image(image2D) { }
+
+    /*! \brief Assignment from Image2D - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image2D& operator = (const Image2D& rhs)
+    {
+        if (this != &rhs) {
+            Image::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image2D& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
+    }
+};
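+
+/*! \par Usage sketch (illustrative)
+ *  Creating a 2D RGBA float image and querying its dimensions; ctx is an
+ *  assumed valid cl::Context:
+ *  \code
+ *  cl::ImageFormat fmt(CL_RGBA, CL_FLOAT);
+ *  cl::Image2D img(ctx, CL_MEM_READ_ONLY, fmt, 640, 480);
+ *  ::size_t w = img.getImageInfo<CL_IMAGE_WIDTH>();
+ *  ::size_t h = img.getImageInfo<CL_IMAGE_HEIGHT>();
+ *  \endcode
+ */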
+
+
+#if !defined(CL_VERSION_1_2)
+/*! \brief Class interface for GL 2D Image Memory objects.
+ *
+ *  This is provided to facilitate interoperability with OpenGL.
+ * 
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
+ *  \note Deprecated for OpenCL 1.2. Please use ImageGL instead.
+ */
+class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED : public Image2D
+{
+public:
+    /*! \brief Constructs an Image2DGL in a specified context, from a given
+     *         GL Texture.
+     *
+     *  Wraps clCreateFromGLTexture2D().
+     */
+    Image2DGL(
+        const Context& context,
+        cl_mem_flags flags,
+        GLenum target,
+        GLint  miplevel,
+        GLuint texobj,
+        cl_int * err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateFromGLTexture2D(
+            context(),
+            flags,
+            target,
+            miplevel,
+            texobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+
+    }
+    
+    //! \brief Default constructor - initializes to NULL.
+    Image2DGL() : Image2D() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image2DGL(const Image2DGL& image) : Image2D(image) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Image2DGL(const cl_mem& image) : Image2D(image) { }
+
+    /*! \brief Assignment from Image2DGL - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image2DGL& operator = (const Image2DGL& rhs)
+    {
+        if (this != &rhs) {
+            Image2D::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image2DGL& operator = (const cl_mem& rhs)
+    {
+        Image2D::operator=(rhs);
+        return *this;
+    }
+};
+#endif // #if !defined(CL_VERSION_1_2)
+
+#if defined(CL_VERSION_1_2)
+/*! \class Image2DArray
+ * \brief Image interface for arrays of 2D images.
+ */
+class Image2DArray : public Image
+{
+public:
+    Image2DArray(
+        const Context& context,
+        cl_mem_flags flags,
+        ImageFormat format,
+        ::size_t arraySize,
+        ::size_t width,
+        ::size_t height,
+        ::size_t rowPitch,
+        ::size_t slicePitch,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        cl_image_desc desc =
+        {
+            CL_MEM_OBJECT_IMAGE2D_ARRAY,
+            width,
+            height,
+            0,       // depth (unused)
+            arraySize,
+            rowPitch,
+            slicePitch,
+            0, 0, 0
+        };
+        object_ = ::clCreateImage(
+            context(), 
+            flags, 
+            &format, 
+            &desc, 
+            host_ptr, 
+            &error);
+
+        detail::errHandler(error, __CREATE_IMAGE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Image2DArray() { }
+
+    Image2DArray(const Image2DArray& imageArray) : Image(imageArray) { }
+
+    __CL_EXPLICIT_CONSTRUCTORS Image2DArray(const cl_mem& imageArray) : Image(imageArray) { }
+
+    Image2DArray& operator = (const Image2DArray& rhs)
+    {
+        if (this != &rhs) {
+            Image::operator=(rhs);
+        }
+        return *this;
+    }
+
+    Image2DArray& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
+    }
+};
+#endif // #if defined(CL_VERSION_1_2)
+
+/*! \brief Class interface for 3D Image Memory objects.
+ *
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
+ */
+class Image3D : public Image
+{
+public:
+    /*! \brief Constructs a 3D Image in a specified context.
+     *
+     *  Wraps clCreateImage().
+     */
+    Image3D(
+        const Context& context,
+        cl_mem_flags flags,
+        ImageFormat format,
+        ::size_t width,
+        ::size_t height,
+        ::size_t depth,
+        ::size_t row_pitch = 0,
+        ::size_t slice_pitch = 0,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        bool useCreateImage;
+
+#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+        // Run-time decision based on the actual platform
+        {
+            cl_uint version = detail::getContextPlatformVersion(context());
+            useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
+        }
+#elif defined(CL_VERSION_1_2)
+        useCreateImage = true;
+#else
+        useCreateImage = false;
+#endif
+
+#if defined(CL_VERSION_1_2)
+        if (useCreateImage)
+        {
+            cl_image_desc desc =
+            {
+                CL_MEM_OBJECT_IMAGE3D,
+                width,
+                height,
+                depth,
+                0,      // array size (unused)
+                row_pitch,
+                slice_pitch,
+                0, 0, 0
+            };
+            object_ = ::clCreateImage(
+                context(), 
+                flags, 
+                &format, 
+                &desc, 
+                host_ptr, 
+                &error);
+
+            detail::errHandler(error, __CREATE_IMAGE_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+#endif  // #if defined(CL_VERSION_1_2)
+#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+        if (!useCreateImage)
+        {
+            object_ = ::clCreateImage3D(
+                context(), flags, &format, width, height, depth, row_pitch,
+                slice_pitch, host_ptr, &error);
+
+            detail::errHandler(error, __CREATE_IMAGE3D_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    Image3D() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image3D(const Image3D& image3D) : Image(image3D) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Image3D(const cl_mem& image3D) : Image(image3D) { }
+
+    /*! \brief Assignment from Image3D - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image3D& operator = (const Image3D& rhs)
+    {
+        if (this != &rhs) {
+            Image::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image3D& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
+    }
+};
+
+#if !defined(CL_VERSION_1_2)
+/*! \brief Class interface for GL 3D Image Memory objects.
+ *
+ *  This is provided to facilitate interoperability with OpenGL.
+ * 
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
+ */
+class Image3DGL : public Image3D
+{
+public:
+    /*! \brief Constructs an Image3DGL in a specified context, from a given
+     *         GL Texture.
+     *
+     *  Wraps clCreateFromGLTexture3D().
+     */
+    Image3DGL(
+        const Context& context,
+        cl_mem_flags flags,
+        GLenum target,
+        GLint  miplevel,
+        GLuint texobj,
+        cl_int * err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateFromGLTexture3D(
+            context(),
+            flags,
+            target,
+            miplevel,
+            texobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    Image3DGL() : Image3D() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image3DGL(const Image3DGL& image) : Image3D(image) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Image3DGL(const cl_mem& image) : Image3D(image) { }
+
+    /*! \brief Assignment from Image3DGL - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image3DGL& operator = (const Image3DGL& rhs)
+    {
+        if (this != &rhs) {
+            Image3D::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image3DGL& operator = (const cl_mem& rhs)
+    {
+        Image3D::operator=(rhs);
+        return *this;
+    }
+};
+#endif // #if !defined(CL_VERSION_1_2)
+
+#if defined(CL_VERSION_1_2)
+/*! \class ImageGL
+ * \brief General image interface for GL interop.
+ * The 2D and 3D GL image types are abstracted into this single class, which
+ * wraps any GL-sourced image, since clCreateFromGLTexture() handles the
+ * required setup for either case.
+ */
+class ImageGL : public Image
+{
+public:
+    ImageGL(
+        const Context& context,
+        cl_mem_flags flags,
+        GLenum target,
+        GLint  miplevel,
+        GLuint texobj,
+        cl_int * err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateFromGLTexture(
+            context(), 
+            flags, 
+            target,
+            miplevel,
+            texobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_TEXTURE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    ImageGL() : Image() { }
+
+    ImageGL(const ImageGL& image) : Image(image) { }
+
+    __CL_EXPLICIT_CONSTRUCTORS ImageGL(const cl_mem& image) : Image(image) { }
+
+    ImageGL& operator = (const ImageGL& rhs)
+    {
+        if (this != &rhs) {
+            Image::operator=(rhs);
+        }
+        return *this;
+    }
+
+    ImageGL& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
+    }
+};
+#endif // #if defined(CL_VERSION_1_2)
+
+/*! \brief Class interface for cl_sampler.
+ *
+ *  \note Copies of these objects are shallow, meaning that the copy will refer
+ *        to the same underlying cl_sampler as the original.  For details, see
+ *        clRetainSampler() and clReleaseSampler().
+ *
+ *  \see cl_sampler 
+ */
+class Sampler : public detail::Wrapper<cl_sampler>
+{
+public:
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseSampler() on the value held by this instance.
+     */
+    ~Sampler() { }
+
+    //! \brief Default constructor - initializes to NULL.
+    Sampler() { }
+
+    /*! \brief Constructs a Sampler in a specified context.
+     *
+     *  Wraps clCreateSampler().
+     */
+    Sampler(
+        const Context& context,
+        cl_bool normalized_coords,
+        cl_addressing_mode addressing_mode,
+        cl_filter_mode filter_mode,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateSampler(
+            context(), 
+            normalized_coords,
+            addressing_mode,
+            filter_mode,
+            &error);
+
+        detail::errHandler(error, __CREATE_SAMPLER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     * 
+     *  This calls clRetainSampler() on the parameter's cl_sampler.
+     */
+    Sampler(const Sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
+
+    /*! \brief Constructor from cl_sampler - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the cl_sampler
+     *  into the new Sampler object.
+     */
+    Sampler(const cl_sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
+
+    /*! \brief Assignment operator from Sampler.
+     * 
+     *  This calls clRetainSampler() on the parameter and clReleaseSampler()
+     *  on the previous value held by this instance.
+     */
+    Sampler& operator = (const Sampler& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment operator from cl_sampler - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseSampler() on the value previously held by this instance.
+     */
+    Sampler& operator = (const cl_sampler& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetSamplerInfo().
+    template <typename T>
+    cl_int getInfo(cl_sampler_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetSamplerInfo, object_, name, param),
+            __GET_SAMPLER_INFO_ERR);
+    }
+
+    //! \brief Wrapper for clGetSamplerInfo() that returns by value.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_sampler_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_sampler_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+};
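+
+/*! \par Usage sketch (illustrative)
+ *  A clamped, linear sampler with normalized coordinates; ctx and kernel are
+ *  assumed to be a valid cl::Context and cl::Kernel:
+ *  \code
+ *  cl::Sampler smp(ctx, CL_TRUE, CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_LINEAR);
+ *  kernel.setArg(2, smp);    // samplers are set like any other kernel argument
+ *  \endcode
+ */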
+
+class Program;
+class CommandQueue;
+class Kernel;
+
+//! \brief Class interface for specifying NDRange values.
+class NDRange
+{
+private:
+    size_t<3> sizes_;
+    cl_uint dimensions_;
+
+public:
+    //! \brief Default constructor - resulting range has zero dimensions.
+    NDRange()
+        : dimensions_(0)
+    { }
+
+    //! \brief Constructs one-dimensional range.
+    NDRange(::size_t size0)
+        : dimensions_(1)
+    {
+        sizes_[0] = size0;
+    }
+
+    //! \brief Constructs two-dimensional range.
+    NDRange(::size_t size0, ::size_t size1)
+        : dimensions_(2)
+    {
+        sizes_[0] = size0;
+        sizes_[1] = size1;
+    }
+
+    //! \brief Constructs three-dimensional range.
+    NDRange(::size_t size0, ::size_t size1, ::size_t size2)
+        : dimensions_(3)
+    {
+        sizes_[0] = size0;
+        sizes_[1] = size1;
+        sizes_[2] = size2;
+    }
+
+    /*! \brief Conversion operator to const ::size_t *.
+     *  
+     *  \returns a pointer to the size of the first dimension.
+     */
+    operator const ::size_t*() const { 
+        return (const ::size_t*) sizes_; 
+    }
+
+    //! \brief Queries the number of dimensions in the range.
+    ::size_t dimensions() const { return dimensions_; }
+};
+
+//! \brief A zero-dimensional range.
+static const NDRange NullRange;
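+
+/*! \par Usage sketch (illustrative)
+ *  NDRange values describe the global and local work sizes passed to the
+ *  enqueue wrappers defined later in this header; queue and kernel below are
+ *  assumed to exist:
+ *  \code
+ *  cl::NDRange global(1024, 768);   // 2D global work size
+ *  cl::NDRange local(16, 16);       // 2D work-group size
+ *  queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, local);
+ *  \endcode
+ */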
+
+//! \brief Local address wrapper for use with Kernel::setArg
+struct LocalSpaceArg
+{
+    ::size_t size_;
+};
+
+namespace detail {
+
+template <typename T>
+struct KernelArgumentHandler
+{
+    static ::size_t size(const T&) { return sizeof(T); }
+    static T* ptr(T& value) { return &value; }
+};
+
+template <>
+struct KernelArgumentHandler<LocalSpaceArg>
+{
+    static ::size_t size(const LocalSpaceArg& value) { return value.size_; }
+    static void* ptr(LocalSpaceArg&) { return NULL; }
+};
+
+} 
+//! \endcond
+
+/*! __local
+ * \brief Helper function for generating LocalSpaceArg objects.
+ * \deprecated Use Local() instead.
+ */
+inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED LocalSpaceArg
+__local(::size_t size) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+inline LocalSpaceArg
+__local(::size_t size)
+{
+    LocalSpaceArg ret = { size };
+    return ret;
+}
+
+/*! Local
+ * \brief Helper function for generating LocalSpaceArg objects.
+ */
+inline LocalSpaceArg
+Local(::size_t size)
+{
+    LocalSpaceArg ret = { size };
+    return ret;
+}
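+
+/*! \par Usage sketch (illustrative)
+ *  Local() reserves per-work-group __local storage for a kernel argument via
+ *  the LocalSpaceArg specialization above; kernel is an assumed cl::Kernel:
+ *  \code
+ *  kernel.setArg(3, cl::Local(64 * sizeof(float)));  // 64 floats of local memory
+ *  \endcode
+ */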
+
+//class KernelFunctor;
+
+/*! \brief Class interface for cl_kernel.
+ *
+ *  \note Copies of these objects are shallow, meaning that the copy will refer
+ *        to the same underlying cl_kernel as the original.  For details, see
+ *        clRetainKernel() and clReleaseKernel().
+ *
+ *  \see cl_kernel
+ */
+class Kernel : public detail::Wrapper<cl_kernel>
+{
+public:
+    inline Kernel(const Program& program, const char* name, cl_int* err = NULL);
+
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseKernel() on the value held by this instance.
+     */
+    ~Kernel() { }
+
+    //! \brief Default constructor - initializes to NULL.
+    Kernel() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     * 
+     *  This calls clRetainKernel() on the parameter's cl_kernel.
+     */
+    Kernel(const Kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
+
+    /*! \brief Constructor from cl_kernel - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the cl_kernel
+     *  into the new Kernel object.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Kernel(const cl_kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
+
+    /*! \brief Assignment operator from Kernel.
+     * 
+     *  This calls clRetainKernel() on the parameter and clReleaseKernel()
+     *  on the previous value held by this instance.
+     */
+    Kernel& operator = (const Kernel& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment operator from cl_kernel - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseKernel() on the value previously held by this instance.
+     */
+    Kernel& operator = (const cl_kernel& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    template <typename T>
+    cl_int getInfo(cl_kernel_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetKernelInfo, object_, name, param),
+            __GET_KERNEL_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_kernel_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_kernel_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+#if defined(CL_VERSION_1_2)
+    template <typename T>
+    cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param),
+            __GET_KERNEL_ARG_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_kernel_arg_info, name>::param_type
+    getArgInfo(cl_uint argIndex, cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_kernel_arg_info, name>::param_type param;
+        cl_int result = getArgInfo(argIndex, name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+#endif // #if defined(CL_VERSION_1_2)
+
+    template <typename T>
+    cl_int getWorkGroupInfo(
+        const Device& device, cl_kernel_work_group_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(
+                &::clGetKernelWorkGroupInfo, object_, device(), name, param),
+                __GET_KERNEL_WORK_GROUP_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_kernel_work_group_info, name>::param_type
+        getWorkGroupInfo(const Device& device, cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+        detail::cl_kernel_work_group_info, name>::param_type param;
+        cl_int result = getWorkGroupInfo(device, name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    template <typename T>
+    cl_int setArg(cl_uint index, T value)
+    {
+        return detail::errHandler(
+            ::clSetKernelArg(
+                object_,
+                index,
+                detail::KernelArgumentHandler<T>::size(value),
+                detail::KernelArgumentHandler<T>::ptr(value)),
+            __SET_KERNEL_ARGS_ERR);
+    }
+
+    cl_int setArg(cl_uint index, ::size_t size, void* argPtr)
+    {
+        return detail::errHandler(
+            ::clSetKernelArg(object_, index, size, argPtr),
+            __SET_KERNEL_ARGS_ERR);
+    }
+};
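+
+/*! \par Usage sketch (illustrative)
+ *  Creating a kernel from a built program and setting its arguments; prog,
+ *  bufA, bufB, n and device are assumed to exist:
+ *  \code
+ *  cl_int err;
+ *  cl::Kernel k(prog, "vadd", &err);
+ *  k.setArg(0, bufA);                 // cl::Buffer arguments are passed by value
+ *  k.setArg(1, bufB);
+ *  k.setArg(2, (cl_int)n);
+ *  ::size_t wg = k.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(device);
+ *  \endcode
+ */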
+
+/*! \class Program
+ * \brief Program interface that implements cl_program.
+ */
+class Program : public detail::Wrapper<cl_program>
+{
+public:
+    typedef VECTOR_CLASS<std::pair<const void*, ::size_t> > Binaries;
+    typedef VECTOR_CLASS<std::pair<const char*, ::size_t> > Sources;
+
+    Program(
+        const STRING_CLASS& source,
+        bool build = false,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const char * strings = source.c_str();
+        const ::size_t length  = source.size();
+
+        Context context = Context::getDefault(err);
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)1, &strings, &length, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+        if (error == CL_SUCCESS && build) {
+
+            error = ::clBuildProgram(
+                object_,
+                0,
+                NULL,
+                "",
+                NULL,
+                NULL);
+
+            detail::errHandler(error, __BUILD_PROGRAM_ERR);
+        }
+
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Program(
+        const Context& context,
+        const STRING_CLASS& source,
+        bool build = false,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const char * strings = source.c_str();
+        const ::size_t length  = source.size();
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)1, &strings, &length, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+        if (error == CL_SUCCESS && build) {
+
+            error = ::clBuildProgram(
+                object_,
+                0,
+                NULL,
+                "",
+                NULL,
+                NULL);
+
+            detail::errHandler(error, __BUILD_PROGRAM_ERR);
+        }
+
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Program(
+        const Context& context,
+        const Sources& sources,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const ::size_t n = (::size_t)sources.size();
+        ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t));
+        const char** strings = (const char**) alloca(n * sizeof(const char*));
+
+        for (::size_t i = 0; i < n; ++i) {
+            strings[i] = sources[(int)i].first;
+            lengths[i] = sources[(int)i].second;
+        }
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)n, strings, lengths, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /**
+     * Construct a program object from a list of devices and a per-device list of binaries.
+     * \param context A valid OpenCL context in which to construct the program.
+     * \param devices A vector of OpenCL device objects for which the program will be created.
+     * \param binaries A vector of pairs of a pointer to a binary object and its length.
+     * \param binaryStatus An optional vector that on completion will be resized to
+     *   match the size of binaries and filled with values to specify if each binary
+     *   was successfully loaded.
+     *   Set to CL_SUCCESS if the binary was successfully loaded.
+     *   Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL.
+     *   Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device.
+     * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors:
+     *   CL_INVALID_CONTEXT if context is not a valid context.
+     *   CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices; 
+     *     or if any entry in binaries is NULL or has length 0.
+     *   CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context.
+     *   CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device.
+     *   CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
+     */
+    Program(
+        const Context& context,
+        const VECTOR_CLASS<Device>& devices,
+        const Binaries& binaries,
+        VECTOR_CLASS<cl_int>* binaryStatus = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        
+        const ::size_t numDevices = devices.size();
+        
+        // Catch size mismatch early and return
+        if(binaries.size() != numDevices) {
+            error = CL_INVALID_VALUE;
+            detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+            return;
+        }
+
+        ::size_t* lengths = (::size_t*) alloca(numDevices * sizeof(::size_t));
+        const unsigned char** images = (const unsigned char**) alloca(numDevices * sizeof(const unsigned char*));
+
+        for (::size_t i = 0; i < numDevices; ++i) {
+            images[i] = (const unsigned char*)binaries[i].first;
+            lengths[i] = binaries[(int)i].second;
+        }
+
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+
+        if(binaryStatus) {
+            binaryStatus->resize(numDevices);
+        }
+        
+        object_ = ::clCreateProgramWithBinary(
+            context(), (cl_uint) devices.size(),
+            deviceIDs,
+            lengths, images, binaryStatus != NULL
+               ? &binaryStatus->front()
+               : NULL, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    
+#if defined(CL_VERSION_1_2)
+    /**
+     * Create a program from built-in kernels.
+     * \param kernelNames Semicolon-separated list of built-in kernel names.
+     */
+    Program(
+        const Context& context,
+        const VECTOR_CLASS<Device>& devices,
+        const STRING_CLASS& kernelNames,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+
+        ::size_t numDevices = devices.size();
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+        
+        object_ = ::clCreateProgramWithBuiltInKernels(
+            context(), 
+            (cl_uint) devices.size(),
+            deviceIDs,
+            kernelNames.c_str(), 
+            &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+#endif // #if defined(CL_VERSION_1_2)
+
+    Program() { }
+
+    Program(const Program& program) : detail::Wrapper<cl_type>(program) { }
+
+    __CL_EXPLICIT_CONSTRUCTORS Program(const cl_program& program) : detail::Wrapper<cl_type>(program) { }
+
+    Program& operator = (const Program& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    Program& operator = (const cl_program& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    cl_int build(
+        const VECTOR_CLASS<Device>& devices,
+        const char* options = NULL,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+        void* data = NULL) const
+    {
+        ::size_t numDevices = devices.size();
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+
+        return detail::errHandler(
+            ::clBuildProgram(
+                object_,
+                (cl_uint) devices.size(),
+                deviceIDs,
+                options,
+                notifyFptr,
+                data),
+                __BUILD_PROGRAM_ERR);
+    }
+
+    cl_int build(
+        const char* options = NULL,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+        void* data = NULL) const
+    {
+        return detail::errHandler(
+            ::clBuildProgram(
+                object_,
+                0,
+                NULL,
+                options,
+                notifyFptr,
+                data),
+                __BUILD_PROGRAM_ERR);
+    }
+
+#if defined(CL_VERSION_1_2)
+    cl_int compile(
+        const char* options = NULL,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+        void* data = NULL) const
+    {
+        return detail::errHandler(
+            ::clCompileProgram(
+                object_,
+                0,
+                NULL,
+                options,
+                0,
+                NULL,
+                NULL,
+                notifyFptr,
+                data),
+                __COMPILE_PROGRAM_ERR);
+    }
+#endif
+
+    template <typename T>
+    cl_int getInfo(cl_program_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetProgramInfo, object_, name, param),
+            __GET_PROGRAM_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_program_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_program_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    template <typename T>
+    cl_int getBuildInfo(
+        const Device& device, cl_program_build_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(
+                &::clGetProgramBuildInfo, object_, device(), name, param),
+                __GET_PROGRAM_BUILD_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_program_build_info, name>::param_type
+    getBuildInfo(const Device& device, cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_program_build_info, name>::param_type param;
+        cl_int result = getBuildInfo(device, name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    cl_int createKernels(VECTOR_CLASS<Kernel>* kernels)
+    {
+        cl_uint numKernels;
+        cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+        }
+
+        Kernel* value = (Kernel*) alloca(numKernels * sizeof(Kernel));
+        err = ::clCreateKernelsInProgram(
+            object_, numKernels, (cl_kernel*) value, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+        }
+
+        kernels->assign(&value[0], &value[numKernels]);
+        return CL_SUCCESS;
+    }
+};
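+
+/*! \par Usage sketch (illustrative)
+ *  Building a program from source in the default context and retrieving the
+ *  build log on failure; src is an assumed source string and device one of
+ *  the context's devices:
+ *  \code
+ *  cl::Program prog(src);
+ *  if (prog.build() != CL_SUCCESS) {
+ *      STRING_CLASS log = prog.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
+ *  }
+ *  \endcode
+ */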
+
+#if defined(CL_VERSION_1_2)
+inline Program linkProgram(
+    Program input1,
+    Program input2,
+    const char* options = NULL,
+    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+    void* data = NULL,
+    cl_int* err = NULL) 
+{
+    cl_int err_local = CL_SUCCESS;
+
+    cl_program programs[2] = { input1(), input2() };
+
+    Context ctx = input1.getInfo<CL_PROGRAM_CONTEXT>();
+
+    cl_program prog = ::clLinkProgram(
+        ctx(),
+        0,
+        NULL,
+        options,
+        2,
+        programs,
+        notifyFptr,
+        data,
+        &err_local);
+
+    detail::errHandler(err_local,__COMPILE_PROGRAM_ERR);
+    if (err != NULL) {
+        *err = err_local;
+    }
+
+    return Program(prog);
+}
+
+inline Program linkProgram(
+    VECTOR_CLASS<Program> inputPrograms,
+    const char* options = NULL,
+    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+    void* data = NULL,
+    cl_int* err = NULL) 
+{
+    cl_int err_local = CL_SUCCESS;
+
+    cl_program * programs = (cl_program*) alloca(inputPrograms.size() * sizeof(cl_program));
+
+    if (programs != NULL) {
+        for (unsigned int i = 0; i < inputPrograms.size(); i++) {
+            programs[i] = inputPrograms[i]();
+        }
+    }
+
+    cl_program prog = ::clLinkProgram(
+        Context::getDefault()(),
+        0,
+        NULL,
+        options,
+        (cl_uint)inputPrograms.size(),
+        programs,
+        notifyFptr,
+        data,
+        &err_local);
+
+    detail::errHandler(err_local,__COMPILE_PROGRAM_ERR);
+    if (err != NULL) {
+        *err = err_local;
+    }
+
+    return Program(prog);
+}
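+
+// Illustrative sketch only -- not part of the upstream Khronos cl.hpp, added
+// here purely as documentation of the wrappers above. It links two programs
+// that are assumed to have been created in the same context and compiled
+// beforehand; linkProgram() simply forwards to clLinkProgram().
+inline Program exampleLinkTwoPrograms(Program &a, Program &b, cl_int *err = NULL)
+{
+    return linkProgram(a, b, NULL, NULL, NULL, err);
+}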
+#endif
+
+template<>
+inline VECTOR_CLASS<char *> cl::Program::getInfo<CL_PROGRAM_BINARIES>(cl_int* err) const
+{
+    VECTOR_CLASS< ::size_t> sizes = getInfo<CL_PROGRAM_BINARY_SIZES>();
+    VECTOR_CLASS<char *> binaries;
+    for (VECTOR_CLASS< ::size_t>::iterator s = sizes.begin(); s != sizes.end(); ++s) 
+    {
+        char *ptr = NULL;
+        if (*s != 0) 
+            ptr = new char[*s];
+        binaries.push_back(ptr);
+    }
+    
+    cl_int result = getInfo(CL_PROGRAM_BINARIES, &binaries);
+    if (err != NULL) {
+        *err = result;
+    }
+    return binaries;
+}
+
+inline Kernel::Kernel(const Program& program, const char* name, cl_int* err)
+{
+    cl_int error;
+
+    object_ = ::clCreateKernel(program(), name, &error);
+    detail::errHandler(error, __CREATE_KERNEL_ERR);
+
+    if (err != NULL) {
+        *err = error;
+    }
+
+}
+
+/*! \class CommandQueue
+ * \brief CommandQueue interface for cl_command_queue.
+ */
+class CommandQueue : public detail::Wrapper<cl_command_queue>
+{
+private:
+    static volatile int default_initialized_;
+    static CommandQueue default_;
+    static volatile cl_int default_error_;
+public:
+   CommandQueue(
+        cl_command_queue_properties properties,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        Context context = Context::getDefault(&error);
+        detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+        if (error != CL_SUCCESS) {
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+        else {
+            Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+
+            object_ = ::clCreateCommandQueue(
+                context(), device(), properties, &error);
+
+            detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+    }
+    /*!
+    * \brief Constructs a CommandQueue for an implementation-defined device in the given context
+    */
+    explicit CommandQueue(
+        const Context& context,
+        cl_command_queue_properties properties = 0,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        VECTOR_CLASS<cl::Device> devices;
+        error = context.getInfo(CL_CONTEXT_DEVICES, &devices);
+
+        detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+        if (error != CL_SUCCESS)
+        {
+            if (err != NULL) {
+                *err = error;
+            }
+            return;
+        }
+
+        object_ = ::clCreateCommandQueue(context(), devices[0](), properties, &error);
+
+        detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+        if (err != NULL) {
+            *err = error;
+        }
+
+    }
+
+    CommandQueue(
+        const Context& context,
+        const Device& device,
+        cl_command_queue_properties properties = 0,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateCommandQueue(
+            context(), device(), properties, &error);
+
+        detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    static CommandQueue getDefault(cl_int * err = NULL) 
+    {
+        int state = detail::compare_exchange(
+            &default_initialized_, 
+            __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED);
+        
+        if (state & __DEFAULT_INITIALIZED) {
+            if (err != NULL) {
+                *err = default_error_;
+            }
+            return default_;
+        }
+
+        if (state & __DEFAULT_BEING_INITIALIZED) {
+              // Assume writes will propagate eventually...
+              while(default_initialized_ != __DEFAULT_INITIALIZED) {
+                  detail::fence();
+              }
+
+            if (err != NULL) {
+                *err = default_error_;
+            }
+            return default_;
+        }
+
+        cl_int error;
+
+        Context context = Context::getDefault(&error);
+        detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+        if (error != CL_SUCCESS) {
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+        else {
+            Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+
+            default_ = CommandQueue(context, device, 0, &error);
+
+            detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+
+        detail::fence();
+
+        default_error_ = error;
+        // Assume writes will propagate eventually...
+        default_initialized_ = __DEFAULT_INITIALIZED;
+
+        detail::fence();
+
+        if (err != NULL) {
+            *err = default_error_;
+        }
+        return default_;
+
+    }
+
+    CommandQueue() { }
+
+    CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
+
+    CommandQueue(const cl_command_queue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
+
+    CommandQueue& operator = (const CommandQueue& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    CommandQueue& operator = (const cl_command_queue& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    template <typename T>
+    cl_int getInfo(cl_command_queue_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(
+                &::clGetCommandQueueInfo, object_, name, param),
+                __GET_COMMAND_QUEUE_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_command_queue_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_command_queue_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    cl_int enqueueReadBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        ::size_t offset,
+        ::size_t size,
+        void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueReadBuffer(
+                object_, buffer(), blocking, offset, size,
+                ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_READ_BUFFER_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueWriteBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        ::size_t offset,
+        ::size_t size,
+        const void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueWriteBuffer(
+                object_, buffer(), blocking, offset, size,
+                ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_WRITE_BUFFER_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueCopyBuffer(
+        const Buffer& src,
+        const Buffer& dst,
+        ::size_t src_offset,
+        ::size_t dst_offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueCopyBuffer(
+                object_, src(), dst(), src_offset, dst_offset, size,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQEUE_COPY_BUFFER_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueReadBufferRect(
+        const Buffer& buffer,
+        cl_bool blocking,
+        const size_t<3>& buffer_offset,
+        const size_t<3>& host_offset,
+        const size_t<3>& region,
+        ::size_t buffer_row_pitch,
+        ::size_t buffer_slice_pitch,
+        ::size_t host_row_pitch,
+        ::size_t host_slice_pitch,
+        void *ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueReadBufferRect(
+                object_, 
+                buffer(), 
+                blocking, 
+                (const ::size_t *)buffer_offset,
+                (const ::size_t *)host_offset,
+                (const ::size_t *)region,
+                buffer_row_pitch,
+                buffer_slice_pitch,
+                host_row_pitch,
+                host_slice_pitch,
+                ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_READ_BUFFER_RECT_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueWriteBufferRect(
+        const Buffer& buffer,
+        cl_bool blocking,
+        const size_t<3>& buffer_offset,
+        const size_t<3>& host_offset,
+        const size_t<3>& region,
+        ::size_t buffer_row_pitch,
+        ::size_t buffer_slice_pitch,
+        ::size_t host_row_pitch,
+        ::size_t host_slice_pitch,
+        void *ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueWriteBufferRect(
+                object_, 
+                buffer(), 
+                blocking, 
+                (const ::size_t *)buffer_offset,
+                (const ::size_t *)host_offset,
+                (const ::size_t *)region,
+                buffer_row_pitch,
+                buffer_slice_pitch,
+                host_row_pitch,
+                host_slice_pitch,
+                ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_WRITE_BUFFER_RECT_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueCopyBufferRect(
+        const Buffer& src,
+        const Buffer& dst,
+        const size_t<3>& src_origin,
+        const size_t<3>& dst_origin,
+        const size_t<3>& region,
+        ::size_t src_row_pitch,
+        ::size_t src_slice_pitch,
+        ::size_t dst_row_pitch,
+        ::size_t dst_slice_pitch,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueCopyBufferRect(
+                object_, 
+                src(), 
+                dst(), 
+                (const ::size_t *)src_origin, 
+                (const ::size_t *)dst_origin, 
+                (const ::size_t *)region,
+                src_row_pitch,
+                src_slice_pitch,
+                dst_row_pitch,
+                dst_slice_pitch,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQEUE_COPY_BUFFER_RECT_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+#if defined(CL_VERSION_1_2)
+    /**
+     * Enqueue a command to fill a buffer object with a pattern
+     * of a given size. The pattern is specified as a vector.
+     * \tparam PatternType The datatype of the pattern field. 
+     *     The pattern type must be an accepted OpenCL data type.
+     */
+    template<typename PatternType>
+    cl_int enqueueFillBuffer(
+        const Buffer& buffer,
+        PatternType pattern,
+        ::size_t offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueFillBuffer(
+                object_, 
+                buffer(),
+                static_cast<void*>(&pattern),
+                sizeof(PatternType), 
+                offset, 
+                size,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_FILL_BUFFER_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
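+
+    /*
+     * Illustrative usage sketch -- not part of the upstream Khronos cl.hpp,
+     * added here purely as documentation. 'queue' and 'buf' are assumed to be
+     * a valid cl::CommandQueue and cl::Buffer created elsewhere:
+     *
+     *   cl_float one = 1.0f;
+     *   cl_int err = queue.enqueueFillBuffer(buf, one, 0,
+     *                                        64 * sizeof(cl_float));
+     *
+     * PatternType is deduced as cl_float, so the first 64 floats of 'buf' are
+     * set to 1.0f.
+     */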
+#endif // #if defined(CL_VERSION_1_2)
+
+    cl_int enqueueReadImage(
+        const Image& image,
+        cl_bool blocking,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        ::size_t row_pitch,
+        ::size_t slice_pitch,
+        void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueReadImage(
+                object_, image(), blocking, (const ::size_t *) origin,
+                (const ::size_t *) region, row_pitch, slice_pitch, ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_READ_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueWriteImage(
+        const Image& image,
+        cl_bool blocking,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        ::size_t row_pitch,
+        ::size_t slice_pitch,
+        void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueWriteImage(
+                object_, image(), blocking, (const ::size_t *) origin,
+                (const ::size_t *) region, row_pitch, slice_pitch, ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_WRITE_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueCopyImage(
+        const Image& src,
+        const Image& dst,
+        const size_t<3>& src_origin,
+        const size_t<3>& dst_origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueCopyImage(
+                object_, src(), dst(), (const ::size_t *) src_origin,
+                (const ::size_t *)dst_origin, (const ::size_t *) region,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_COPY_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+#if defined(CL_VERSION_1_2)
+    /**
+     * Enqueue a command to fill an image object with a specified color.
+     * \param fillColor is the color to use to fill the image.
+     *     This is a four component RGBA floating-point color value if
+     *     the image channel data type is not an unnormalized signed or
+     *     unsigned data type.
+     */
+    cl_int enqueueFillImage(
+        const Image& image,
+        cl_float4 fillColor,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueFillImage(
+                object_, 
+                image(),
+                static_cast<void*>(&fillColor), 
+                (const ::size_t *) origin, 
+                (const ::size_t *) region,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_FILL_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    /**
+     * Enqueue a command to fill an image object with a specified color.
+     * \param fillColor is the color to use to fill the image.
+     *     This is a four component RGBA signed integer color value if
+     *     the image channel data type is an unnormalized signed integer
+     *     type.
+     */
+    cl_int enqueueFillImage(
+        const Image& image,
+        cl_int4 fillColor,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueFillImage(
+                object_, 
+                image(),
+                static_cast<void*>(&fillColor), 
+                (const ::size_t *) origin, 
+                (const ::size_t *) region,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_FILL_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    /**
+     * Enqueue a command to fill an image object with a specified color.
+     * \param fillColor is the color to use to fill the image.
+     *     This is a four component RGBA unsigned integer color value if
+     *     the image channel data type is an unnormalized unsigned integer
+     *     type.
+     */
+    cl_int enqueueFillImage(
+        const Image& image,
+        cl_uint4 fillColor,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueFillImage(
+                object_, 
+                image(),
+                static_cast<void*>(&fillColor), 
+                (const ::size_t *) origin, 
+                (const ::size_t *) region,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_FILL_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
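+
+    /*
+     * Illustrative usage sketch -- not part of the upstream Khronos cl.hpp,
+     * added here purely as documentation. 'queue' is a valid cl::CommandQueue,
+     * 'img' a 2D cl::Image2D with a normalized channel type, and 'width' /
+     * 'height' its dimensions:
+     *
+     *   cl_float4 red = {{ 1.0f, 0.0f, 0.0f, 1.0f }};
+     *   cl::size_t<3> origin;                       // defaults to (0, 0, 0)
+     *   cl::size_t<3> region;
+     *   region[0] = width; region[1] = height; region[2] = 1;
+     *   cl_int err = queue.enqueueFillImage(img, red, origin, region);
+     */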
+#endif // #if defined(CL_VERSION_1_2)
+
+    cl_int enqueueCopyImageToBuffer(
+        const Image& src,
+        const Buffer& dst,
+        const size_t<3>& src_origin,
+        const size_t<3>& region,
+        ::size_t dst_offset,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueCopyImageToBuffer(
+                object_, src(), dst(), (const ::size_t *) src_origin,
+                (const ::size_t *) region, dst_offset,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueCopyBufferToImage(
+        const Buffer& src,
+        const Image& dst,
+        ::size_t src_offset,
+        const size_t<3>& dst_origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueCopyBufferToImage(
+                object_, src(), dst(), src_offset,
+                (const ::size_t *) dst_origin, (const ::size_t *) region,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    void* enqueueMapBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        cl_map_flags flags,
+        ::size_t offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL,
+        cl_int* err = NULL) const
+    {
+        cl_int error;
+        void * result = ::clEnqueueMapBuffer(
+            object_, buffer(), blocking, flags, offset, size,
+            (events != NULL) ? (cl_uint) events->size() : 0,
+            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+            (cl_event*) event,
+            &error);
+
+        detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+        return result;
+    }
+
+    void* enqueueMapImage(
+        const Image& buffer,
+        cl_bool blocking,
+        cl_map_flags flags,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        ::size_t * row_pitch,
+        ::size_t * slice_pitch,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL,
+        cl_int* err = NULL) const
+    {
+        cl_int error;
+        void * result = ::clEnqueueMapImage(
+            object_, buffer(), blocking, flags,
+            (const ::size_t *) origin, (const ::size_t *) region,
+            row_pitch, slice_pitch,
+            (events != NULL) ? (cl_uint) events->size() : 0,
+            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+            (cl_event*) event,
+            &error);
+
+        detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR);
+        if (err != NULL) {
+              *err = error;
+        }
+        return result;
+    }
+
+    cl_int enqueueUnmapMemObject(
+        const Memory& memory,
+        void* mapped_ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueUnmapMemObject(
+                object_, memory(), mapped_ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+#if defined(CL_VERSION_1_2)
+    /**
+     * Enqueues a marker command which waits for either a list of events to complete, 
+     * or all previously enqueued commands to complete.
+     *
+     * Enqueues a marker command which waits for either a list of events to complete, 
+     * or if the list is empty it waits for all commands previously enqueued in command_queue 
+     * to complete before it completes. This command returns an event that can be
+     * waited on to ensure that all events in the event_wait_list, or all commands
+     * previously enqueued to command_queue, have completed.
+     */
+    cl_int enqueueMarkerWithWaitList(
+        const VECTOR_CLASS<Event> *events = 0,
+        Event *event = 0)
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueMarkerWithWaitList(
+                object_,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_MARKER_WAIT_LIST_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    /**
+     * A synchronization point that enqueues a barrier operation.
+     *
+     * Enqueues a barrier command which waits for either a list of events to complete, 
+     * or if the list is empty it waits for all commands previously enqueued in command_queue 
+     * to complete before it completes. This command blocks command execution, that is, any 
+     * following commands enqueued after it do not execute until it completes. The
+     * command returns an event that can be waited on to ensure that all events in
+     * the event_wait_list, or all commands previously enqueued to command_queue,
+     * have completed.
+     */
+    cl_int enqueueBarrierWithWaitList(
+        const VECTOR_CLASS<Event> *events = 0,
+        Event *event = 0)
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueBarrierWithWaitList(
+                object_,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_BARRIER_WAIT_LIST_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
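+
+    /*
+     * Illustrative usage sketch -- not part of the upstream Khronos cl.hpp,
+     * added here purely as documentation. Serialise later work against an
+     * earlier command on the same queue:
+     *
+     *   cl::Event marker;
+     *   queue.enqueueMarkerWithWaitList(NULL, &marker);   // after prior work
+     *   VECTOR_CLASS<cl::Event> wait(1, marker);
+     *   queue.enqueueBarrierWithWaitList(&wait);          // later commands
+     *                                                     // wait on 'marker'
+     */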
+    
+    /**
+     * Enqueues a command to indicate with which device a set of memory objects
+     * should be associated.
+     */
+    cl_int enqueueMigrateMemObjects(
+        const VECTOR_CLASS<Memory> &memObjects,
+        cl_mem_migration_flags flags,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL
+        )
+    {
+        cl_event tmp;
+        
+        cl_mem* localMemObjects = static_cast<cl_mem*>(alloca(memObjects.size() * sizeof(cl_mem)));
+        for( int i = 0; i < (int)memObjects.size(); ++i ) {
+            localMemObjects[i] = memObjects[i]();
+        }
+
+        cl_int err = detail::errHandler(
+            ::clEnqueueMigrateMemObjects(
+                object_, 
+                (cl_uint)memObjects.size(), 
+                static_cast<const cl_mem*>(localMemObjects),
+                flags,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
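+
+    /*
+     * Illustrative usage sketch -- not part of the upstream Khronos cl.hpp,
+     * added here purely as documentation. Migrate an existing cl::Buffer 'buf'
+     * to the host via this queue:
+     *
+     *   VECTOR_CLASS<cl::Memory> mems;
+     *   mems.push_back(buf);
+     *   cl_int err = queue.enqueueMigrateMemObjects(
+     *       mems, CL_MIGRATE_MEM_OBJECT_HOST);
+     *
+     * Passing 0 for the flags migrates the objects to the device associated
+     * with this queue instead.
+     */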
+#endif // #if defined(CL_VERSION_1_2)
+
+    cl_int enqueueNDRangeKernel(
+        const Kernel& kernel,
+        const NDRange& offset,
+        const NDRange& global,
+        const NDRange& local = NullRange,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueNDRangeKernel(
+                object_, kernel(), (cl_uint) global.dimensions(),
+                offset.dimensions() != 0 ? (const ::size_t*) offset : NULL,
+                (const ::size_t*) global,
+                local.dimensions() != 0 ? (const ::size_t*) local : NULL,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_NDRANGE_KERNEL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueTask(
+        const Kernel& kernel,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueTask(
+                object_, kernel(),
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_TASK_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueNativeKernel(
+        void (CL_CALLBACK *userFptr)(void *),
+        std::pair<void*, ::size_t> args,
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<const void*>* mem_locs = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0) 
+            ? (cl_mem*) alloca(mem_objects->size() * sizeof(cl_mem))
+            : NULL;
+
+        if (mems != NULL) {
+            for (unsigned int i = 0; i < mem_objects->size(); i++) {
+                mems[i] = ((*mem_objects)[i])();
+            }
+        }
+
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueNativeKernel(
+                object_, userFptr, args.first, args.second,
+                (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                mems,
+                (mem_locs != NULL) ? (const void **) &mem_locs->front() : NULL,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_NATIVE_KERNEL);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+/**
+ * APIs deprecated in OpenCL 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
+    CL_EXT_PREFIX__VERSION_1_1_DEPRECATED 
+    cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+        return detail::errHandler(
+            ::clEnqueueMarker(object_, (cl_event*) event),
+            __ENQUEUE_MARKER_ERR);
+    }
+
+    CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+    cl_int enqueueWaitForEvents(const VECTOR_CLASS<Event>& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+        return detail::errHandler(
+            ::clEnqueueWaitForEvents(
+                object_,
+                (cl_uint) events.size(),
+                (const cl_event*) &events.front()),
+            __ENQUEUE_WAIT_FOR_EVENTS_ERR);
+    }
+#endif // deprecated OpenCL 1.1 APIs
+
+    cl_int enqueueAcquireGLObjects(
+         const VECTOR_CLASS<Memory>* mem_objects = NULL,
+         const VECTOR_CLASS<Event>* events = NULL,
+         Event* event = NULL) const
+     {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+             ::clEnqueueAcquireGLObjects(
+                 object_,
+                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                 (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+                 (events != NULL) ? (cl_uint) events->size() : 0,
+                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                 (event != NULL) ? &tmp : NULL),
+             __ENQUEUE_ACQUIRE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+     }
+
+    cl_int enqueueReleaseGLObjects(
+         const VECTOR_CLASS<Memory>* mem_objects = NULL,
+         const VECTOR_CLASS<Event>* events = NULL,
+         Event* event = NULL) const
+     {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+             ::clEnqueueReleaseGLObjects(
+                 object_,
+                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                 (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+                 (events != NULL) ? (cl_uint) events->size() : 0,
+                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                 (event != NULL) ? &tmp : NULL),
+             __ENQUEUE_RELEASE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+     }
+
+#if defined (USE_DX_INTEROP)
+typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)(
+    cl_command_queue command_queue, cl_uint num_objects,
+    const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list, cl_event* event);
+typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
+    cl_command_queue command_queue, cl_uint num_objects,
+    const cl_mem* mem_objects,  cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list, cl_event* event);
+
+    cl_int enqueueAcquireD3D10Objects(
+         const VECTOR_CLASS<Memory>* mem_objects = NULL,
+         const VECTOR_CLASS<Event>* events = NULL,
+         Event* event = NULL) const
+    {
+        static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL;
+#if defined(CL_VERSION_1_2)
+        cl_context context = getInfo<CL_QUEUE_CONTEXT>();
+        cl::Device device(getInfo<CL_QUEUE_DEVICE>());
+        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
+        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueAcquireD3D10ObjectsKHR);
+#endif
+#if defined(CL_VERSION_1_1)
+        __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR);
+#endif
+        
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+             pfn_clEnqueueAcquireD3D10ObjectsKHR(
+                 object_,
+                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                 (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+                 (events != NULL) ? (cl_uint) events->size() : 0,
+                 (events != NULL) ? (cl_event*) &events->front() : NULL,
+                 (event != NULL) ? &tmp : NULL),
+             __ENQUEUE_ACQUIRE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+     }
+
+    cl_int enqueueReleaseD3D10Objects(
+         const VECTOR_CLASS<Memory>* mem_objects = NULL,
+         const VECTOR_CLASS<Event>* events = NULL,
+         Event* event = NULL) const
+    {
+        static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL;
+#if defined(CL_VERSION_1_2)
+        cl_context context = getInfo<CL_QUEUE_CONTEXT>();
+        cl::Device device(getInfo<CL_QUEUE_DEVICE>());
+        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
+        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueReleaseD3D10ObjectsKHR);
+#endif // #if defined(CL_VERSION_1_2)
+#if defined(CL_VERSION_1_1)
+        __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR);
+#endif // #if defined(CL_VERSION_1_1)
+
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            pfn_clEnqueueReleaseD3D10ObjectsKHR(
+                object_,
+                (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_RELEASE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+#endif
+
+/**
+ * APIs deprecated in OpenCL 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
+    CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+    cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+        return detail::errHandler(
+            ::clEnqueueBarrier(object_),
+            __ENQUEUE_BARRIER_ERR);
+    }
+#endif // deprecated OpenCL 1.1 APIs
+
+    cl_int flush() const
+    {
+        return detail::errHandler(::clFlush(object_), __FLUSH_ERR);
+    }
+
+    cl_int finish() const
+    {
+        return detail::errHandler(::clFinish(object_), __FINISH_ERR);
+    }
+};
+
+#ifdef _WIN32
+__declspec(selectany) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__declspec(selectany) CommandQueue CommandQueue::default_;
+__declspec(selectany) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
+#else
+__attribute__((weak)) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__attribute__((weak)) CommandQueue CommandQueue::default_;
+__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
+#endif
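+
+// Illustrative sketch only -- not part of the upstream Khronos cl.hpp, added
+// here purely as documentation of the CommandQueue wrapper above. It creates a
+// queue on the first device of 'context' and round-trips 'count' floats from
+// 'host' through 'buffer' with blocking writes and reads.
+inline cl_int exampleBufferRoundTrip(
+    const Context &context, Buffer &buffer, float *host, ::size_t count)
+{
+    cl_int err = CL_SUCCESS;
+    CommandQueue queue(context, 0, &err);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+    err = queue.enqueueWriteBuffer(
+        buffer, CL_TRUE, 0, count * sizeof(float), host);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+    return queue.enqueueReadBuffer(
+        buffer, CL_TRUE, 0, count * sizeof(float), host);
+}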
+
+template< typename IteratorType >
+Buffer::Buffer(
+    const Context &context,
+    IteratorType startIterator,
+    IteratorType endIterator,
+    bool readOnly,
+    bool useHostPtr,
+    cl_int* err)
+{
+    typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+    cl_int error;
+
+    cl_mem_flags flags = 0;
+    if( readOnly ) {
+        flags |= CL_MEM_READ_ONLY;
+    }
+    else {
+        flags |= CL_MEM_READ_WRITE;
+    }
+    if( useHostPtr ) {
+        flags |= CL_MEM_USE_HOST_PTR;
+    }
+    
+    ::size_t size = sizeof(DataType)*(endIterator - startIterator);
+
+    if( useHostPtr ) {
+        object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
+    } else {
+        object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
+    }
+
+    detail::errHandler(error, __CREATE_BUFFER_ERR);
+    if (err != NULL) {
+        *err = error;
+    }
+
+    if( !useHostPtr ) {
+        CommandQueue queue(context, 0, &error);
+        detail::errHandler(error, __CREATE_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+
+        error = cl::copy(queue, startIterator, endIterator, *this);
+        detail::errHandler(error, __CREATE_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+}
+
+inline cl_int enqueueReadBuffer(
+    const Buffer& buffer,
+    cl_bool blocking,
+    ::size_t offset,
+    ::size_t size,
+    void* ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event);
+}
+
+inline cl_int enqueueWriteBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        ::size_t offset,
+        ::size_t size,
+        const void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event);
+}
+
+inline void* enqueueMapBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        cl_map_flags flags,
+        ::size_t offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL,
+        cl_int* err = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+    if (err != NULL) {
+        *err = error;
+    }
+
+    void * result = ::clEnqueueMapBuffer(
+            queue(), buffer(), blocking, flags, offset, size,
+            (events != NULL) ? (cl_uint) events->size() : 0,
+            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+            (cl_event*) event,
+            &error);
+
+    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+    if (err != NULL) {
+        *err = error;
+    }
+    return result;
+}
+
+inline cl_int enqueueUnmapMemObject(
+    const Memory& memory,
+    void* mapped_ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    cl_event tmp;
+    cl_int err = detail::errHandler(
+        ::clEnqueueUnmapMemObject(
+            queue(), memory(), mapped_ptr,
+            (events != NULL) ? (cl_uint) events->size() : 0,
+            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+            (event != NULL) ? &tmp : NULL),
+        __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+    if (event != NULL && err == CL_SUCCESS)
+        *event = tmp;
+
+    return err;
+}
+
+inline cl_int enqueueCopyBuffer(
+        const Buffer& src,
+        const Buffer& dst,
+        ::size_t src_offset,
+        ::size_t dst_offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Host to Device.
+ * Uses default command queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+    if (error != CL_SUCCESS)
+        return error;
+
+    return cl::copy(queue, startIterator, endIterator, buffer);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Device to Host.
+ * Uses default command queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+    if (error != CL_SUCCESS)
+        return error;
+
+    return cl::copy(queue, buffer, startIterator, endIterator);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Host to Device.
+ * Uses specified queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )
+{
+    typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+    cl_int error;
+    
+    ::size_t length = endIterator-startIterator;
+    ::size_t byteLength = length*sizeof(DataType);
+
+    DataType *pointer = 
+        static_cast<DataType*>(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error));
+    // if exceptions enabled, enqueueMapBuffer will throw
+    if( error != CL_SUCCESS ) {
+        return error;
+    }
+#if defined(_MSC_VER)
+    std::copy(
+        startIterator, 
+        endIterator, 
+        stdext::checked_array_iterator<DataType*>(
+            pointer, length));
+#else
+    std::copy(startIterator, endIterator, pointer);
+#endif
+    Event endEvent;
+    error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
+    // if exceptions enabled, enqueueUnmapMemObject will throw
+    if( error != CL_SUCCESS ) { 
+        return error;
+    }
+    endEvent.wait();
+    return CL_SUCCESS;
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Device to Host.
+ * Uses specified queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )
+{
+    typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+    cl_int error;
+        
+    ::size_t length = endIterator-startIterator;
+    ::size_t byteLength = length*sizeof(DataType);
+
+    DataType *pointer = 
+        static_cast<DataType*>(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error));
+    // if exceptions enabled, enqueueMapBuffer will throw
+    if( error != CL_SUCCESS ) {
+        return error;
+    }
+    std::copy(pointer, pointer + length, startIterator);
+    Event endEvent;
+    error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
+    // if exceptions enabled, enqueueUnmapMemObject will throw
+    if( error != CL_SUCCESS ) { 
+        return error;
+    }
+    endEvent.wait();
+    return CL_SUCCESS;
+}
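+
+// Illustrative sketch only -- not part of the upstream Khronos cl.hpp, added
+// here purely as documentation of the iterator-based cl::copy() helpers above.
+// Plain pointers serve as iterators, so no extra headers are required; both
+// copies block on the supplied queue.
+inline cl_int exampleCopyRoundTrip(
+    const CommandQueue &queue, Buffer &buffer, float *host, ::size_t count)
+{
+    cl_int err = cl::copy(queue, host, host + count, buffer);   // host -> device
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+    return cl::copy(queue, buffer, host, host + count);         // device -> host
+}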
+
+#if defined(CL_VERSION_1_1)
+inline cl_int enqueueReadBufferRect(
+    const Buffer& buffer,
+    cl_bool blocking,
+    const size_t<3>& buffer_offset,
+    const size_t<3>& host_offset,
+    const size_t<3>& region,
+    ::size_t buffer_row_pitch,
+    ::size_t buffer_slice_pitch,
+    ::size_t host_row_pitch,
+    ::size_t host_slice_pitch,
+    void *ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueReadBufferRect(
+        buffer, 
+        blocking, 
+        buffer_offset, 
+        host_offset,
+        region,
+        buffer_row_pitch,
+        buffer_slice_pitch,
+        host_row_pitch,
+        host_slice_pitch,
+        ptr, 
+        events, 
+        event);
+}
+
+inline cl_int enqueueWriteBufferRect(
+    const Buffer& buffer,
+    cl_bool blocking,
+    const size_t<3>& buffer_offset,
+    const size_t<3>& host_offset,
+    const size_t<3>& region,
+    ::size_t buffer_row_pitch,
+    ::size_t buffer_slice_pitch,
+    ::size_t host_row_pitch,
+    ::size_t host_slice_pitch,
+    void *ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueWriteBufferRect(
+        buffer, 
+        blocking, 
+        buffer_offset, 
+        host_offset,
+        region,
+        buffer_row_pitch,
+        buffer_slice_pitch,
+        host_row_pitch,
+        host_slice_pitch,
+        ptr, 
+        events, 
+        event);
+}
+
+inline cl_int enqueueCopyBufferRect(
+    const Buffer& src,
+    const Buffer& dst,
+    const size_t<3>& src_origin,
+    const size_t<3>& dst_origin,
+    const size_t<3>& region,
+    ::size_t src_row_pitch,
+    ::size_t src_slice_pitch,
+    ::size_t dst_row_pitch,
+    ::size_t dst_slice_pitch,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyBufferRect(
+        src,
+        dst,
+        src_origin,
+        dst_origin,
+        region,
+        src_row_pitch,
+        src_slice_pitch,
+        dst_row_pitch,
+        dst_slice_pitch,
+        events, 
+        event);
+}
+#endif
+
+inline cl_int enqueueReadImage(
+    const Image& image,
+    cl_bool blocking,
+    const size_t<3>& origin,
+    const size_t<3>& region,
+    ::size_t row_pitch,
+    ::size_t slice_pitch,
+    void* ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL) 
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueReadImage(
+        image,
+        blocking,
+        origin,
+        region,
+        row_pitch,
+        slice_pitch,
+        ptr,
+        events, 
+        event);
+}
+
+inline cl_int enqueueWriteImage(
+    const Image& image,
+    cl_bool blocking,
+    const size_t<3>& origin,
+    const size_t<3>& region,
+    ::size_t row_pitch,
+    ::size_t slice_pitch,
+    void* ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueWriteImage(
+        image,
+        blocking,
+        origin,
+        region,
+        row_pitch,
+        slice_pitch,
+        ptr,
+        events, 
+        event);
+}
+
+inline cl_int enqueueCopyImage(
+    const Image& src,
+    const Image& dst,
+    const size_t<3>& src_origin,
+    const size_t<3>& dst_origin,
+    const size_t<3>& region,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyImage(
+        src,
+        dst,
+        src_origin,
+        dst_origin,
+        region,
+        events,
+        event);
+}
+
+inline cl_int enqueueCopyImageToBuffer(
+    const Image& src,
+    const Buffer& dst,
+    const size_t<3>& src_origin,
+    const size_t<3>& region,
+    ::size_t dst_offset,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyImageToBuffer(
+        src,
+        dst,
+        src_origin,
+        region,
+        dst_offset,
+        events,
+        event);
+}
+
+inline cl_int enqueueCopyBufferToImage(
+    const Buffer& src,
+    const Image& dst,
+    ::size_t src_offset,
+    const size_t<3>& dst_origin,
+    const size_t<3>& region,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyBufferToImage(
+        src,
+        dst,
+        src_offset,
+        dst_origin,
+        region,
+        events,
+        event);
+}
+
+inline cl_int flush(void)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.flush();
+}
+
+inline cl_int finish(void)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.finish();
+}
+
+// Kernel Functor support
+// New interface as of September 2011
+// Requires C++11 std::function (std::tr1::function from TR1 is not supported)
+// Visual Studio 2010 and GCC 4.2
+
+struct EnqueueArgs
+{
+    CommandQueue queue_;
+    const NDRange offset_;
+    const NDRange global_;
+    const NDRange local_;
+    VECTOR_CLASS<Event> events_;
+
+    EnqueueArgs(NDRange global) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(NullRange), 
+      global_(global),
+      local_(NullRange)
+    {
+
+    }
+
+    EnqueueArgs(NDRange global, NDRange local) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(NullRange), 
+      global_(global),
+      local_(local)
+    {
+
+    }
+
+    EnqueueArgs(NDRange offset, NDRange global, NDRange local) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(offset), 
+      global_(global),
+      local_(local)
+    {
+
+    }
+
+    EnqueueArgs(Event e, NDRange global) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(NullRange), 
+      global_(global),
+      local_(NullRange)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(Event e, NDRange global, NDRange local) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(NullRange), 
+      global_(global),
+      local_(local)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(offset), 
+      global_(global),
+      local_(local)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(NullRange), 
+      global_(global),
+      local_(NullRange),
+      events_(events)
+    {
+
+    }
+
+    EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(NullRange), 
+      global_(global),
+      local_(local),
+      events_(events)
+    {
+
+    }
+
+    EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(offset), 
+      global_(global),
+      local_(local),
+      events_(events)
+    {
+
+    }
+
+    EnqueueArgs(CommandQueue &queue, NDRange global) : 
+      queue_(queue),
+      offset_(NullRange), 
+      global_(global),
+      local_(NullRange)
+    {
+
+    }
+
+    EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) : 
+      queue_(queue),
+      offset_(NullRange), 
+      global_(global),
+      local_(local)
+    {
+
+    }
+
+    EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) : 
+      queue_(queue),
+      offset_(offset), 
+      global_(global),
+      local_(local)
+    {
+
+    }
+
+    EnqueueArgs(CommandQueue &queue, Event e, NDRange global) : 
+      queue_(queue),
+      offset_(NullRange), 
+      global_(global),
+      local_(NullRange)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) : 
+      queue_(queue),
+      offset_(NullRange), 
+      global_(global),
+      local_(local)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) : 
+      queue_(queue),
+      offset_(offset), 
+      global_(global),
+      local_(local)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global) : 
+      queue_(queue),
+      offset_(NullRange), 
+      global_(global),
+      local_(NullRange),
+      events_(events)
+    {
+
+    }
+
+    EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) : 
+      queue_(queue),
+      offset_(NullRange), 
+      global_(global),
+      local_(local),
+      events_(events)
+    {
+
+    }
+
+    EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) : 
+      queue_(queue),
+      offset_(offset), 
+      global_(global),
+      local_(local),
+      events_(events)
+    {
+
+    }
+};
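
For orientation, a minimal sketch of how the EnqueueArgs overloads above are typically constructed from application code. It assumes the cl namespace of this header (cl::EnqueueArgs, cl::NDRange, cl::CommandQueue, cl::Event), the default VECTOR_CLASS (std::vector), and a working OpenCL context at run time; the queue, event and range sizes are illustrative placeholders, not part of the header.

    // Sketch only: "queue", "ready" and the range sizes are placeholders.
    #include <CL/cl.hpp>   // adjust to wherever this header is installed
    #include <vector>

    void enqueue_args_examples(cl::CommandQueue &queue, cl::Event ready)
    {
        // Offset, global and local sizes with no explicit queue: this form
        // falls back to CommandQueue::getDefault() at run time.
        cl::EnqueueArgs a0(cl::NDRange(0), cl::NDRange(1024), cl::NDRange(64));

        // Explicit queue with global size only; offset and local default to NullRange.
        cl::EnqueueArgs a1(queue, cl::NDRange(1024));

        // Explicit queue, offset, global and local sizes.
        cl::EnqueueArgs a2(queue, cl::NDRange(0), cl::NDRange(1024), cl::NDRange(64));

        // Launch only after a single event has completed.
        cl::EnqueueArgs a3(queue, ready, cl::NDRange(1024));

        // Launch only after a whole list of events has completed.
        std::vector<cl::Event> deps(1, ready);
        cl::EnqueueArgs a4(queue, deps, cl::NDRange(1024), cl::NDRange(64));

        (void)a0; (void)a1; (void)a2; (void)a3; (void)a4;
    }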
+
+namespace detail {
+
+class NullType {};
+
+template<int index, typename T0>
+struct SetArg
+{
+    static void set (Kernel kernel, T0 arg)
+    {
+        kernel.setArg(index, arg);
+    }
+};  
+
+template<int index>
+struct SetArg<index, NullType>
+{
+    static void set (Kernel, NullType)
+    { 
+    }
+};
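
The SetArg pair above is the compile-time dispatch that lets the 32-slot functor below accept fewer real arguments: a real type forwards to Kernel::setArg at the given index, while a NullType placeholder compiles away to a no-op. A small sketch, assuming these templates live in the cl::detail namespace of this header; the kernel and buffer objects are placeholders.

    // Sketch: how the two SetArg variants behave ("kernel", "buf" are placeholders).
    #include <CL/cl.hpp>   // adjust to wherever this header is installed

    void set_first_two_args(cl::Kernel kernel, cl::Buffer buf, int n)
    {
        cl::detail::SetArg<0, cl::Buffer>::set(kernel, buf); // -> kernel.setArg(0, buf)
        cl::detail::SetArg<1, int>::set(kernel, n);          // -> kernel.setArg(1, n)
        cl::detail::SetArg<2, cl::detail::NullType>::set(    // trailing placeholder:
            kernel, cl::detail::NullType());                 // expands to nothing
    }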
+
+template <
+   typename T0,   typename T1,   typename T2,   typename T3,
+   typename T4,   typename T5,   typename T6,   typename T7,
+   typename T8,   typename T9,   typename T10,   typename T11,
+   typename T12,   typename T13,   typename T14,   typename T15,
+   typename T16,   typename T17,   typename T18,   typename T19,
+   typename T20,   typename T21,   typename T22,   typename T23,
+   typename T24,   typename T25,   typename T26,   typename T27,
+   typename T28,   typename T29,   typename T30,   typename T31
+>
+class KernelFunctorGlobal
+{
+private:
+    Kernel kernel_;
+
+public:
+   KernelFunctorGlobal(
+        Kernel kernel) :
+            kernel_(kernel)
+    {}
+
+   KernelFunctorGlobal(
+        const Program& program,
+        const STRING_CLASS name,
+        cl_int * err = NULL) :
+            kernel_(program, name.c_str(), err)
+    {}
+
+    Event operator() (
+        const EnqueueArgs& args,
+        T0 t0,
+        T1 t1 = NullType(),
+        T2 t2 = NullType(),
+        T3 t3 = NullType(),
+        T4 t4 = NullType(),
+        T5 t5 = NullType(),
+        T6 t6 = NullType(),
+        T7 t7 = NullType(),
+        T8 t8 = NullType(),
+        T9 t9 = NullType(),
+        T10 t10 = NullType(),
+        T11 t11 = NullType(),
+        T12 t12 = NullType(),
+        T13 t13 = NullType(),
+        T14 t14 = NullType(),
+        T15 t15 = NullType(),
+        T16 t16 = NullType(),
+        T17 t17 = NullType(),
+        T18 t18 = NullType(),
+        T19 t19 = NullType(),
+        T20 t20 = NullType(),
+        T21 t21 = NullType(),
+        T22 t22 = NullType(),
+        T23 t23 = NullType(),
+        T24 t24 = NullType(),
+        T25 t25 = NullType(),
+        T26 t26 = NullType(),
+        T27 t27 = NullType(),
+        T28 t28 = NullType(),
+        T29 t29 = NullType(),
+        T30 t30 = NullType(),
+        T31 t31 = NullType()
+        )
+    {
+        Event event;
+        SetArg<0, T0>::set(kernel_, t0);
+        SetArg<1, T1>::set(kernel_, t1);
+        SetArg<2, T2>::set(kernel_, t2);
+        SetArg<3, T3>::set(kernel_, t3);
+        SetArg<4, T4>::set(kernel_, t4);
+        SetArg<5, T5>::set(kernel_, t5);
+        SetArg<6, T6>::set(kernel_, t6);
+        SetArg<7, T7>::set(kernel_, t7);
+        SetArg<8, T8>::set(kernel_, t8);
+        SetArg<9, T9>::set(kernel_, t9);
+        SetArg<10, T10>::set(kernel_, t10);
+        SetArg<11, T11>::set(kernel_, t11);
+        SetArg<12, T12>::set(kernel_, t12);
+        SetArg<13, T13>::set(kernel_, t13);
+        SetArg<14, T14>::set(kernel_, t14);
+        SetArg<15, T15>::set(kernel_, t15);
+        SetArg<16, T16>::set(kernel_, t16);
+        SetArg<17, T17>::set(kernel_, t17);
+        SetArg<18, T18>::set(kernel_, t18);
+        SetArg<19, T19>::set(kernel_, t19);
+        SetArg<20, T20>::set(kernel_, t20);
+        SetArg<21, T21>::set(kernel_, t21);
+        SetArg<22, T22>::set(kernel_, t22);
+        SetArg<23, T23>::set(kernel_, t23);
+        SetArg<24, T24>::set(kernel_, t24);
+        SetArg<25, T25>::set(kernel_, t25);
+        SetArg<26, T26>::set(kernel_, t26);
+        SetArg<27, T27>::set(kernel_, t27);
+        SetArg<28, T28>::set(kernel_, t28);
+        SetArg<29, T29>::set(kernel_, t29);
+        SetArg<30, T30>::set(kernel_, t30);
+        SetArg<31, T31>::set(kernel_, t31);
+        
+        args.queue_.enqueueNDRangeKernel(
+            kernel_,
+            args.offset_,
+            args.global_,
+            args.local_,
+            &args.events_,
+            &event);
+        
+        return event;
+    }
+
+};
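
Stripped of the template machinery, operator() above amounts to setting each non-NullType argument and enqueueing the kernel with the ranges and event list carried by EnqueueArgs. A hand-expanded equivalent for a kernel with two real arguments, as a hedged sketch: the program, the kernel name "vadd", the buffer and the range sizes are hypothetical, and the empty wait list is passed as NULL for brevity.

    // Sketch: what the functor call boils down to for two real arguments.
    #include <CL/cl.hpp>   // adjust to wherever this header is installed

    cl::Event launch_two_arg_kernel(cl::CommandQueue &queue, cl::Program &program,
                                    cl::Buffer &buf, int n)
    {
        cl::Kernel kernel(program, "vadd");   // hypothetical kernel name
        kernel.setArg(0, buf);                // SetArg<0, cl::Buffer>
        kernel.setArg(1, n);                  // SetArg<1, int>
        // Slots 2..31 are NullType, so their SetArg calls expand to nothing.

        cl::Event event;
        queue.enqueueNDRangeKernel(kernel,
                                   cl::NullRange,      // offset_
                                   cl::NDRange(1024),  // global_
                                   cl::NDRange(64),    // local_
                                   NULL,               // no wait list in this sketch
                                   &event);
        return event;
    }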
+
+//------------------------------------------------------------------------------------------------------
+
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25,
+	typename T26,
+	typename T27,
+	typename T28,
+	typename T29,
+	typename T30,
+	typename T31>
+struct functionImplementation_
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		T29,
+		T30,
+		T31> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 32))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		T29,
+		T30,
+		T31);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25,
+		T26 arg26,
+		T27 arg27,
+		T28 arg28,
+		T29 arg29,
+		T30 arg30,
+		T31 arg31)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25,
+			arg26,
+			arg27,
+			arg28,
+			arg29,
+			arg30,
+			arg31);
+	}
+
+
+};
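
The primary template above, together with the long run of partial specializations that follows, implements arity trimming by NullType padding: each specialization fixes a trailing block of slots to NullType, so operator() exposes only the real parameters while still forwarding to the full 32-slot functor. A scaled-down, self-contained analogue of the same pattern (4 slots instead of 32), purely for illustration and not part of this header:

    // Standalone analogue of the NullType-padding pattern used above.
    #include <iostream>

    struct NullType {};

    // Primary template: handles the full arity.
    template <typename T0, typename T1, typename T2, typename T3>
    struct Impl
    {
        static void call(T0, T1, T2, T3) { std::cout << "four real arguments\n"; }
    };

    // Partial specialization: two trailing slots fixed to NullType,
    // so callers supply only the two real arguments.
    template <typename T0, typename T1>
    struct Impl<T0, T1, NullType, NullType>
    {
        static void call(T0, T1) { std::cout << "two real arguments\n"; }
    };

    int main()
    {
        Impl<int, float, NullType, NullType>::call(1, 2.0f); // picks the 2-argument form
        return 0;
    }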
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25,
+	typename T26,
+	typename T27,
+	typename T28,
+	typename T29,
+	typename T30>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	T25,
+	T26,
+	T27,
+	T28,
+	T29,
+	T30,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		T29,
+		T30,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 31))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		T29,
+		T30);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25,
+		T26 arg26,
+		T27 arg27,
+		T28 arg28,
+		T29 arg29,
+		T30 arg30)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25,
+			arg26,
+			arg27,
+			arg28,
+			arg29,
+			arg30);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25,
+	typename T26,
+	typename T27,
+	typename T28,
+	typename T29>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	T25,
+	T26,
+	T27,
+	T28,
+	T29,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		T29,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 30))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		T29);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25,
+		T26 arg26,
+		T27 arg27,
+		T28 arg28,
+		T29 arg29)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25,
+			arg26,
+			arg27,
+			arg28,
+			arg29);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25,
+	typename T26,
+	typename T27,
+	typename T28>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	T25,
+	T26,
+	T27,
+	T28,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 29))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25,
+		T26 arg26,
+		T27 arg27,
+		T28 arg28)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25,
+			arg26,
+			arg27,
+			arg28);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25,
+	typename T26,
+	typename T27>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	T25,
+	T26,
+	T27,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 28))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25,
+		T26 arg26,
+		T27 arg27)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25,
+			arg26,
+			arg27);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25,
+	typename T26>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	T25,
+	T26,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 27))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25,
+		T26 arg26)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25,
+			arg26);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	T25,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 26))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 25))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 24))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 23))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 22))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 21))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 20))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 19))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 18))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 17))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 16))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 15))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 14))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 13))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 12))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 11))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 10))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 9))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 8))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 7))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 6))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 5))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 4))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 3))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1>
+struct functionImplementation_
+<	T0,
+	T1,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 2))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1);
+	}
+
+
+};
+
+template<
+	typename T0>
+struct functionImplementation_
+<	T0,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 1))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0);
+	}
+
+
+};
+
+
+
+
+
+} // namespace detail
+
+//----------------------------------------------------------------------------------------------
+
+template <
+   typename T0,   typename T1 = detail::NullType,   typename T2 = detail::NullType,
+   typename T3 = detail::NullType,   typename T4 = detail::NullType,
+   typename T5 = detail::NullType,   typename T6 = detail::NullType,
+   typename T7 = detail::NullType,   typename T8 = detail::NullType,
+   typename T9 = detail::NullType,   typename T10 = detail::NullType,
+   typename T11 = detail::NullType,   typename T12 = detail::NullType,
+   typename T13 = detail::NullType,   typename T14 = detail::NullType,
+   typename T15 = detail::NullType,   typename T16 = detail::NullType,
+   typename T17 = detail::NullType,   typename T18 = detail::NullType,
+   typename T19 = detail::NullType,   typename T20 = detail::NullType,
+   typename T21 = detail::NullType,   typename T22 = detail::NullType,
+   typename T23 = detail::NullType,   typename T24 = detail::NullType,
+   typename T25 = detail::NullType,   typename T26 = detail::NullType,
+   typename T27 = detail::NullType,   typename T28 = detail::NullType,
+   typename T29 = detail::NullType,   typename T30 = detail::NullType,
+   typename T31 = detail::NullType
+>
+struct make_kernel :
+    public detail::functionImplementation_<
+               T0,   T1,   T2,   T3,
+               T4,   T5,   T6,   T7,
+               T8,   T9,   T10,   T11,
+               T12,   T13,   T14,   T15,
+               T16,   T17,   T18,   T19,
+               T20,   T21,   T22,   T23,
+               T24,   T25,   T26,   T27,
+               T28,   T29,   T30,   T31
+    >
+{
+public:
+	typedef detail::KernelFunctorGlobal<             
+		       T0,   T1,   T2,   T3,
+               T4,   T5,   T6,   T7,
+               T8,   T9,   T10,   T11,
+               T12,   T13,   T14,   T15,
+               T16,   T17,   T18,   T19,
+               T20,   T21,   T22,   T23,
+               T24,   T25,   T26,   T27,
+               T28,   T29,   T30,   T31
+    > FunctorType;
+
+    make_kernel(
+        const Program& program,
+        const STRING_CLASS name,
+        cl_int * err = NULL) :
+           detail::functionImplementation_<
+                    T0,   T1,   T2,   T3,
+                       T4,   T5,   T6,   T7,
+                       T8,   T9,   T10,   T11,
+                       T12,   T13,   T14,   T15,
+                       T16,   T17,   T18,   T19,
+                       T20,   T21,   T22,   T23,
+                       T24,   T25,   T26,   T27,
+                       T28,   T29,   T30,   T31
+           >(
+            FunctorType(program, name, err)) 
+    {}
+
+    make_kernel(
+        const Kernel kernel) :
+           detail::functionImplementation_<
+                    T0,   T1,   T2,   T3,
+                       T4,   T5,   T6,   T7,
+                       T8,   T9,   T10,   T11,
+                       T12,   T13,   T14,   T15,
+                       T16,   T17,   T18,   T19,
+                       T20,   T21,   T22,   T23,
+                       T24,   T25,   T26,   T27,
+                       T28,   T29,   T30,   T31
+           >(
+            FunctorType(kernel)) 
+    {}    
+};
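+
+// Minimal usage sketch for make_kernel (illustrative only; the built cl::Program
+// `program`, the cl::CommandQueue `queue`, the cl::Buffer objects and the kernel
+// name "vadd" are assumed and are not provided by this header):
+//
+//   cl::make_kernel<cl::Buffer, cl::Buffer, cl::Buffer, int> vadd(program, "vadd");
+//   cl::EnqueueArgs args(queue, cl::NDRange(numElements));
+//   cl::Event e = vadd(args, aBuf, bBuf, cBuf, numElements);  // enqueues the kernel
+//   e.wait();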
+
+
+//----------------------------------------------------------------------------------------------------------------------
+
+#undef __ERR_STR
+#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
+#undef __GET_DEVICE_INFO_ERR
+#undef __GET_PLATFORM_INFO_ERR
+#undef __GET_DEVICE_IDS_ERR
+#undef __GET_CONTEXT_INFO_ERR
+#undef __GET_EVENT_INFO_ERR
+#undef __GET_EVENT_PROFILE_INFO_ERR
+#undef __GET_MEM_OBJECT_INFO_ERR
+#undef __GET_IMAGE_INFO_ERR
+#undef __GET_SAMPLER_INFO_ERR
+#undef __GET_KERNEL_INFO_ERR
+#undef __GET_KERNEL_ARG_INFO_ERR
+#undef __GET_KERNEL_WORK_GROUP_INFO_ERR
+#undef __GET_PROGRAM_INFO_ERR
+#undef __GET_PROGRAM_BUILD_INFO_ERR
+#undef __GET_COMMAND_QUEUE_INFO_ERR
+
+#undef __CREATE_CONTEXT_ERR
+#undef __CREATE_CONTEXT_FROM_TYPE_ERR
+#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR
+
+#undef __CREATE_BUFFER_ERR
+#undef __CREATE_SUBBUFFER_ERR
+#undef __CREATE_IMAGE2D_ERR
+#undef __CREATE_IMAGE3D_ERR
+#undef __CREATE_SAMPLER_ERR
+#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR
+
+#undef __CREATE_USER_EVENT_ERR
+#undef __SET_USER_EVENT_STATUS_ERR
+#undef __SET_EVENT_CALLBACK_ERR
+#undef __SET_PRINTF_CALLBACK_ERR
+
+#undef __WAIT_FOR_EVENTS_ERR
+
+#undef __CREATE_KERNEL_ERR
+#undef __SET_KERNEL_ARGS_ERR
+#undef __CREATE_PROGRAM_WITH_SOURCE_ERR
+#undef __CREATE_PROGRAM_WITH_BINARY_ERR
+#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR
+#undef __BUILD_PROGRAM_ERR
+#undef __CREATE_KERNELS_IN_PROGRAM_ERR
+
+#undef __CREATE_COMMAND_QUEUE_ERR
+#undef __SET_COMMAND_QUEUE_PROPERTY_ERR
+#undef __ENQUEUE_READ_BUFFER_ERR
+#undef __ENQUEUE_WRITE_BUFFER_ERR
+#undef __ENQUEUE_READ_BUFFER_RECT_ERR
+#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR
+#undef __ENQEUE_COPY_BUFFER_ERR
+#undef __ENQEUE_COPY_BUFFER_RECT_ERR
+#undef __ENQUEUE_READ_IMAGE_ERR
+#undef __ENQUEUE_WRITE_IMAGE_ERR
+#undef __ENQUEUE_COPY_IMAGE_ERR
+#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR
+#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR
+#undef __ENQUEUE_MAP_BUFFER_ERR
+#undef __ENQUEUE_MAP_IMAGE_ERR
+#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR
+#undef __ENQUEUE_NDRANGE_KERNEL_ERR
+#undef __ENQUEUE_TASK_ERR
+#undef __ENQUEUE_NATIVE_KERNEL
+
+#undef __CL_EXPLICIT_CONSTRUCTORS
+
+#undef __UNLOAD_COMPILER_ERR
+#endif //__CL_USER_OVERRIDE_ERROR_STRINGS
+
+#undef __CL_FUNCTION_TYPE
+
+// Extensions
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_VERSION_1_1)
+#undef __INIT_CL_EXT_FCN_PTR
+#endif // #if defined(CL_VERSION_1_1)
+#undef __CREATE_SUB_DEVICES
+
+#if defined(USE_CL_DEVICE_FISSION)
+#undef __PARAM_NAME_DEVICE_FISSION
+#endif // USE_CL_DEVICE_FISSION
+
+#undef __DEFAULT_NOT_INITIALIZED 
+#undef __DEFAULT_BEING_INITIALIZED 
+#undef __DEFAULT_INITIALIZED
+
+} // namespace cl
+
+#ifdef _WIN32
+#pragma pop_macro("max")
+#endif // _WIN32
+
+#endif // CL_HPP_
diff --git a/include/CL/cl_d3d10.h b/include/CL/cl_d3d10.h
new file mode 100644
index 0000000..b6c90b3
--- /dev/null
+++ b/include/CL/cl_d3d10.h
@@ -0,0 +1,126 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_CL_D3D10_H
+#define __OPENCL_CL_D3D10_H
+
+#include <d3d10.h>
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_khr_d3d10_sharing                                                       */
+#define cl_khr_d3d10_sharing 1
+
+typedef cl_uint cl_d3d10_device_source_khr;
+typedef cl_uint cl_d3d10_device_set_khr;
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_INVALID_D3D10_DEVICE_KHR                  -1002
+#define CL_INVALID_D3D10_RESOURCE_KHR                -1003
+#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR       -1004
+#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR           -1005
+
+/* cl_d3d10_device_source_nv */
+#define CL_D3D10_DEVICE_KHR                          0x4010
+#define CL_D3D10_DXGI_ADAPTER_KHR                    0x4011
+
+/* cl_d3d10_device_set_nv */
+#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR           0x4012
+#define CL_ALL_DEVICES_FOR_D3D10_KHR                 0x4013
+
+/* cl_context_info */
+#define CL_CONTEXT_D3D10_DEVICE_KHR                  0x4014
+#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
+
+/* cl_mem_info */
+#define CL_MEM_D3D10_RESOURCE_KHR                    0x4015
+
+/* cl_image_info */
+#define CL_IMAGE_D3D10_SUBRESOURCE_KHR               0x4016
+
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR         0x4017
+#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR         0x4018
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
+    cl_platform_id             platform,
+    cl_d3d10_device_source_khr d3d_device_source,
+    void *                     d3d_object,
+    cl_d3d10_device_set_khr    d3d_device_set,
+    cl_uint                    num_entries,
+    cl_device_id *             devices,
+    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
+    cl_context     context,
+    cl_mem_flags   flags,
+    ID3D10Buffer * resource,
+    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D10Texture2D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D10Texture3D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
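+
+/* Usage sketch (illustrative, not part of the extension itself): the *_fn typedefs
+ * above describe function pointers that are looked up at runtime, for example:
+ *
+ *   clGetDeviceIDsFromD3D10KHR_fn pfn = (clGetDeviceIDsFromD3D10KHR_fn)
+ *       clGetExtensionFunctionAddressForPlatform(platform, "clGetDeviceIDsFromD3D10KHR");
+ *   if (pfn)
+ *       pfn(platform, CL_D3D10_DEVICE_KHR, d3d_device, CL_ALL_DEVICES_FOR_D3D10_KHR,
+ *           1, &device, NULL);
+ *
+ * Here platform, d3d_device (an ID3D10Device*) and device are assumed to be
+ * provided by the caller.
+ */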
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_D3D10_H */
+
diff --git a/include/CL/cl_d3d11.h b/include/CL/cl_d3d11.h
new file mode 100644
index 0000000..2e0a63f
--- /dev/null
+++ b/include/CL/cl_d3d11.h
@@ -0,0 +1,126 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_CL_D3D11_H
+#define __OPENCL_CL_D3D11_H
+
+#include <d3d11.h>
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_khr_d3d11_sharing                                                       */
+#define cl_khr_d3d11_sharing 1
+
+typedef cl_uint cl_d3d11_device_source_khr;
+typedef cl_uint cl_d3d11_device_set_khr;
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_INVALID_D3D11_DEVICE_KHR                  -1006
+#define CL_INVALID_D3D11_RESOURCE_KHR                -1007
+#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR       -1008
+#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR           -1009
+
+/* cl_d3d11_device_source */
+#define CL_D3D11_DEVICE_KHR                          0x4019
+#define CL_D3D11_DXGI_ADAPTER_KHR                    0x401A
+
+/* cl_d3d11_device_set */
+#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR           0x401B
+#define CL_ALL_DEVICES_FOR_D3D11_KHR                 0x401C
+
+/* cl_context_info */
+#define CL_CONTEXT_D3D11_DEVICE_KHR                  0x401D
+#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
+
+/* cl_mem_info */
+#define CL_MEM_D3D11_RESOURCE_KHR                    0x401E
+
+/* cl_image_info */
+#define CL_IMAGE_D3D11_SUBRESOURCE_KHR               0x401F
+
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR         0x4020
+#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR         0x4021
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
+    cl_platform_id             platform,
+    cl_d3d11_device_source_khr d3d_device_source,
+    void *                     d3d_object,
+    cl_d3d11_device_set_khr    d3d_device_set,
+    cl_uint                    num_entries,
+    cl_device_id *             devices,
+    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
+    cl_context     context,
+    cl_mem_flags   flags,
+    ID3D11Buffer * resource,
+    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D11Texture2D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D11Texture3D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_D3D11_H */
+
diff --git a/include/CL/cl_dx9_media_sharing.h b/include/CL/cl_dx9_media_sharing.h
new file mode 100644
index 0000000..23f1631
--- /dev/null
+++ b/include/CL/cl_dx9_media_sharing.h
@@ -0,0 +1,127 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
+#define __OPENCL_CL_DX9_MEDIA_SHARING_H
+
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_khr_dx9_media_sharing                                                   */
+#define cl_khr_dx9_media_sharing 1
+
+typedef cl_uint             cl_dx9_media_adapter_type_khr;
+typedef cl_uint             cl_dx9_media_adapter_set_khr;
+    
+#if defined(_WIN32)
+#include <d3d9.h>
+typedef struct _cl_dx9_surface_info_khr
+{
+    IDirect3DSurface9 *resource;
+    HANDLE shared_handle;
+} cl_dx9_surface_info_khr;
+#endif
+
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR                -1010
+#define CL_INVALID_DX9_MEDIA_SURFACE_KHR                -1011
+#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR       -1012
+#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR           -1013
+
+/* cl_media_adapter_type_khr */
+#define CL_ADAPTER_D3D9_KHR                              0x2020
+#define CL_ADAPTER_D3D9EX_KHR                            0x2021
+#define CL_ADAPTER_DXVA_KHR                              0x2022
+
+/* cl_media_adapter_set_khr */
+#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR   0x2023
+#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR         0x2024
+
+/* cl_context_info */
+#define CL_CONTEXT_ADAPTER_D3D9_KHR                      0x2025
+#define CL_CONTEXT_ADAPTER_D3D9EX_KHR                    0x2026
+#define CL_CONTEXT_ADAPTER_DXVA_KHR                      0x2027
+
+/* cl_mem_info */
+#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR                0x2028
+#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR                0x2029
+
+/* cl_image_info */
+#define CL_IMAGE_DX9_MEDIA_PLANE_KHR                     0x202A
+
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR        0x202B
+#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR        0x202C
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
+    cl_platform_id                   platform,
+    cl_uint                          num_media_adapters,
+    cl_dx9_media_adapter_type_khr *  media_adapter_type,
+    void *                           media_adapters,
+    cl_dx9_media_adapter_set_khr     media_adapter_set,
+    cl_uint                          num_entries,
+    cl_device_id *                   devices,
+    cl_uint *                        num_devices) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
+    cl_context                    context,
+    cl_mem_flags                  flags,
+    cl_dx9_media_adapter_type_khr adapter_type,
+    void *                        surface_info,
+    cl_uint                       plane,                                                                          
+    cl_int *                      errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_DX9_MEDIA_SHARING_H */
+
diff --git a/include/CL/cl_egl.h b/include/CL/cl_egl.h
new file mode 100644
index 0000000..93e6c9c
--- /dev/null
+++ b/include/CL/cl_egl.h
@@ -0,0 +1,133 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2010 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_EGL_H
+#define __OPENCL_CL_EGL_H
+
+#ifdef __APPLE__
+
+#else
+#include <CL/cl.h>
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#endif  
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
+#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR  0x202F
+#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR    0x202D
+#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR    0x202E
+
+/* Error type for clCreateFromEGLImageKHR */
+#define CL_INVALID_EGL_OBJECT_KHR             -1093
+#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR      -1092
+
+/* CLeglImageKHR is an opaque handle to an EGLImage */
+typedef void* CLeglImageKHR;
+
+/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
+typedef void* CLeglDisplayKHR;
+
+/* CLeglSyncKHR is an opaque handle to an EGLSync object */
+typedef void* CLeglSyncKHR;
+
+/* properties passed to clCreateFromEGLImageKHR */
+typedef intptr_t cl_egl_image_properties_khr;
+
+
+#define cl_khr_egl_image 1
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromEGLImageKHR(cl_context                  /* context */,
+                        CLeglDisplayKHR             /* egldisplay */,
+                        CLeglImageKHR               /* eglimage */,
+                        cl_mem_flags                /* flags */,
+                        const cl_egl_image_properties_khr * /* properties */,
+                        cl_int *                    /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
+	cl_context                  context,
+	CLeglDisplayKHR             egldisplay,
+	CLeglImageKHR               eglimage,
+	cl_mem_flags                flags,
+	const cl_egl_image_properties_khr * properties,
+	cl_int *                    errcode_ret);
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireEGLObjectsKHR(cl_command_queue /* command_queue */,
+                              cl_uint          /* num_objects */,
+                              const cl_mem *   /* mem_objects */,
+                              cl_uint          /* num_events_in_wait_list */,
+                              const cl_event * /* event_wait_list */,
+                              cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
+	cl_command_queue command_queue,
+	cl_uint          num_objects,
+	const cl_mem *   mem_objects,
+	cl_uint          num_events_in_wait_list,
+	const cl_event * event_wait_list,
+	cl_event *       event);
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseEGLObjectsKHR(cl_command_queue /* command_queue */,
+                              cl_uint          /* num_objects */,
+                              const cl_mem *   /* mem_objects */,
+                              cl_uint          /* num_events_in_wait_list */,
+                              const cl_event * /* event_wait_list */,
+                              cl_event *       /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
+	cl_command_queue command_queue,
+	cl_uint          num_objects,
+	const cl_mem *   mem_objects,
+	cl_uint          num_events_in_wait_list,
+	const cl_event * event_wait_list,
+	cl_event *       event);
+
+
+#define cl_khr_egl_event 1
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromEGLSyncKHR(cl_context      /* context */,
+                            CLeglSyncKHR    /* sync */,
+                            CLeglDisplayKHR /* display */,
+                            cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
+	cl_context      context,
+	CLeglSyncKHR    sync,
+	CLeglDisplayKHR display,
+	cl_int *        errcode_ret);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_EGL_H */
diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h
new file mode 100644
index 0000000..710bea8
--- /dev/null
+++ b/include/CL/cl_ext.h
@@ -0,0 +1,316 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2013 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */
+
+/* cl_ext.h contains OpenCL extensions which don't have external */
+/* (OpenGL, D3D) dependencies.                                   */
+
+#ifndef __CL_EXT_H
+#define __CL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+        #include <OpenCL/cl.h>
+    #include <AvailabilityMacros.h>
+#else
+        #include <CL/cl.h>
+#endif
+
+/* cl_khr_fp16 extension - no extension #define since it has no functions  */
+#define CL_DEVICE_HALF_FP_CONFIG                    0x1033
+
+/* Memory object destruction
+ *
+ * Apple extension used to manage externally allocated buffers used with cl_mem objects created with CL_MEM_USE_HOST_PTR.
+ *
+ * Registers a user callback function that will be called when the memory object is deleted and its resources 
+ * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback 
+ * stack associated with memobj. The registered user callback functions are called in the reverse order in 
+ * which they were registered. The user callback functions are called and then the memory object is deleted 
+ * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be 
+ * notified when the memory referenced by host_ptr, specified when the memory object is created and used as 
+ * the storage bits for the memory object, can be reused or freed.
+ *
+ * The application may not call CL APIs with the cl_mem object passed to the pfn_notify.
+ *
+ * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ */
+#define cl_APPLE_SetMemObjectDestructor 1
+cl_int  CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem /* memobj */, 
+                                        void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), 
+                                        void * /*user_data */ )             CL_EXT_SUFFIX__VERSION_1_0;  
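+
+/* Usage sketch (illustrative; assumes the extension is reported by the device and
+ * that buf was created with CL_MEM_USE_HOST_PTR over host_ptr):
+ *
+ *   static void host_ptr_released(cl_mem memobj, void *user_data)
+ *   {
+ *       free(user_data);    // host_ptr may be reused or freed at this point
+ *   }
+ *   ...
+ *   clSetMemObjectDestructorAPPLE(buf, host_ptr_released, host_ptr);
+ */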
+
+
+/* Context Logging Functions
+ *
+ * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
+ * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ *
+ * clLogMessagesToSystemLog forwards all log messages to the Apple System Logger.
+ */
+#define cl_APPLE_ContextLoggingFunctions 1
+extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE(  const char * /* errstr */, 
+                                            const void * /* private_info */, 
+                                            size_t       /* cb */, 
+                                            void *       /* user_data */ )  CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
+extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE(   const char * /* errstr */, 
+                                          const void * /* private_info */, 
+                                          size_t       /* cb */, 
+                                          void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
+extern void CL_API_ENTRY clLogMessagesToStderrAPPLE(   const char * /* errstr */, 
+                                          const void * /* private_info */, 
+                                          size_t       /* cb */, 
+                                          void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
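+
+/* Usage sketch (illustrative; props, device and err are assumed): the functions
+ * above match the clCreateContext notification callback signature, e.g.
+ *
+ *   cl_context ctx = clCreateContext(props, 1, &device,
+ *                                    clLogMessagesToStderrAPPLE, NULL, &err);
+ */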
+
+
+/************************ 
+* cl_khr_icd extension *                                                  
+************************/
+#define cl_khr_icd 1
+
+/* cl_platform_info                                                        */
+#define CL_PLATFORM_ICD_SUFFIX_KHR                  0x0920
+
+/* Additional Error Codes                                                  */
+#define CL_PLATFORM_NOT_FOUND_KHR                   -1001
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clIcdGetPlatformIDsKHR(cl_uint          /* num_entries */,
+                       cl_platform_id * /* platforms */,
+                       cl_uint *        /* num_platforms */);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
+    cl_uint          /* num_entries */,
+    cl_platform_id * /* platforms */,
+    cl_uint *        /* num_platforms */);
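+
+/* Usage sketch (illustrative): the ICD suffix is queried like any other platform
+ * string, e.g.
+ *
+ *   char suffix[32];
+ *   clGetPlatformInfo(platform, CL_PLATFORM_ICD_SUFFIX_KHR, sizeof(suffix), suffix, NULL);
+ */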
+
+
+/* Extension: cl_khr_image2D_buffer
+ *
+ * This extension allows a 2D image to be created from a cl_mem buffer without a copy.
+ * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t.
+ * Both the sampler and sampler-less read_image built-in functions are supported for 2D images
+ * and 2D images created from a buffer.  Similarly, the write_image built-ins are also supported
+ * for 2D images created from a buffer.
+ *
+ * When the 2D image from buffer is created, the client must specify the width,
+ * height, image format (i.e. channel order and channel data type) and, optionally, the row pitch.
+ *
+ * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels.
+ * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels.
+ */
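+
+/* A minimal sketch of the above (illustrative; assumes an OpenCL 1.2 clCreateImage
+ * call and that buf, width, height, row_pitch and format satisfy the constraints
+ * described above):
+ *
+ *   cl_image_desc desc;
+ *   memset(&desc, 0, sizeof(desc));
+ *   desc.image_type      = CL_MEM_OBJECT_IMAGE2D;
+ *   desc.image_width     = width;
+ *   desc.image_height    = height;
+ *   desc.image_row_pitch = row_pitch;   // multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels
+ *   desc.buffer          = buf;         // existing cl_mem buffer; no copy is made
+ *   cl_mem image2d = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
+ */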
+    
+/**************************************
+ * cl_khr_initialize_memory extension *
+ **************************************/
+    
+#define CL_CONTEXT_MEMORY_INITIALIZE_KHR            0x200E
+    
+    
+/**************************************
+ * cl_khr_terminate_context extension *
+ **************************************/
+    
+#define CL_DEVICE_TERMINATE_CAPABILITY_KHR          0x200F
+#define CL_CONTEXT_TERMINATE_KHR                    0x2010
+
+#define cl_khr_terminate_context 1
+extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+    
+    
+/*
+ * Extension: cl_khr_spir
+ *
+ * This extension adds support for creating an OpenCL program object from a
+ * Standard Portable Intermediate Representation (SPIR) instance.
+ */
+
+#define CL_DEVICE_SPIR_VERSIONS                     0x40E0
+#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE         0x40E1
+
+
+/******************************************
+* cl_nv_device_attribute_query extension *
+******************************************/
+/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
+#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
+#define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
+#define CL_DEVICE_WARP_SIZE_NV                      0x4003
+#define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
+#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
+#define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
+
+/*********************************
+* cl_amd_device_attribute_query *
+*********************************/
+#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD        0x4036
+
+/*********************************
+* cl_arm_printf extension
+*********************************/
+#define CL_PRINTF_CALLBACK_ARM                      0x40B0
+#define CL_PRINTF_BUFFERSIZE_ARM                    0x40B1
+
+#ifdef CL_VERSION_1_1
+   /***********************************
+    * cl_ext_device_fission extension *
+    ***********************************/
+    #define cl_ext_device_fission   1
+    
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; 
+    
+    typedef CL_API_ENTRY cl_int 
+    (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; 
+    
+    typedef CL_API_ENTRY cl_int 
+    (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    typedef cl_ulong  cl_device_partition_property_ext;
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clCreateSubDevicesEXT(  cl_device_id /*in_device*/,
+                            const cl_device_partition_property_ext * /* properties */,
+                            cl_uint /*num_entries*/,
+                            cl_device_id * /*out_devices*/,
+                            cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    typedef CL_API_ENTRY cl_int 
+    ( CL_API_CALL * clCreateSubDevicesEXT_fn)(  cl_device_id /*in_device*/,
+                                                const cl_device_partition_property_ext * /* properties */,
+                                                cl_uint /*num_entries*/,
+                                                cl_device_id * /*out_devices*/,
+                                                cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    /* cl_device_partition_property_ext */
+    #define CL_DEVICE_PARTITION_EQUALLY_EXT             0x4050
+    #define CL_DEVICE_PARTITION_BY_COUNTS_EXT           0x4051
+    #define CL_DEVICE_PARTITION_BY_NAMES_EXT            0x4052
+    #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT  0x4053
+    
+    /* clDeviceGetInfo selectors */
+    #define CL_DEVICE_PARENT_DEVICE_EXT                 0x4054
+    #define CL_DEVICE_PARTITION_TYPES_EXT               0x4055
+    #define CL_DEVICE_AFFINITY_DOMAINS_EXT              0x4056
+    #define CL_DEVICE_REFERENCE_COUNT_EXT               0x4057
+    #define CL_DEVICE_PARTITION_STYLE_EXT               0x4058
+    
+    /* error codes */
+    #define CL_DEVICE_PARTITION_FAILED_EXT              -1057
+    #define CL_INVALID_PARTITION_COUNT_EXT              -1058
+    #define CL_INVALID_PARTITION_NAME_EXT               -1059
+    
+    /* CL_AFFINITY_DOMAINs */
+    #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT             0x1
+    #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT             0x2
+    #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT             0x3
+    #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT             0x4
+    #define CL_AFFINITY_DOMAIN_NUMA_EXT                 0x10
+    #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT     0x100
+    
+    /* cl_device_partition_property_ext list terminators */
+    #define CL_PROPERTIES_LIST_END_EXT                  ((cl_device_partition_property_ext) 0)
+    #define CL_PARTITION_BY_COUNTS_LIST_END_EXT         ((cl_device_partition_property_ext) 0)
+    #define CL_PARTITION_BY_NAMES_LIST_END_EXT          ((cl_device_partition_property_ext) 0 - 1)
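+
+    /* Sketch only: partitioning a device into two equal sub-devices with the
+     * tokens above; `parent` is a hypothetical, pre-existing cl_device_id.
+     *
+     *   cl_device_partition_property_ext props[] = {
+     *       CL_DEVICE_PARTITION_EQUALLY_EXT, 2, CL_PROPERTIES_LIST_END_EXT
+     *   };
+     *   cl_device_id sub[2];
+     *   cl_uint      n;
+     *   cl_int err = clCreateSubDevicesEXT(parent, props, 2, sub, &n);
+     */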
+
+/*********************************
+* cl_qcom_ext_host_ptr extension
+*********************************/
+
+#define CL_MEM_EXT_HOST_PTR_QCOM                  (1 << 29)
+
+#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM   0x40A0      
+#define CL_DEVICE_PAGE_SIZE_QCOM                  0x40A1
+#define CL_IMAGE_ROW_ALIGNMENT_QCOM               0x40A2
+#define CL_IMAGE_SLICE_ALIGNMENT_QCOM             0x40A3
+#define CL_MEM_HOST_UNCACHED_QCOM                 0x40A4
+#define CL_MEM_HOST_WRITEBACK_QCOM                0x40A5
+#define CL_MEM_HOST_WRITETHROUGH_QCOM             0x40A6
+#define CL_MEM_HOST_WRITE_COMBINING_QCOM          0x40A7
+
+typedef cl_uint                                   cl_image_pitch_info_qcom;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceImageInfoQCOM(cl_device_id             device,
+                         size_t                   image_width,
+                         size_t                   image_height,
+                         const cl_image_format   *image_format,
+                         cl_image_pitch_info_qcom param_name,
+                         size_t                   param_value_size,
+                         void                    *param_value,
+                         size_t                  *param_value_size_ret);
+
+typedef struct _cl_mem_ext_host_ptr
+{
+    /* Type of external memory allocation. */
+    /* Legal values will be defined in layered extensions. */
+    cl_uint  allocation_type;
+            
+	/* Host cache policy for this external memory allocation. */
+    cl_uint  host_cache_policy;
+
+} cl_mem_ext_host_ptr;
+
+/*********************************
+* cl_qcom_ion_host_ptr extension
+*********************************/
+
+#define CL_MEM_ION_HOST_PTR_QCOM                  0x40A8
+
+typedef struct _cl_mem_ion_host_ptr
+{
+    /* Type of external memory allocation. */
+    /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
+    cl_mem_ext_host_ptr  ext_host_ptr;
+
+    /* ION file descriptor */
+    int                  ion_filedesc;
+            
+    /* Host pointer to the ION allocated memory */
+    void*                ion_hostptr;
+
+} cl_mem_ion_host_ptr;
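+
+/* Sketch only, based on the structure layout above: an ION allocation is
+ * imported by passing a cl_mem_ion_host_ptr as the host_ptr argument of
+ * clCreateBuffer together with CL_MEM_EXT_HOST_PTR_QCOM.  `ctx`, `fd`,
+ * `mapped` and `size` are hypothetical.
+ *
+ *   cl_mem_ion_host_ptr ion = { { CL_MEM_ION_HOST_PTR_QCOM,
+ *                                 CL_MEM_HOST_UNCACHED_QCOM },
+ *                               fd, mapped };
+ *   cl_int err;
+ *   cl_mem buf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR | CL_MEM_EXT_HOST_PTR_QCOM,
+ *                               size, &ion, &err);
+ */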
+
+#endif /* CL_VERSION_1_1 */
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* __CL_EXT_H */
diff --git a/include/CL/cl_gl.h b/include/CL/cl_gl.h
new file mode 100644
index 0000000..e52c1b6
--- /dev/null
+++ b/include/CL/cl_gl.h
@@ -0,0 +1,162 @@
+/**********************************************************************************
+ * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+#ifndef __OPENCL_CL_GL_H
+#define __OPENCL_CL_GL_H
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif	
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef cl_uint     cl_gl_object_type;
+typedef cl_uint     cl_gl_texture_info;
+typedef cl_uint     cl_gl_platform_info;
+typedef struct __GLsync *cl_GLsync;
+
+/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken           */
+#define CL_GL_OBJECT_BUFFER                     0x2000
+#define CL_GL_OBJECT_TEXTURE2D                  0x2001
+#define CL_GL_OBJECT_TEXTURE3D                  0x2002
+#define CL_GL_OBJECT_RENDERBUFFER               0x2003
+#define CL_GL_OBJECT_TEXTURE2D_ARRAY            0x200E
+#define CL_GL_OBJECT_TEXTURE1D                  0x200F
+#define CL_GL_OBJECT_TEXTURE1D_ARRAY            0x2010
+#define CL_GL_OBJECT_TEXTURE_BUFFER             0x2011
+
+/* cl_gl_texture_info           */
+#define CL_GL_TEXTURE_TARGET                    0x2004
+#define CL_GL_MIPMAP_LEVEL                      0x2005
+#define CL_GL_NUM_SAMPLES                       0x2012
+
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLBuffer(cl_context     /* context */,
+                     cl_mem_flags   /* flags */,
+                     cl_GLuint      /* bufobj */,
+                     int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLTexture(cl_context      /* context */,
+                      cl_mem_flags    /* flags */,
+                      cl_GLenum       /* target */,
+                      cl_GLint        /* miplevel */,
+                      cl_GLuint       /* texture */,
+                      cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+    
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLRenderbuffer(cl_context   /* context */,
+                           cl_mem_flags /* flags */,
+                           cl_GLuint    /* renderbuffer */,
+                           cl_int *     /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLObjectInfo(cl_mem                /* memobj */,
+                  cl_gl_object_type *   /* gl_object_type */,
+                  cl_GLuint *           /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
+                  
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLTextureInfo(cl_mem               /* memobj */,
+                   cl_gl_texture_info   /* param_name */,
+                   size_t               /* param_value_size */,
+                   void *               /* param_value */,
+                   size_t *             /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireGLObjects(cl_command_queue      /* command_queue */,
+                          cl_uint               /* num_objects */,
+                          const cl_mem *        /* mem_objects */,
+                          cl_uint               /* num_events_in_wait_list */,
+                          const cl_event *      /* event_wait_list */,
+                          cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseGLObjects(cl_command_queue      /* command_queue */,
+                          cl_uint               /* num_objects */,
+                          const cl_mem *        /* mem_objects */,
+                          cl_uint               /* num_events_in_wait_list */,
+                          const cl_event *      /* event_wait_list */,
+                          cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
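+
+/* Sketch only: the usual interop pattern around the two calls above.  GL must
+ * be finished with the objects before they are acquired, and CL must release
+ * them (and flush the queue) before GL touches them again.  `queue`, `mem`,
+ * `kernel` and `gws` are hypothetical.
+ *
+ *   glFinish();                                         GL side is done
+ *   clEnqueueAcquireGLObjects(queue, 1, &mem, 0, NULL, NULL);
+ *   clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &gws, NULL, 0, NULL, NULL);
+ *   clEnqueueReleaseGLObjects(queue, 1, &mem, 0, NULL, NULL);
+ *   clFinish(queue);                                     CL side is done
+ */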
+
+
+/* Deprecated OpenCL 1.1 APIs */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture2D(cl_context      /* context */,
+                        cl_mem_flags    /* flags */,
+                        cl_GLenum       /* target */,
+                        cl_GLint        /* miplevel */,
+                        cl_GLuint       /* texture */,
+                        cl_int *        /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+    
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture3D(cl_context      /* context */,
+                        cl_mem_flags    /* flags */,
+                        cl_GLenum       /* target */,
+                        cl_GLint        /* miplevel */,
+                        cl_GLuint       /* texture */,
+                        cl_int *        /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+    
+/* cl_khr_gl_sharing extension  */
+    
+#define cl_khr_gl_sharing 1
+    
+typedef cl_uint     cl_gl_context_info;
+    
+/* Additional Error Codes  */
+#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR  -1000
+    
+/* cl_gl_context_info  */
+#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR    0x2006
+#define CL_DEVICES_FOR_GL_CONTEXT_KHR           0x2007
+    
+/* Additional cl_context_properties  */
+#define CL_GL_CONTEXT_KHR                       0x2008
+#define CL_EGL_DISPLAY_KHR                      0x2009
+#define CL_GLX_DISPLAY_KHR                      0x200A
+#define CL_WGL_HDC_KHR                          0x200B
+#define CL_CGL_SHAREGROUP_KHR                   0x200C
+    
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
+                      cl_gl_context_info            /* param_name */,
+                      size_t                        /* param_value_size */,
+                      void *                        /* param_value */,
+                      size_t *                      /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+    
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
+    const cl_context_properties * properties,
+    cl_gl_context_info            param_name,
+    size_t                        param_value_size,
+    void *                        param_value,
+    size_t *                      param_value_size_ret);
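+
+/* Sketch only: creating a shared context on GLX with the properties above,
+ * first querying the interop-capable device with clGetGLContextInfoKHR.
+ * `platform`, `glx_ctx` and `x_display` are hypothetical, pre-existing handles.
+ *
+ *   cl_context_properties props[] = {
+ *       CL_GL_CONTEXT_KHR,   (cl_context_properties) glx_ctx,
+ *       CL_GLX_DISPLAY_KHR,  (cl_context_properties) x_display,
+ *       CL_CONTEXT_PLATFORM, (cl_context_properties) platform,
+ *       0
+ *   };
+ *   cl_device_id dev;
+ *   size_t sz;
+ *   clGetGLContextInfoKHR(props, CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR,
+ *                         sizeof(dev), &dev, &sz);
+ *   cl_context ctx = clCreateContext(props, 1, &dev, NULL, NULL, NULL);
+ */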
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_GL_H */
diff --git a/include/CL/cl_gl_ext.h b/include/CL/cl_gl_ext.h
new file mode 100644
index 0000000..77d5353
--- /dev/null
+++ b/include/CL/cl_gl_ext.h
@@ -0,0 +1,69 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have           */
+/* OpenGL dependencies.                                                         */
+
+#ifndef __OPENCL_CL_GL_EXT_H
+#define __OPENCL_CL_GL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+    #include <OpenCL/cl_gl.h>
+#else
+    #include <CL/cl_gl.h>
+#endif
+
+/*
+ * For each extension, follow this template
+ *  cl_VEN_extname extension  */
+/* #define cl_VEN_extname 1
+ * ... define new types, if any
+ * ... define new tokens, if any
+ * ... define new APIs, if any
+ *
+ *  If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header
+ *  This allows us to avoid having to decide whether to include GL headers or GLES here.
+ */
+
+/* 
+ *  cl_khr_gl_event  extension
+ *  See section 9.9 in the OpenCL 1.1 spec for more information
+ */
+#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR     0x200D
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromGLsyncKHR(cl_context           /* context */,
+                           cl_GLsync            /* cl_GLsync */,
+                           cl_int *             /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
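+
+/* Sketch only: turning a GL fence into a CL event so a CL command can wait on
+ * GL work without a full glFinish().  `ctx`, `queue` and `mem` are
+ * hypothetical, pre-existing objects.
+ *
+ *   GLsync fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+ *   cl_int err;
+ *   cl_event ev = clCreateEventFromGLsyncKHR(ctx, (cl_GLsync) fence, &err);
+ *   clEnqueueAcquireGLObjects(queue, 1, &mem, 1, &ev, NULL);
+ */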
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* __OPENCL_CL_GL_EXT_H  */
diff --git a/include/CL/cl_intel.h b/include/CL/cl_intel.h
new file mode 100644
index 0000000..f2fe9d4
--- /dev/null
+++ b/include/CL/cl_intel.h
@@ -0,0 +1,141 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __OPENCL_CL_INTEL_H
+#define __OPENCL_CL_INTEL_H
+
+#include "CL/cl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CL_MEM_PINNABLE (1 << 10)
+
+/* Track allocations and report current number of unfreed allocations */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReportUnfreedIntel(void);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clReportUnfreedIntel_fn)(void);
+
+/* 1 to 1 mapping of drm_intel_bo_map */
+extern CL_API_ENTRY void* CL_API_CALL
+clMapBufferIntel(cl_mem, cl_int*);
+
+typedef CL_API_ENTRY void* (CL_API_CALL *clMapBufferIntel_fn)(cl_mem, cl_int*);
+
+/* 1 to 1 mapping of drm_intel_bo_unmap */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clUnmapBufferIntel(cl_mem);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clUnmapBufferIntel_fn)(cl_mem);
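+
+/* Sketch only: these two beignet-specific entry points mirror
+ * drm_intel_bo_map/unmap, giving a CPU pointer to the buffer without going
+ * through clEnqueueMapBuffer.  `buf` is a hypothetical, pre-existing cl_mem.
+ *
+ *   cl_int err;
+ *   void *ptr = clMapBufferIntel(buf, &err);
+ *   if (ptr) {
+ *       read or write the buffer contents through ptr
+ *       clUnmapBufferIntel(buf);
+ *   }
+ */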
+
+/* 1 to 1 mapping of drm_intel_gem_bo_map_gtt */
+extern CL_API_ENTRY void* CL_API_CALL
+clMapBufferGTTIntel(cl_mem, cl_int*);
+
+typedef CL_API_ENTRY void* (CL_API_CALL *clMapBufferGTTIntel_fn)(cl_mem, cl_int*);
+
+/* 1 to 1 mapping of drm_intel_gem_bo_unmap_gtt */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clUnmapBufferGTTIntel(cl_mem);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clUnmapBufferGTTIntel_fn)(cl_mem);
+
+/* Pin /Unpin the buffer in GPU memory (must be root) */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clPinBufferIntel(cl_mem);
+extern CL_API_ENTRY cl_int CL_API_CALL
+clUnpinBufferIntel(cl_mem);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clPinBufferIntel_fn)(cl_mem);
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clUnpinBufferIntel_fn)(cl_mem);
+
+/* Get the generation of the Gen device (used to load the proper binary) */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGenVersionIntel(cl_device_id device, cl_int *ver);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGenVersionIntel_fn)(
+                             cl_device_id device,
+                             cl_int *ver);
+
+/* Create a program from a LLVM source file */
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithLLVMIntel(cl_context              /* context */,
+                             cl_uint                 /* num_devices */,
+                             const cl_device_id *    /* device_list */,
+                             const char *            /* file */,
+                             cl_int *                /* errcode_ret */);
+
+typedef CL_API_ENTRY cl_program (CL_API_CALL *clCreateProgramWithLLVMIntel_fn)(
+                                 cl_context              /* context */,
+                                 cl_uint                 /* num_devices */,
+                                 const cl_device_id *    /* device_list */,
+                                 const char *            /* file */,
+                                 cl_int *                /* errcode_ret */);
+
+/* Create buffer from libva's buffer object */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBufferFromLibvaIntel(cl_context      /* context */,
+                             unsigned int    /* bo_name */,
+                             cl_int *        /* errcode_ret */);
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateBufferFromLibvaIntel_fn)(
+                             cl_context     /* context */,
+                             unsigned int   /* bo_name */,
+                             cl_int *       /* errcode_ret */);
+
+/* Create image from libva's buffer object */
+typedef struct _cl_libva_image {
+    unsigned int            bo_name;
+    uint32_t                offset;
+    uint32_t                width;
+    uint32_t                height;
+    cl_image_format         fmt;
+    uint32_t                row_pitch;
+    uint32_t                reserved[8];
+} cl_libva_image;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImageFromLibvaIntel(cl_context               /* context */,
+                            const cl_libva_image *   /* info */,
+                            cl_int *                 /* errcode_ret */);
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateImageFromLibvaIntel_fn)(
+                             cl_context             /* context */,
+                             const cl_libva_image * /* info */,
+                             cl_int *               /* errcode_ret */);
+
+/* Get a file descriptor for a memory object */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetMemObjectFdIntel(cl_context   /* context */,
+                      cl_mem       /* Memory Object */,
+                      int*         /* returned fd */);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetMemObjectFdIntel_fn)(
+                             cl_context   /* context */,
+                             cl_mem       /* Memory Object */,
+                             int*         /* returned fd */);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_INTEL_H */
+
diff --git a/include/CL/cl_platform.h b/include/CL/cl_platform.h
new file mode 100644
index 0000000..7f6f5e8
--- /dev/null
+++ b/include/CL/cl_platform.h
@@ -0,0 +1,1278 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */
+
+#ifndef __CL_PLATFORM_H
+#define __CL_PLATFORM_H
+
+#ifdef __APPLE__
+    /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */
+    #include <AvailabilityMacros.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_WIN32)
+    #define CL_API_ENTRY
+    #define CL_API_CALL     __stdcall
+    #define CL_CALLBACK     __stdcall
+#else
+    #define CL_API_ENTRY
+    #define CL_API_CALL
+    #define CL_CALLBACK
+#endif
+
+#ifdef __APPLE__
+    #define CL_EXTENSION_WEAK_LINK       __attribute__((weak_import))
+    #define CL_API_SUFFIX__VERSION_1_0                  AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+    #define CL_EXT_SUFFIX__VERSION_1_0                  CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+    #define CL_API_SUFFIX__VERSION_1_1                  AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+    #define GCL_API_SUFFIX__VERSION_1_1                 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+    #define CL_EXT_SUFFIX__VERSION_1_1                  CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+    #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED       CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
+    
+    #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+        #define CL_API_SUFFIX__VERSION_1_2              AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+        #define GCL_API_SUFFIX__VERSION_1_2             AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+        #define CL_EXT_SUFFIX__VERSION_1_2              CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+        #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+        #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED   CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
+    #else
+        #warning  This path should never happen outside of internal operating system development.  AvailabilityMacros do not function correctly here!
+        #define CL_API_SUFFIX__VERSION_1_2              AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+        #define GCL_API_SUFFIX__VERSION_1_2             AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+        #define CL_EXT_SUFFIX__VERSION_1_2              CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+        #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED   CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+    #endif
+#else
+    #define CL_EXTENSION_WEAK_LINK  
+    #define CL_API_SUFFIX__VERSION_1_0
+    #define CL_EXT_SUFFIX__VERSION_1_0
+    #define CL_API_SUFFIX__VERSION_1_1
+    #define CL_EXT_SUFFIX__VERSION_1_1
+    #define CL_API_SUFFIX__VERSION_1_2
+    #define CL_EXT_SUFFIX__VERSION_1_2
+    
+    #ifdef __GNUC__
+        #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED    
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated))
+            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED    
+        #endif
+    
+        #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
+            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED    
+            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED    
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated))
+            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED    
+        #endif
+    #elif _WIN32
+        #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED    
+            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED    
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED 
+            #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated)     
+        #endif
+    
+        #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
+            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED    
+        #else
+            #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED 
+            #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated)     
+        #endif
+    #else
+        #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+    
+        #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+    #endif
+#endif
+
+#if (defined (_WIN32) && defined(_MSC_VER))
+
+/* scalar types  */
+typedef signed   __int8         cl_char;
+typedef unsigned __int8         cl_uchar;
+typedef signed   __int16        cl_short;
+typedef unsigned __int16        cl_ushort;
+typedef signed   __int32        cl_int;
+typedef unsigned __int32        cl_uint;
+typedef signed   __int64        cl_long;
+typedef unsigned __int64        cl_ulong;
+
+typedef unsigned __int16        cl_half;
+typedef float                   cl_float;
+typedef double                  cl_double;
+
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT         8
+#define CL_SCHAR_MAX        127
+#define CL_SCHAR_MIN        (-127-1)
+#define CL_CHAR_MAX         CL_SCHAR_MAX
+#define CL_CHAR_MIN         CL_SCHAR_MIN
+#define CL_UCHAR_MAX        255
+#define CL_SHRT_MAX         32767
+#define CL_SHRT_MIN         (-32767-1)
+#define CL_USHRT_MAX        65535
+#define CL_INT_MAX          2147483647
+#define CL_INT_MIN          (-2147483647-1)
+#define CL_UINT_MAX         0xffffffffU
+#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+
+#define CL_FLT_DIG          6
+#define CL_FLT_MANT_DIG     24
+#define CL_FLT_MAX_10_EXP   +38
+#define CL_FLT_MAX_EXP      +128
+#define CL_FLT_MIN_10_EXP   -37
+#define CL_FLT_MIN_EXP      -125
+#define CL_FLT_RADIX        2
+#define CL_FLT_MAX          340282346638528859811704183484516925440.0f
+#define CL_FLT_MIN          1.175494350822287507969e-38f
+#define CL_FLT_EPSILON      0x1.0p-23f
+
+#define CL_DBL_DIG          15
+#define CL_DBL_MANT_DIG     53
+#define CL_DBL_MAX_10_EXP   +308
+#define CL_DBL_MAX_EXP      +1024
+#define CL_DBL_MIN_10_EXP   -307
+#define CL_DBL_MIN_EXP      -1021
+#define CL_DBL_RADIX        2
+#define CL_DBL_MAX          179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
+#define CL_DBL_MIN          2.225073858507201383090e-308
+#define CL_DBL_EPSILON      2.220446049250313080847e-16
+
+#define  CL_M_E             2.718281828459045090796
+#define  CL_M_LOG2E         1.442695040888963387005
+#define  CL_M_LOG10E        0.434294481903251816668
+#define  CL_M_LN2           0.693147180559945286227
+#define  CL_M_LN10          2.302585092994045901094
+#define  CL_M_PI            3.141592653589793115998
+#define  CL_M_PI_2          1.570796326794896557999
+#define  CL_M_PI_4          0.785398163397448278999
+#define  CL_M_1_PI          0.318309886183790691216
+#define  CL_M_2_PI          0.636619772367581382433
+#define  CL_M_2_SQRTPI      1.128379167095512558561
+#define  CL_M_SQRT2         1.414213562373095145475
+#define  CL_M_SQRT1_2       0.707106781186547572737
+
+#define  CL_M_E_F           2.71828174591064f
+#define  CL_M_LOG2E_F       1.44269502162933f
+#define  CL_M_LOG10E_F      0.43429449200630f
+#define  CL_M_LN2_F         0.69314718246460f
+#define  CL_M_LN10_F        2.30258512496948f
+#define  CL_M_PI_F          3.14159274101257f
+#define  CL_M_PI_2_F        1.57079637050629f
+#define  CL_M_PI_4_F        0.78539818525314f
+#define  CL_M_1_PI_F        0.31830987334251f
+#define  CL_M_2_PI_F        0.63661974668503f
+#define  CL_M_2_SQRTPI_F    1.12837922573090f
+#define  CL_M_SQRT2_F       1.41421353816986f
+#define  CL_M_SQRT1_2_F     0.70710676908493f
+
+#define CL_NAN              (CL_INFINITY - CL_INFINITY)
+#define CL_HUGE_VALF        ((cl_float) 1e50)
+#define CL_HUGE_VAL         ((cl_double) 1e500)
+#define CL_MAXFLOAT         CL_FLT_MAX
+#define CL_INFINITY         CL_HUGE_VALF
+
+#else
+
+#include <stdint.h>
+
+/* scalar types  */
+typedef int8_t          cl_char;
+typedef uint8_t         cl_uchar;
+typedef int16_t         cl_short    __attribute__((aligned(2)));
+typedef uint16_t        cl_ushort   __attribute__((aligned(2)));
+typedef int32_t         cl_int      __attribute__((aligned(4)));
+typedef uint32_t        cl_uint     __attribute__((aligned(4)));
+typedef int64_t         cl_long     __attribute__((aligned(8)));
+typedef uint64_t        cl_ulong    __attribute__((aligned(8)));
+
+typedef uint16_t        cl_half     __attribute__((aligned(2)));
+typedef float           cl_float    __attribute__((aligned(4)));
+typedef double          cl_double   __attribute__((aligned(8)));
+
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT         8
+#define CL_SCHAR_MAX        127
+#define CL_SCHAR_MIN        (-127-1)
+#define CL_CHAR_MAX         CL_SCHAR_MAX
+#define CL_CHAR_MIN         CL_SCHAR_MIN
+#define CL_UCHAR_MAX        255
+#define CL_SHRT_MAX         32767
+#define CL_SHRT_MIN         (-32767-1)
+#define CL_USHRT_MAX        65535
+#define CL_INT_MAX          2147483647
+#define CL_INT_MIN          (-2147483647-1)
+#define CL_UINT_MAX         0xffffffffU
+#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+
+#define CL_FLT_DIG          6
+#define CL_FLT_MANT_DIG     24
+#define CL_FLT_MAX_10_EXP   +38
+#define CL_FLT_MAX_EXP      +128
+#define CL_FLT_MIN_10_EXP   -37
+#define CL_FLT_MIN_EXP      -125
+#define CL_FLT_RADIX        2
+#define CL_FLT_MAX          0x1.fffffep127f
+#define CL_FLT_MIN          0x1.0p-126f
+#define CL_FLT_EPSILON      0x1.0p-23f
+
+#define CL_DBL_DIG          15
+#define CL_DBL_MANT_DIG     53
+#define CL_DBL_MAX_10_EXP   +308
+#define CL_DBL_MAX_EXP      +1024
+#define CL_DBL_MIN_10_EXP   -307
+#define CL_DBL_MIN_EXP      -1021
+#define CL_DBL_RADIX        2
+#define CL_DBL_MAX          0x1.fffffffffffffp1023
+#define CL_DBL_MIN          0x1.0p-1022
+#define CL_DBL_EPSILON      0x1.0p-52
+
+#define  CL_M_E             2.718281828459045090796
+#define  CL_M_LOG2E         1.442695040888963387005
+#define  CL_M_LOG10E        0.434294481903251816668
+#define  CL_M_LN2           0.693147180559945286227
+#define  CL_M_LN10          2.302585092994045901094
+#define  CL_M_PI            3.141592653589793115998
+#define  CL_M_PI_2          1.570796326794896557999
+#define  CL_M_PI_4          0.785398163397448278999
+#define  CL_M_1_PI          0.318309886183790691216
+#define  CL_M_2_PI          0.636619772367581382433
+#define  CL_M_2_SQRTPI      1.128379167095512558561
+#define  CL_M_SQRT2         1.414213562373095145475
+#define  CL_M_SQRT1_2       0.707106781186547572737
+
+#define  CL_M_E_F           2.71828174591064f
+#define  CL_M_LOG2E_F       1.44269502162933f
+#define  CL_M_LOG10E_F      0.43429449200630f
+#define  CL_M_LN2_F         0.69314718246460f
+#define  CL_M_LN10_F        2.30258512496948f
+#define  CL_M_PI_F          3.14159274101257f
+#define  CL_M_PI_2_F        1.57079637050629f
+#define  CL_M_PI_4_F        0.78539818525314f
+#define  CL_M_1_PI_F        0.31830987334251f
+#define  CL_M_2_PI_F        0.63661974668503f
+#define  CL_M_2_SQRTPI_F    1.12837922573090f
+#define  CL_M_SQRT2_F       1.41421353816986f
+#define  CL_M_SQRT1_2_F     0.70710676908493f
+
+#if defined( __GNUC__ )
+   #define CL_HUGE_VALF     __builtin_huge_valf()
+   #define CL_HUGE_VAL      __builtin_huge_val()
+   #define CL_NAN           __builtin_nanf( "" )
+#else
+   #define CL_HUGE_VALF     ((cl_float) 1e50)
+   #define CL_HUGE_VAL      ((cl_double) 1e500)
+   float nanf( const char * );
+   #define CL_NAN           nanf( "" )  
+#endif
+#define CL_MAXFLOAT         CL_FLT_MAX
+#define CL_INFINITY         CL_HUGE_VALF
+
+#endif
+
+#include <stddef.h>
+
+/* Mirror types to GL types. Mirror types allow us to avoid deciding which headers to load based on whether we are using GL or GLES here. */
+typedef unsigned int cl_GLuint;
+typedef int          cl_GLint;
+typedef unsigned int cl_GLenum;
+
+/*
+ * Vector types 
+ *
+ *  Note:   OpenCL requires that all types be naturally aligned. 
+ *          This means that vector types must be naturally aligned.
+ *          For example, a vector of four floats must be aligned to
+ *          a 16 byte boundary (calculated as 4 * the natural 4-byte 
+ *          alignment of the float).  The alignment qualifiers here
+ *          will only function properly if your compiler supports them
+ *          and if you don't actively work to defeat them.  For example,
+ *          in order for a cl_float4 to be 16 byte aligned in a struct,
+ *          the start of the struct must itself be 16-byte aligned. 
+ *
+ *          Maintaining proper alignment is the user's responsibility.
+ */
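+
+/* Sketch only: the alignment caveat above in practice, using the cl_float4
+ * union defined further below.  A cl_float4 embedded in a user struct is only
+ * 16-byte aligned if the struct itself is; the named components are available
+ * when CL_HAS_NAMED_VECTOR_FIELDS is defined.
+ *
+ *   cl_float4 v = {{ 1.0f, 2.0f, 3.0f, 4.0f }};   always valid via v.s[i]
+ *   #if defined(CL_HAS_NAMED_VECTOR_FIELDS)
+ *   cl_float sum = v.x + v.y + v.z + v.w;
+ *   #endif
+ */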
+
+/* Define basic vector types */
+#if defined( __VEC__ )
+   #include <altivec.h>   /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
+   typedef vector unsigned char     __cl_uchar16;
+   typedef vector signed char       __cl_char16;
+   typedef vector unsigned short    __cl_ushort8;
+   typedef vector signed short      __cl_short8;
+   typedef vector unsigned int      __cl_uint4;
+   typedef vector signed int        __cl_int4;
+   typedef vector float             __cl_float4;
+   #define  __CL_UCHAR16__  1
+   #define  __CL_CHAR16__   1
+   #define  __CL_USHORT8__  1
+   #define  __CL_SHORT8__   1
+   #define  __CL_UINT4__    1
+   #define  __CL_INT4__     1
+   #define  __CL_FLOAT4__   1
+#endif
+
+#if defined( __SSE__ )
+    #if defined( __MINGW64__ )
+        #include <intrin.h>
+    #else
+        #include <xmmintrin.h>
+    #endif
+    #if defined( __GNUC__ )
+        typedef float __cl_float4   __attribute__((vector_size(16)));
+    #else
+        typedef __m128 __cl_float4;
+    #endif
+    #define __CL_FLOAT4__   1
+#endif
+
+#if defined( __SSE2__ )
+    #if defined( __MINGW64__ )
+        #include <intrin.h>
+    #else
+        #include <emmintrin.h>
+    #endif
+    #if defined( __GNUC__ )
+        typedef cl_uchar    __cl_uchar16    __attribute__((vector_size(16)));
+        typedef cl_char     __cl_char16     __attribute__((vector_size(16)));
+        typedef cl_ushort   __cl_ushort8    __attribute__((vector_size(16)));
+        typedef cl_short    __cl_short8     __attribute__((vector_size(16)));
+        typedef cl_uint     __cl_uint4      __attribute__((vector_size(16)));
+        typedef cl_int      __cl_int4       __attribute__((vector_size(16)));
+        typedef cl_ulong    __cl_ulong2     __attribute__((vector_size(16)));
+        typedef cl_long     __cl_long2      __attribute__((vector_size(16)));
+        typedef cl_double   __cl_double2    __attribute__((vector_size(16)));
+    #else
+        typedef __m128i __cl_uchar16;
+        typedef __m128i __cl_char16;
+        typedef __m128i __cl_ushort8;
+        typedef __m128i __cl_short8;
+        typedef __m128i __cl_uint4;
+        typedef __m128i __cl_int4;
+        typedef __m128i __cl_ulong2;
+        typedef __m128i __cl_long2;
+        typedef __m128d __cl_double2;
+    #endif
+    #define __CL_UCHAR16__  1
+    #define __CL_CHAR16__   1
+    #define __CL_USHORT8__  1
+    #define __CL_SHORT8__   1
+    #define __CL_INT4__     1
+    #define __CL_UINT4__    1
+    #define __CL_ULONG2__   1
+    #define __CL_LONG2__    1
+    #define __CL_DOUBLE2__  1
+#endif
+
+#if defined( __MMX__ )
+    #include <mmintrin.h>
+    #if defined( __GNUC__ )
+        typedef cl_uchar    __cl_uchar8     __attribute__((vector_size(8)));
+        typedef cl_char     __cl_char8      __attribute__((vector_size(8)));
+        typedef cl_ushort   __cl_ushort4    __attribute__((vector_size(8)));
+        typedef cl_short    __cl_short4     __attribute__((vector_size(8)));
+        typedef cl_uint     __cl_uint2      __attribute__((vector_size(8)));
+        typedef cl_int      __cl_int2       __attribute__((vector_size(8)));
+        typedef cl_ulong    __cl_ulong1     __attribute__((vector_size(8)));
+        typedef cl_long     __cl_long1      __attribute__((vector_size(8)));
+        typedef cl_float    __cl_float2     __attribute__((vector_size(8)));
+    #else
+        typedef __m64       __cl_uchar8;
+        typedef __m64       __cl_char8;
+        typedef __m64       __cl_ushort4;
+        typedef __m64       __cl_short4;
+        typedef __m64       __cl_uint2;
+        typedef __m64       __cl_int2;
+        typedef __m64       __cl_ulong1;
+        typedef __m64       __cl_long1;
+        typedef __m64       __cl_float2;
+    #endif
+    #define __CL_UCHAR8__   1
+    #define __CL_CHAR8__    1
+    #define __CL_USHORT4__  1
+    #define __CL_SHORT4__   1
+    #define __CL_INT2__     1
+    #define __CL_UINT2__    1
+    #define __CL_ULONG1__   1
+    #define __CL_LONG1__    1
+    #define __CL_FLOAT2__   1
+#endif
+
+#if defined( __AVX__ )
+    #if defined( __MINGW64__ )
+        #include <intrin.h>
+    #else
+        #include <immintrin.h> 
+    #endif
+    #if defined( __GNUC__ )
+        typedef cl_float    __cl_float8     __attribute__((vector_size(32)));
+        typedef cl_double   __cl_double4    __attribute__((vector_size(32)));
+    #else
+        typedef __m256      __cl_float8;
+        typedef __m256d     __cl_double4;
+    #endif
+    #define __CL_FLOAT8__   1
+    #define __CL_DOUBLE4__  1
+#endif
+
+/* Define capabilities for anonymous struct members. */
+#if defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+#define  __CL_HAS_ANON_STRUCT__ 1
+#define  __CL_ANON_STRUCT__ __extension__
+#elif defined( _WIN32) && (_MSC_VER >= 1500)
+   /* Microsoft Developer Studio 2008 supports anonymous structs, but
+    * complains by default. */
+#define  __CL_HAS_ANON_STRUCT__ 1
+#define  __CL_ANON_STRUCT__
+   /* Disable warning C4201: nonstandard extension used : nameless
+    * struct/union */
+#pragma warning( push )
+#pragma warning( disable : 4201 )
+#else
+#define  __CL_HAS_ANON_STRUCT__ 0
+#define  __CL_ANON_STRUCT__
+#endif
+
+/* Define alignment keys */
+#if defined( __GNUC__ )
+    #define CL_ALIGNED(_x)          __attribute__ ((aligned(_x)))
+#elif defined( _WIN32) && (_MSC_VER)
+    /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements     */
+    /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx                                                 */
+    /* #include <crtdefs.h>                                                                                             */
+    /* #define CL_ALIGNED(_x)          _CRT_ALIGN(_x)                                                                   */
+    #define CL_ALIGNED(_x)
+#else
+   #warning  Need to implement some method to align data here
+   #define  CL_ALIGNED(_x)
+#endif
+
+/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */
+#if __CL_HAS_ANON_STRUCT__
+    /* .xyzw and .s0123...{f|F} are supported */
+    #define CL_HAS_NAMED_VECTOR_FIELDS 1
+    /* .hi and .lo are supported */
+    #define CL_HAS_HI_LO_VECTOR_FIELDS 1
+#endif
+
+/* Define cl_vector types */
+
+/* ---- cl_charn ---- */
+typedef union
+{
+    cl_char  CL_ALIGNED(2) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_char  lo, hi; };
+#endif
+#if defined( __CL_CHAR2__) 
+    __cl_char2     v2;
+#endif
+}cl_char2;
+
+typedef union
+{
+    cl_char  CL_ALIGNED(4) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__) 
+    __cl_char2     v2[2];
+#endif
+#if defined( __CL_CHAR4__) 
+    __cl_char4     v4;
+#endif
+}cl_char4;
+
+/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */
+typedef  cl_char4  cl_char3;
+
+typedef union
+{
+    cl_char   CL_ALIGNED(8) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__) 
+    __cl_char2     v2[4];
+#endif
+#if defined( __CL_CHAR4__) 
+    __cl_char4     v4[2];
+#endif
+#if defined( __CL_CHAR8__ )
+    __cl_char8     v8;
+#endif
+}cl_char8;
+
+typedef union
+{
+    cl_char  CL_ALIGNED(16) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__) 
+    __cl_char2     v2[8];
+#endif
+#if defined( __CL_CHAR4__) 
+    __cl_char4     v4[4];
+#endif
+#if defined( __CL_CHAR8__ )
+    __cl_char8     v8[2];
+#endif
+#if defined( __CL_CHAR16__ )
+    __cl_char16    v16;
+#endif
+}cl_char16;
+
+
+/* ---- cl_ucharn ---- */
+typedef union
+{
+    cl_uchar  CL_ALIGNED(2) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  lo, hi; };
+#endif
+#if defined( __cl_uchar2__) 
+    __cl_uchar2     v2;
+#endif
+}cl_uchar2;
+
+typedef union
+{
+    cl_uchar  CL_ALIGNED(4) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__) 
+    __cl_uchar2     v2[2];
+#endif
+#if defined( __CL_UCHAR4__) 
+    __cl_uchar4     v4;
+#endif
+}cl_uchar4;
+
+/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */
+typedef  cl_uchar4  cl_uchar3;
+
+typedef union
+{
+    cl_uchar   CL_ALIGNED(8) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__) 
+    __cl_uchar2     v2[4];
+#endif
+#if defined( __CL_UCHAR4__) 
+    __cl_uchar4     v4[2];
+#endif
+#if defined( __CL_UCHAR8__ )
+    __cl_uchar8     v8;
+#endif
+}cl_uchar8;
+
+typedef union
+{
+    cl_uchar  CL_ALIGNED(16) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__) 
+    __cl_uchar2     v2[8];
+#endif
+#if defined( __CL_UCHAR4__) 
+    __cl_uchar4     v4[4];
+#endif
+#if defined( __CL_UCHAR8__ )
+    __cl_uchar8     v8[2];
+#endif
+#if defined( __CL_UCHAR16__ )
+    __cl_uchar16    v16;
+#endif
+}cl_uchar16;
+
+
+/* ---- cl_shortn ---- */
+typedef union
+{
+    cl_short  CL_ALIGNED(4) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_short  lo, hi; };
+#endif
+#if defined( __CL_SHORT2__) 
+    __cl_short2     v2;
+#endif
+}cl_short2;
+
+typedef union
+{
+    cl_short  CL_ALIGNED(8) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__) 
+    __cl_short2     v2[2];
+#endif
+#if defined( __CL_SHORT4__) 
+    __cl_short4     v4;
+#endif
+}cl_short4;
+
+/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */
+typedef  cl_short4  cl_short3;
+
+typedef union
+{
+    cl_short   CL_ALIGNED(16) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__) 
+    __cl_short2     v2[4];
+#endif
+#if defined( __CL_SHORT4__) 
+    __cl_short4     v4[2];
+#endif
+#if defined( __CL_SHORT8__ )
+    __cl_short8     v8;
+#endif
+}cl_short8;
+
+typedef union
+{
+    cl_short  CL_ALIGNED(32) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__) 
+    __cl_short2     v2[8];
+#endif
+#if defined( __CL_SHORT4__) 
+    __cl_short4     v4[4];
+#endif
+#if defined( __CL_SHORT8__ )
+    __cl_short8     v8[2];
+#endif
+#if defined( __CL_SHORT16__ )
+    __cl_short16    v16;
+#endif
+}cl_short16;
+
+
+/* ---- cl_ushortn ---- */
+typedef union
+{
+    cl_ushort  CL_ALIGNED(4) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  lo, hi; };
+#endif
+#if defined( __CL_USHORT2__) 
+    __cl_ushort2     v2;
+#endif
+}cl_ushort2;
+
+typedef union
+{
+    cl_ushort  CL_ALIGNED(8) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__) 
+    __cl_ushort2     v2[2];
+#endif
+#if defined( __CL_USHORT4__) 
+    __cl_ushort4     v4;
+#endif
+}cl_ushort4;
+
+/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */
+typedef  cl_ushort4  cl_ushort3;
+
+typedef union
+{
+    cl_ushort   CL_ALIGNED(16) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__) 
+    __cl_ushort2     v2[4];
+#endif
+#if defined( __CL_USHORT4__) 
+    __cl_ushort4     v4[2];
+#endif
+#if defined( __CL_USHORT8__ )
+    __cl_ushort8     v8;
+#endif
+}cl_ushort8;
+
+typedef union
+{
+    cl_ushort  CL_ALIGNED(32) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__) 
+    __cl_ushort2     v2[8];
+#endif
+#if defined( __CL_USHORT4__) 
+    __cl_ushort4     v4[4];
+#endif
+#if defined( __CL_USHORT8__ )
+    __cl_ushort8     v8[2];
+#endif
+#if defined( __CL_USHORT16__ )
+    __cl_ushort16    v16;
+#endif
+}cl_ushort16;
+
+/* ---- cl_intn ---- */
+typedef union
+{
+    cl_int  CL_ALIGNED(8) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_int  lo, hi; };
+#endif
+#if defined( __CL_INT2__) 
+    __cl_int2     v2;
+#endif
+}cl_int2;
+
+typedef union
+{
+    cl_int  CL_ALIGNED(16) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; };
+#endif
+#if defined( __CL_INT2__) 
+    __cl_int2     v2[2];
+#endif
+#if defined( __CL_INT4__) 
+    __cl_int4     v4;
+#endif
+}cl_int4;
+
+/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */
+typedef  cl_int4  cl_int3;
+
+typedef union
+{
+    cl_int   CL_ALIGNED(32) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; };
+#endif
+#if defined( __CL_INT2__) 
+    __cl_int2     v2[4];
+#endif
+#if defined( __CL_INT4__) 
+    __cl_int4     v4[2];
+#endif
+#if defined( __CL_INT8__ )
+    __cl_int8     v8;
+#endif
+}cl_int8;
+
+typedef union
+{
+    cl_int  CL_ALIGNED(64) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; };
+#endif
+#if defined( __CL_INT2__) 
+    __cl_int2     v2[8];
+#endif
+#if defined( __CL_INT4__) 
+    __cl_int4     v4[4];
+#endif
+#if defined( __CL_INT8__ )
+    __cl_int8     v8[2];
+#endif
+#if defined( __CL_INT16__ )
+    __cl_int16    v16;
+#endif
+}cl_int16;
+
+
+/* ---- cl_uintn ---- */
+typedef union
+{
+    cl_uint  CL_ALIGNED(8) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  lo, hi; };
+#endif
+#if defined( __CL_UINT2__) 
+    __cl_uint2     v2;
+#endif
+}cl_uint2;
+
+typedef union
+{
+    cl_uint  CL_ALIGNED(16) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; };
+#endif
+#if defined( __CL_UINT2__) 
+    __cl_uint2     v2[2];
+#endif
+#if defined( __CL_UINT4__) 
+    __cl_uint4     v4;
+#endif
+}cl_uint4;
+
+/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */
+typedef  cl_uint4  cl_uint3;
+
+typedef union
+{
+    cl_uint   CL_ALIGNED(32) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; };
+#endif
+#if defined( __CL_UINT2__) 
+    __cl_uint2     v2[4];
+#endif
+#if defined( __CL_UINT4__) 
+    __cl_uint4     v4[2];
+#endif
+#if defined( __CL_UINT8__ )
+    __cl_uint8     v8;
+#endif
+}cl_uint8;
+
+typedef union
+{
+    cl_uint  CL_ALIGNED(64) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; };
+#endif
+#if defined( __CL_UINT2__) 
+    __cl_uint2     v2[8];
+#endif
+#if defined( __CL_UINT4__) 
+    __cl_uint4     v4[4];
+#endif
+#if defined( __CL_UINT8__ )
+    __cl_uint8     v8[2];
+#endif
+#if defined( __CL_UINT16__ )
+    __cl_uint16    v16;
+#endif
+}cl_uint16;
+
+/* ---- cl_longn ---- */
+typedef union
+{
+    cl_long  CL_ALIGNED(16) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_long  lo, hi; };
+#endif
+#if defined( __CL_LONG2__) 
+    __cl_long2     v2;
+#endif
+}cl_long2;
+
+typedef union
+{
+    cl_long  CL_ALIGNED(32) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; };
+#endif
+#if defined( __CL_LONG2__) 
+    __cl_long2     v2[2];
+#endif
+#if defined( __CL_LONG4__) 
+    __cl_long4     v4;
+#endif
+}cl_long4;
+
+/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */
+typedef  cl_long4  cl_long3;
+
+typedef union
+{
+    cl_long   CL_ALIGNED(64) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; };
+#endif
+#if defined( __CL_LONG2__) 
+    __cl_long2     v2[4];
+#endif
+#if defined( __CL_LONG4__) 
+    __cl_long4     v4[2];
+#endif
+#if defined( __CL_LONG8__ )
+    __cl_long8     v8;
+#endif
+}cl_long8;
+
+typedef union
+{
+    cl_long  CL_ALIGNED(128) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; };
+#endif
+#if defined( __CL_LONG2__) 
+    __cl_long2     v2[8];
+#endif
+#if defined( __CL_LONG4__) 
+    __cl_long4     v4[4];
+#endif
+#if defined( __CL_LONG8__ )
+    __cl_long8     v8[2];
+#endif
+#if defined( __CL_LONG16__ )
+    __cl_long16    v16;
+#endif
+}cl_long16;
+
+
+/* ---- cl_ulongn ---- */
+typedef union
+{
+    cl_ulong  CL_ALIGNED(16) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  lo, hi; };
+#endif
+#if defined( __CL_ULONG2__) 
+    __cl_ulong2     v2;
+#endif
+}cl_ulong2;
+
+typedef union
+{
+    cl_ulong  CL_ALIGNED(32) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__) 
+    __cl_ulong2     v2[2];
+#endif
+#if defined( __CL_ULONG4__) 
+    __cl_ulong4     v4;
+#endif
+}cl_ulong4;
+
+/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */
+typedef  cl_ulong4  cl_ulong3;
+
+typedef union
+{
+    cl_ulong   CL_ALIGNED(64) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__) 
+    __cl_ulong2     v2[4];
+#endif
+#if defined( __CL_ULONG4__) 
+    __cl_ulong4     v4[2];
+#endif
+#if defined( __CL_ULONG8__ )
+    __cl_ulong8     v8;
+#endif
+}cl_ulong8;
+
+typedef union
+{
+    cl_ulong  CL_ALIGNED(128) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__) 
+    __cl_ulong2     v2[8];
+#endif
+#if defined( __CL_ULONG4__) 
+    __cl_ulong4     v4[4];
+#endif
+#if defined( __CL_ULONG8__ )
+    __cl_ulong8     v8[2];
+#endif
+#if defined( __CL_ULONG16__ )
+    __cl_ulong16    v16;
+#endif
+}cl_ulong16;
+
+
+/* --- cl_floatn ---- */
+
+typedef union
+{
+    cl_float  CL_ALIGNED(8) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_float  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_float  lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__) 
+    __cl_float2     v2;
+#endif
+}cl_float2;
+
+typedef union
+{
+    cl_float  CL_ALIGNED(16) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float   x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_float2  lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__) 
+    __cl_float2     v2[2];
+#endif
+#if defined( __CL_FLOAT4__) 
+    __cl_float4     v4;
+#endif
+}cl_float4;
+
+/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */
+typedef  cl_float4  cl_float3;
+
+typedef union
+{
+    cl_float   CL_ALIGNED(32) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float   x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_float4  lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__) 
+    __cl_float2     v2[4];
+#endif
+#if defined( __CL_FLOAT4__) 
+    __cl_float4     v4[2];
+#endif
+#if defined( __CL_FLOAT8__ )
+    __cl_float8     v8;
+#endif
+}cl_float8;
+
+typedef union
+{
+    cl_float  CL_ALIGNED(64) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_float  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__) 
+    __cl_float2     v2[8];
+#endif
+#if defined( __CL_FLOAT4__) 
+    __cl_float4     v4[4];
+#endif
+#if defined( __CL_FLOAT8__ )
+    __cl_float8     v8[2];
+#endif
+#if defined( __CL_FLOAT16__ )
+    __cl_float16    v16;
+#endif
+}cl_float16;
+
+/* --- cl_doublen ---- */
+
+typedef union
+{
+    cl_double  CL_ALIGNED(16) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_double s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_double lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__) 
+    __cl_double2     v2;
+#endif
+}cl_double2;
+
+typedef union
+{
+    cl_double  CL_ALIGNED(32) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__) 
+    __cl_double2     v2[2];
+#endif
+#if defined( __CL_DOUBLE4__) 
+    __cl_double4     v4;
+#endif
+}cl_double4;
+
+/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */
+typedef  cl_double4  cl_double3;
+
+typedef union
+{
+    cl_double   CL_ALIGNED(64) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__) 
+    __cl_double2     v2[4];
+#endif
+#if defined( __CL_DOUBLE4__) 
+    __cl_double4     v4[2];
+#endif
+#if defined( __CL_DOUBLE8__ )
+    __cl_double8     v8;
+#endif
+}cl_double8;
+
+typedef union
+{
+    cl_double  CL_ALIGNED(128) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__) 
+    __cl_double2     v2[8];
+#endif
+#if defined( __CL_DOUBLE4__) 
+    __cl_double4     v4[4];
+#endif
+#if defined( __CL_DOUBLE8__ )
+    __cl_double8     v8[2];
+#endif
+#if defined( __CL_DOUBLE16__ )
+    __cl_double16    v16;
+#endif
+}cl_double16;
+
+/* Macro to facilitate debugging 
+ * Usage:
+ *   Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. 
+ *   The first line ends with:   CL_PROGRAM_STRING_DEBUG_INFO \"
+ *   Each line thereafter of OpenCL C source must end with: \n\
+ *   The last line ends in ";
+ *
+ *   Example:
+ *
+ *   const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
+ *   kernel void foo( int a, float * b )             \n\
+ *   {                                               \n\
+ *      // my comment                                \n\
+ *      *b[ get_global_id(0)] = a;                   \n\
+ *   }                                               \n\
+ *   ";
+ *
+ * This should correctly set up the line, (column) and file information for your source 
+ * string so you can do source level debugging.
+ */
+#define  __CL_STRINGIFY( _x )               # _x
+#define  _CL_STRINGIFY( _x )                __CL_STRINGIFY( _x )
+#define  CL_PROGRAM_STRING_DEBUG_INFO       "#line "  _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" 
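+
+/* A minimal host-side sketch of how such a string is typically consumed
+ * (illustrative only; not part of the upstream Khronos header). It assumes an
+ * existing cl_context "ctx", a cl_device_id "dev", and the "my_program" string
+ * from the example above.
+ *
+ *   cl_program prog = clCreateProgramWithSource(ctx, 1, &my_program, NULL, NULL);
+ *   cl_int err = clBuildProgram(prog, 1, &dev, NULL, NULL, NULL);
+ *
+ * The #line directive emitted by CL_PROGRAM_STRING_DEBUG_INFO makes build logs
+ * and source-level debuggers reference this file and line rather than the
+ * anonymous in-memory string.
+ */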
+  
+#ifdef __cplusplus
+}
+#endif
+
+#undef __CL_HAS_ANON_STRUCT__
+#undef __CL_ANON_STRUCT__
+#if defined( _WIN32) && (_MSC_VER >= 1500)
+#pragma warning( pop )
+#endif
+
+#endif  /* __CL_PLATFORM_H  */
diff --git a/include/CL/opencl.h b/include/CL/opencl.h
new file mode 100644
index 0000000..3f00524
--- /dev/null
+++ b/include/CL/opencl.h
@@ -0,0 +1,54 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
+
+#ifndef __OPENCL_H
+#define __OPENCL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+
+#include <OpenCL/cl.h>
+#include <OpenCL/cl_gl.h>
+#include <OpenCL/cl_gl_ext.h>
+#include <OpenCL/cl_ext.h>
+
+#else
+
+#include <CL/cl.h>
+#include <CL/cl_gl.h>
+#include <CL/cl_gl_ext.h>
+#include <CL/cl_ext.h>
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_H   */
+
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
new file mode 100644
index 0000000..4d0bed7
--- /dev/null
+++ b/include/CMakeLists.txt
@@ -0,0 +1,5 @@
+FILE(GLOB HEADER_FILES "CL/*.h")
+FILE(GLOB HPP_FILES "CL/*.hpp")
+
+install (FILES ${HEADER_FILES} DESTINATION include/CL)
+install (FILES ${HPP_FILES} DESTINATION include/CL)
diff --git a/intel-beignet.icd.in b/intel-beignet.icd.in
new file mode 100644
index 0000000..9b2e349
--- /dev/null
+++ b/intel-beignet.icd.in
@@ -0,0 +1 @@
+ at LIB_INSTALL_DIR@/beignet/libcl.so
diff --git a/kernels/buildin_work_dim.cl b/kernels/buildin_work_dim.cl
new file mode 100644
index 0000000..27c0e18
--- /dev/null
+++ b/kernels/buildin_work_dim.cl
@@ -0,0 +1,3 @@
+kernel void buildin_work_dim( __global int *ret ) {
+  *ret = get_work_dim();
+}
diff --git a/kernels/builtin_acos_asin.cl b/kernels/builtin_acos_asin.cl
new file mode 100644
index 0000000..bba2d21
--- /dev/null
+++ b/kernels/builtin_acos_asin.cl
@@ -0,0 +1,10 @@
+__kernel void builtin_acos_asin(__global float *dst, __global float *src, __global int *max_func) {
+  int i = get_global_id(0);
+  float x = src[i];
+
+  dst[i * (*max_func) + 0] = acos(x);
+  dst[i * (*max_func) + 1] = acosh(x);
+  dst[i * (*max_func) + 2] = asin(x);
+  dst[i * (*max_func) + 3] = asinh(x);
+  dst[i * (*max_func) + 4] = x;
+};
diff --git a/kernels/builtin_atan2.cl b/kernels/builtin_atan2.cl
new file mode 100644
index 0000000..aba73be
--- /dev/null
+++ b/kernels/builtin_atan2.cl
@@ -0,0 +1,4 @@
+kernel void builtin_atan2(global float *y, global float *x, global float *dst) {
+  int i = get_global_id(0);
+  dst[i] = atan2(y[i], x[i]);
+};
diff --git a/kernels/builtin_bitselect.cl b/kernels/builtin_bitselect.cl
new file mode 100644
index 0000000..9b60cbe
--- /dev/null
+++ b/kernels/builtin_bitselect.cl
@@ -0,0 +1,4 @@
+kernel void builtin_bitselect(global float *src1, global float *src2, global float *src3, global float *dst) {
+  int i = get_global_id(0);
+  dst[i] = bitselect(src1[i], src2[i], src3[i]);
+}
diff --git a/kernels/builtin_convert_sat.cl b/kernels/builtin_convert_sat.cl
new file mode 100644
index 0000000..1485f1d
--- /dev/null
+++ b/kernels/builtin_convert_sat.cl
@@ -0,0 +1,48 @@
+#define DEF(DSTTYPE, SRCTYPE) \
+  kernel void builtin_convert_ ## SRCTYPE ## _to_ ## DSTTYPE ## _sat(global SRCTYPE *src, global DSTTYPE *dst) { \
+  int i = get_global_id(0); \
+  dst[i] = convert_ ## DSTTYPE ## _sat(src[i]); \
+}
+
+DEF(char, uchar);
+DEF(char, short);
+DEF(char, ushort);
+DEF(char, int);
+DEF(char, uint);
+DEF(char, long);
+DEF(char, ulong);
+DEF(char, float);
+DEF(uchar, char);
+DEF(uchar, short);
+DEF(uchar, ushort);
+DEF(uchar, int);
+DEF(uchar, uint);
+DEF(uchar, long);
+DEF(uchar, ulong);
+DEF(uchar, float);
+DEF(short, ushort);
+DEF(short, int);
+DEF(short, uint);
+DEF(short, long);
+DEF(short, ulong);
+DEF(short, float);
+DEF(ushort, short);
+DEF(ushort, int);
+DEF(ushort, uint);
+DEF(ushort, long);
+DEF(ushort, ulong);
+DEF(ushort, float);
+DEF(int, uint);
+DEF(int, long);
+DEF(int, ulong);
+DEF(int, float);
+DEF(uint, int);
+DEF(uint, long);
+DEF(uint, ulong);
+DEF(uint, float);
+DEF(long, ulong);
+DEF(long, float);
+DEF(ulong, long);
+DEF(ulong, float);
+#undef DEF
+
diff --git a/kernels/builtin_exp.cl b/kernels/builtin_exp.cl
new file mode 100644
index 0000000..ecc1a3e
--- /dev/null
+++ b/kernels/builtin_exp.cl
@@ -0,0 +1,10 @@
+__kernel void builtin_exp(__global float *dst, __global float *src, __global int *max_func) {
+  int i = get_global_id(0);
+  float x = src[i];
+
+  dst[i * (*max_func) + 0] = exp(x);
+  dst[i * (*max_func) + 1] = exp2(x);
+  dst[i * (*max_func) + 2] = exp10(x);
+  dst[i * (*max_func) + 3] = expm1(x);
+  dst[i * (*max_func) + 4] = x;
+};
diff --git a/kernels/builtin_frexp.cl b/kernels/builtin_frexp.cl
new file mode 100644
index 0000000..766695a
--- /dev/null
+++ b/kernels/builtin_frexp.cl
@@ -0,0 +1,4 @@
+kernel void builtin_frexp(global float *src, global float *dst, global int *e) {
+  int i = get_global_id(0);
+  dst[i] = frexp(src[i], &e[i]);
+}
diff --git a/kernels/builtin_global_id.cl b/kernels/builtin_global_id.cl
new file mode 100644
index 0000000..5b82f9f
--- /dev/null
+++ b/kernels/builtin_global_id.cl
@@ -0,0 +1,4 @@
+kernel void builtin_global_id( __global int *ret) {
+  int id = get_global_id(0) + get_global_id(1)*3 + get_global_id(2)*3*4;
+  ret[id] = id;
+}
diff --git a/kernels/builtin_global_size.cl b/kernels/builtin_global_size.cl
new file mode 100644
index 0000000..e6ddb2f
--- /dev/null
+++ b/kernels/builtin_global_size.cl
@@ -0,0 +1,3 @@
+kernel void builtin_global_size( __global int *ret, __global int *i_dim ) {
+  *ret = get_global_size( *i_dim);
+}
diff --git a/kernels/builtin_lgamma.cl b/kernels/builtin_lgamma.cl
new file mode 100644
index 0000000..85bf859
--- /dev/null
+++ b/kernels/builtin_lgamma.cl
@@ -0,0 +1,4 @@
+kernel void builtin_lgamma(global float *src, global float *dst) {
+  int i = get_global_id(0);
+  dst[i] = lgamma(src[i]);
+};
diff --git a/kernels/builtin_lgamma_r.cl b/kernels/builtin_lgamma_r.cl
new file mode 100644
index 0000000..71fcc36
--- /dev/null
+++ b/kernels/builtin_lgamma_r.cl
@@ -0,0 +1,4 @@
+kernel void builtin_lgamma_r(global float *src, global float *dst, global int *signp) {
+  int i = get_global_id(0);
+  dst[i] = lgamma_r(src[i], signp+i);
+};
diff --git a/kernels/builtin_local_id.cl b/kernels/builtin_local_id.cl
new file mode 100644
index 0000000..489833d
--- /dev/null
+++ b/kernels/builtin_local_id.cl
@@ -0,0 +1,6 @@
+kernel void builtin_local_id( __global int *ret) {
+  int id = get_local_id(0) +  get_group_id(0) * 2 + \
+           get_local_id(1) * 4 + get_group_id(1) * 12 +\
+           get_local_id(2) * 36 + get_group_id(2) * 144;
+  ret[id] = id;
+}
diff --git a/kernels/builtin_local_size.cl b/kernels/builtin_local_size.cl
new file mode 100644
index 0000000..979d907
--- /dev/null
+++ b/kernels/builtin_local_size.cl
@@ -0,0 +1,3 @@
+kernel void builtin_local_size( __global int *ret, __global int *i_dim ) {
+  *ret = get_local_size( *i_dim);
+}
diff --git a/kernels/builtin_mad_sat.cl b/kernels/builtin_mad_sat.cl
new file mode 100644
index 0000000..1739a4d
--- /dev/null
+++ b/kernels/builtin_mad_sat.cl
@@ -0,0 +1,4 @@
+kernel void builtin_mad_sat(global short *src1, global short *src2, global short *src3, global short *dst) {
+  short i = get_global_id(0);
+  dst[i] = mad_sat(src1[i], src2[i], src3[i]);
+}
diff --git a/kernels/builtin_modf.cl b/kernels/builtin_modf.cl
new file mode 100644
index 0000000..43630ed
--- /dev/null
+++ b/kernels/builtin_modf.cl
@@ -0,0 +1,6 @@
+kernel void builtin_modf(global float *src, global float *dst, global float *it) {
+  int i = get_global_id(0);
+  float x;
+  dst[i] = modf(src[i], &x);
+  it[i] = x;
+}
diff --git a/kernels/builtin_nextafter.cl b/kernels/builtin_nextafter.cl
new file mode 100644
index 0000000..3945e34
--- /dev/null
+++ b/kernels/builtin_nextafter.cl
@@ -0,0 +1,4 @@
+kernel void builtin_nextafter(global float *src1, global float *src2, global float *dst) {
+  int i = get_global_id(0);
+  dst[i] = nextafter(src1[i], src2[i]);
+}
diff --git a/kernels/builtin_num_groups.cl b/kernels/builtin_num_groups.cl
new file mode 100644
index 0000000..719d25d
--- /dev/null
+++ b/kernels/builtin_num_groups.cl
@@ -0,0 +1,3 @@
+kernel void builtin_num_groups( __global int *ret, __global int *i_dim ) {
+  *ret = get_num_groups( *i_dim);
+}
diff --git a/kernels/builtin_pow.cl b/kernels/builtin_pow.cl
new file mode 100644
index 0000000..17d753e
--- /dev/null
+++ b/kernels/builtin_pow.cl
@@ -0,0 +1,7 @@
+kernel void builtin_pow(global float *dst, global float *src1, global float *src2, global int *max_func) {
+
+  int i = get_global_id(0);
+  dst[i * (*max_func) + 0] = pow(src1[i], src2[i]);
+  dst[i * (*max_func) + 1] = src1[i];
+
+}
diff --git a/kernels/builtin_remquo.cl b/kernels/builtin_remquo.cl
new file mode 100644
index 0000000..d66c164
--- /dev/null
+++ b/kernels/builtin_remquo.cl
@@ -0,0 +1,6 @@
+kernel void builtin_remquo(global float *x, global float *y, global float *dst, global int *quo) {
+  int i = get_global_id(0);
+  int q;
+  dst[i] = remquo(x[i], y[i], & q);
+  quo[i] = q;
+}
diff --git a/kernels/builtin_shuffle.cl b/kernels/builtin_shuffle.cl
new file mode 100644
index 0000000..ad988b9
--- /dev/null
+++ b/kernels/builtin_shuffle.cl
@@ -0,0 +1,8 @@
+kernel void builtin_shuffle(global float *src1, global float *src2, global float *dst1, global float *dst2) {
+  int i = get_global_id(0);
+  float2 src = (float2)(src1[i], src2[i]);
+  uint2 mask = (uint2)(1, 0);
+  float2 dst = shuffle(src, mask);
+  dst1[i] = dst.s0;
+  dst2[i] = dst.s1;
+}
diff --git a/kernels/builtin_shuffle2.cl b/kernels/builtin_shuffle2.cl
new file mode 100644
index 0000000..1a122d4
--- /dev/null
+++ b/kernels/builtin_shuffle2.cl
@@ -0,0 +1,13 @@
+kernel void builtin_shuffle2(global float *src1, global float *src2, global float *dst1, global float *dst2) {
+  int i = get_global_id(0);
+  float2 x = (float2)(src1[i], src2[i]);
+  float2 y = (float2)(1234, 5678);
+  uint4 mask = (uint4)(1, 0, 0, 0);
+  float4 v1 = shuffle2(x, y, mask);
+  float16 x2 = 0;
+  float16 y2 = (float16)(src1[i], src2[i], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  uint16 mask2 = (uint16)(17, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  float16 v2 = shuffle2(x2, y2, mask2);
+  dst1[i] = v1.s0 + v2.s0;
+  dst2[i] = v1.s1 + v2.s1;
+}
diff --git a/kernels/builtin_sign.cl b/kernels/builtin_sign.cl
new file mode 100644
index 0000000..ff9a66b
--- /dev/null
+++ b/kernels/builtin_sign.cl
@@ -0,0 +1,4 @@
+kernel void builtin_sign(global float *src, global float *dst) {
+  int i = get_global_id(0);
+  dst[i] = sign(src[i]);
+}
diff --git a/kernels/builtin_sinpi.cl b/kernels/builtin_sinpi.cl
new file mode 100644
index 0000000..134152d
--- /dev/null
+++ b/kernels/builtin_sinpi.cl
@@ -0,0 +1,4 @@
+kernel void builtin_sinpi(global float *src, global float *dst) {
+  int i = get_global_id(0);
+  dst[i] = sinpi(src[i]);
+};
diff --git a/kernels/builtin_tgamma.cl b/kernels/builtin_tgamma.cl
new file mode 100644
index 0000000..1f7abc3
--- /dev/null
+++ b/kernels/builtin_tgamma.cl
@@ -0,0 +1,4 @@
+kernel void builtin_tgamma(global float *src, global float *dst) {
+  int i = get_global_id(0);
+  dst[i] = tgamma(src[i]);
+};
diff --git a/kernels/compare_image_2d_and_1d_array.cl b/kernels/compare_image_2d_and_1d_array.cl
new file mode 100644
index 0000000..6aabb43
--- /dev/null
+++ b/kernels/compare_image_2d_and_1d_array.cl
@@ -0,0 +1,13 @@
+__kernel void
+compare_image_2d_and_1d_array(image2d_t a1, image1d_array_t a2, sampler_t sampler)
+{
+  float2 coord;
+  int4 color1;
+  int4 color2;
+  coord.x = (float)get_global_id(0) + 0.3f;
+  coord.y = (float)get_global_id(1) + 0.3f;
+  color1 = read_imagei(a1, sampler, coord);
+  color2 = read_imagei(a2, sampler, coord);
+//  printf("########## x y is (%f, %f), color1 is (%d %d %d %d), color2 is (%d %d %d %d)\n",
+//	  coord.x, coord.y, color1.x, color1.y, color1.z, color1.w, color2.x, color2.y, color2.z, color2.w);
+}
diff --git a/kernels/compiler_abs.cl b/kernels/compiler_abs.cl
new file mode 100644
index 0000000..549575c
--- /dev/null
+++ b/kernels/compiler_abs.cl
@@ -0,0 +1,28 @@
+#define COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
+    kernel void compiler_abs_##TYPE ( \
+           global TYPE* src, global UTYPE* dst) { \
+        int i = get_global_id(0); \
+        dst[i] = abs(src[i]);     \
+    }
+
+#define COMPILER_ABS_FUNC_N(TYPE, UTYPE, N) \
+    kernel void compiler_abs_##TYPE##N ( \
+           global TYPE##N* src, global UTYPE##N* dst) { \
+        int i = get_global_id(0); \
+        dst[i] = abs(src[i]);     \
+    }
+
+#define COMPILER_ABS(TYPE, UTYPE)  \
+    COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 2) \
+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 3) \
+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 4) \
+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 8) \
+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 16)
+
+COMPILER_ABS(int, uint)
+COMPILER_ABS(uint, uint)
+COMPILER_ABS(char, uchar)
+COMPILER_ABS(uchar, uchar)
+COMPILER_ABS(short, ushort)
+COMPILER_ABS(ushort, ushort)
diff --git a/kernels/compiler_abs_diff.cl b/kernels/compiler_abs_diff.cl
new file mode 100644
index 0000000..1f30df4
--- /dev/null
+++ b/kernels/compiler_abs_diff.cl
@@ -0,0 +1,30 @@
+#define COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
+    kernel void compiler_abs_diff_##TYPE ( \
+           global TYPE* x, global TYPE* y, global UTYPE* diff) { \
+        int i = get_global_id(0); \
+        diff[i] = abs_diff(x[i], y[i]);     \
+    }
+
+#define COMPILER_ABS_FUNC_N(TYPE, UTYPE, N) \
+    kernel void compiler_abs_diff_##TYPE##N ( \
+           global TYPE##N* x, global TYPE##N* y, global UTYPE##N* diff) { \
+        int i = get_global_id(0); \
+        diff[i] = abs_diff(x[i], y[i]);     \
+    }
+
+#define COMPILER_ABS(TYPE, UTYPE)  \
+    COMPILER_ABS_FUNC_1(TYPE, UTYPE) \
+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 2) \
+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 3) \
+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 4) \
+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 8) \
+    COMPILER_ABS_FUNC_N(TYPE, UTYPE, 16)
+
+COMPILER_ABS(int, uint)
+COMPILER_ABS(uint, uint)
+COMPILER_ABS(char, uchar)
+COMPILER_ABS(uchar, uchar)
+COMPILER_ABS(short, ushort)
+COMPILER_ABS(ushort, ushort)
+COMPILER_ABS(long, ulong)
+COMPILER_ABS(ulong, ulong)
diff --git a/kernels/compiler_address_space.cl b/kernels/compiler_address_space.cl
new file mode 100644
index 0000000..68b7746
--- /dev/null
+++ b/kernels/compiler_address_space.cl
@@ -0,0 +1,9 @@
+/* test OpenCL 1.1 Address Space Qualifiers (section 6.5) */
+__constant float cf1[] = {1, 2, 3};
+constant float cf2[] = {4, 5, 6};
+__kernel void compiler_address_space(__global float *gf1, global float *gf2) {
+  __local float lf1[4];
+  local float lf2[4];
+  __private float pf1[4];
+  private float pf2[4];
+}
diff --git a/kernels/compiler_argument_structure.cl b/kernels/compiler_argument_structure.cl
new file mode 100644
index 0000000..ab7896e
--- /dev/null
+++ b/kernels/compiler_argument_structure.cl
@@ -0,0 +1,9 @@
+struct hop { int x, y; };
+
+__kernel void
+compiler_argument_structure(__global int *dst, struct hop h)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = h.x + h.y;
+}
+
diff --git a/kernels/compiler_argument_structure_indirect.cl b/kernels/compiler_argument_structure_indirect.cl
new file mode 100644
index 0000000..c4b062f
--- /dev/null
+++ b/kernels/compiler_argument_structure_indirect.cl
@@ -0,0 +1,9 @@
+struct hop { int x[16]; };
+
+__kernel void
+compiler_argument_structure(__global int *dst, struct hop h)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = h.x[get_local_id(0)];
+}
+
diff --git a/kernels/compiler_arith_shift_right.cl b/kernels/compiler_arith_shift_right.cl
new file mode 100644
index 0000000..03a4d8d
--- /dev/null
+++ b/kernels/compiler_arith_shift_right.cl
@@ -0,0 +1,4 @@
+kernel void compiler_arith_shift_right(global int *src, global int *dst) {
+    int i = get_global_id(0);
+    dst[i] = src[i] >> 24;
+}
diff --git a/kernels/compiler_array.cl b/kernels/compiler_array.cl
new file mode 100644
index 0000000..5dce4d9
--- /dev/null
+++ b/kernels/compiler_array.cl
@@ -0,0 +1,14 @@
+__kernel void
+compiler_array(__global int *src, __global int *dst)
+{
+  int array[16];
+  int i;
+  for (i = 0; i < 16; ++i) {
+    if (src[0] > 10)
+      array[i] = get_local_id(0);
+    else
+      array[15 - i] = 3 + get_local_id(1);
+  }
+  dst[get_global_id(0)] = array[get_local_id(0)];
+}
+
diff --git a/kernels/compiler_array0.cl b/kernels/compiler_array0.cl
new file mode 100644
index 0000000..3ab0fb8
--- /dev/null
+++ b/kernels/compiler_array0.cl
@@ -0,0 +1,16 @@
+__kernel void
+compiler_array0(__global int *src, __global int *dst)
+{
+  int i;
+  int final[16];
+  for (i = 0; i < 16; ++i) {
+    int array[16], j;
+    for (j = 0; j < 16; ++j)
+      array[j] = get_global_id(0);
+    for (j = 0; j < src[0]; ++j)
+      array[j] = 1+src[j];
+    final[i] = array[i];
+  }
+  dst[get_global_id(0)] = final[get_global_id(0)];
+}
+
diff --git a/kernels/compiler_array1.cl b/kernels/compiler_array1.cl
new file mode 100644
index 0000000..ad567c2
--- /dev/null
+++ b/kernels/compiler_array1.cl
@@ -0,0 +1,15 @@
+__kernel void
+compiler_array1(__global int *src, __global int *dst)
+{
+  int final[16];
+  for (int i = 0; i < 16; ++i) {
+    int array[16];
+    for (int j = 0; j < src[0]; ++j)
+      array[j] = 1+src[0];
+    for (int j = src[0]; j < 16; ++j)
+      array[j] = get_global_id(0);
+    final[i] = array[i];
+  }
+  dst[get_global_id(0)] = final[get_global_id(0)];
+}
+
diff --git a/kernels/compiler_array2.cl b/kernels/compiler_array2.cl
new file mode 100644
index 0000000..ae73932
--- /dev/null
+++ b/kernels/compiler_array2.cl
@@ -0,0 +1,13 @@
+__kernel void
+compiler_array2(__global int *src, __global int *dst)
+{
+  int final[16];
+  int array[16];
+  for (int j = 0; j < 16; ++j) array[j] = j;
+  for (int j = 0; j < 16; ++j) final[j] = j+1;
+  if (get_global_id(0) == 15)
+    dst[get_global_id(0)] = final[get_global_id(0)];
+  else
+    dst[get_global_id(0)] = array[15 - get_global_id(0)];
+}
+
diff --git a/kernels/compiler_array3.cl b/kernels/compiler_array3.cl
new file mode 100644
index 0000000..152c22a
--- /dev/null
+++ b/kernels/compiler_array3.cl
@@ -0,0 +1,14 @@
+__kernel void
+compiler_array3(__global int *src, __global int *dst)
+{
+  int tmp[32];
+  for (int i = 0; i < 16; ++i) {
+    for (int j = 0; j < 16; ++j)
+      tmp[j] = get_global_id(0);
+    for (int j = 0; j < src[0]; ++j)
+      tmp[j] = 1+src[j];
+    tmp[16+i] = tmp[i];
+  }
+  dst[get_global_id(0)] = tmp[16+get_global_id(0)];
+}
+
diff --git a/kernels/compiler_async_copy.cl b/kernels/compiler_async_copy.cl
new file mode 100644
index 0000000..dddde44
--- /dev/null
+++ b/kernels/compiler_async_copy.cl
@@ -0,0 +1,24 @@
+#define DEF(TYPE) \
+kernel void \
+compiler_async_copy_##TYPE(__global TYPE *dst, __global TYPE *src, __local TYPE *localBuffer, int copiesPerWorkItem) \
+{ \
+  event_t event; \
+  int copiesPerWorkgroup = copiesPerWorkItem * get_local_size(0); \
+  int i; \
+  event = async_work_group_copy((__local TYPE*)localBuffer, (__global const TYPE*)(src+copiesPerWorkgroup*get_group_id(0)), (size_t)copiesPerWorkgroup, (event_t)0 ); \
+  wait_group_events( 1, &event ); \
+\
+  event = async_work_group_copy((__global TYPE*)(dst+copiesPerWorkgroup*get_group_id(0)), (__local const TYPE*)localBuffer, (size_t)copiesPerWorkgroup, (event_t)0 ); \
+  wait_group_events( 1, &event ); \
+}
+
+DEF(char2);
+DEF(uchar2);
+DEF(short2);
+DEF(ushort2);
+DEF(int2);
+DEF(uint2);
+DEF(long2);
+DEF(ulong2);
+DEF(float2);
+//DEF(double2);
diff --git a/kernels/compiler_async_copy_and_prefetch.cl b/kernels/compiler_async_copy_and_prefetch.cl
new file mode 100644
index 0000000..7489bb0
--- /dev/null
+++ b/kernels/compiler_async_copy_and_prefetch.cl
@@ -0,0 +1,9 @@
+/* test OpenCL 1.1 Async Copies and Prefetch Functions (section 6.11.10) */
+kernel void compiler_async_copy_and_prefetch(__global float *p) {
+  prefetch(p, 10);
+  local float l[10];
+  event_t e[2];
+  async_work_group_copy(l, p, 10, 0);
+  async_work_group_copy(p, l, 10, 0);
+  wait_group_events(2, e);
+}
diff --git a/kernels/compiler_async_stride_copy.cl b/kernels/compiler_async_stride_copy.cl
new file mode 100644
index 0000000..a926588
--- /dev/null
+++ b/kernels/compiler_async_stride_copy.cl
@@ -0,0 +1,16 @@
+__kernel void
+compiler_async_stride_copy(__global char4 *dst, __global char4 *src, __local char4 *localBuffer, int copiesPerWorkItem, int stride)
+{
+  event_t event;
+  int copiesPerWorkgroup = copiesPerWorkItem * get_local_size(0);
+  int i;
+  event = async_work_group_strided_copy( (__local char4*)localBuffer, (__global const char4*)(src+copiesPerWorkgroup*stride*get_group_id(0)), (size_t)copiesPerWorkgroup, (size_t)stride, (event_t)0 );
+  wait_group_events( 1, &event );
+
+  for(i=0; i<copiesPerWorkItem; i++)
+    localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] + (char4)(3);
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  event = async_work_group_strided_copy((__global char4*)(dst+copiesPerWorkgroup*stride*get_group_id(0)), (__local const char4*)localBuffer, (size_t)copiesPerWorkgroup, (size_t)stride, (event_t)0 );
+  wait_group_events( 1, &event );
+}
diff --git a/kernels/compiler_atomic_functions.cl b/kernels/compiler_atomic_functions.cl
new file mode 100644
index 0000000..fbc16fb
--- /dev/null
+++ b/kernels/compiler_atomic_functions.cl
@@ -0,0 +1,50 @@
+__kernel void compiler_atomic_functions(__global int *dst, __local int *tmp, __global int *src) {
+  int lid = get_local_id(0);
+  int i = lid % 12;
+  if(lid == 0) {
+    for(int j=0; j<12; j=j+1) {
+      atomic_xchg(&tmp[j], 0);
+    }
+    atomic_xchg(&tmp[4], -1);
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  switch(i) {
+    case 0: atomic_inc(&tmp[i]); break;
+    case 1: atomic_dec(&tmp[i]); break;
+    case 2: atomic_add(&tmp[i], src[lid]); break;
+    case 3: atomic_sub(&tmp[i], src[lid]); break;
+    case 4: atomic_and(&tmp[i], ~(src[lid]<<(lid / 16))); break;
+    case 5: atomic_or (&tmp[i], src[lid]<<(lid / 16)); break;
+    case 6: atomic_xor(&tmp[i], src[lid]); break;
+    case 7: atomic_min(&tmp[i], -src[lid]); break;
+    case 8: atomic_max(&tmp[i], src[lid]); break;
+    case 9: atomic_min((__local unsigned int *)&tmp[i], -src[lid]); break;
+    case 10: atomic_max((__local unsigned int *)&tmp[i], src[lid]); break;
+    case 11: atomic_cmpxchg(&(tmp[i]), 0, src[10]); break;
+    default:  break;
+  }
+
+  switch(i) {
+    case 0: atomic_inc(&dst[i]); break;
+    case 1: atomic_dec(&dst[i]); break;
+    case 2: atomic_add(&dst[i], src[lid]); break;
+    case 3: atomic_sub(&dst[i], src[lid]); break;
+    case 4: atomic_and(&dst[i], ~(src[lid]<<(lid / 16))); break;
+    case 5: atomic_or (&dst[i], src[lid]<<(lid / 16)); break;
+    case 6: atomic_xor(&dst[i], src[lid]); break;
+    case 7: atomic_min(&dst[i], -src[lid]); break;
+    case 8: atomic_max(&dst[i], src[lid]); break;
+    case 9: atomic_min((__global unsigned int *)&dst[i], -src[lid]); break;
+    case 10: atomic_max((__global unsigned int *)&dst[i], src[lid]); break;
+    case 11: atomic_cmpxchg(&dst[i], 0, src[10]); break;
+    default:  break;
+  }
+
+  barrier(CLK_GLOBAL_MEM_FENCE);
+
+  if(get_global_id(0) == 0) {
+    for(i=0; i<12; i=i+1)
+      atomic_xchg(&dst[i+12], tmp[i]);
+  }
+}
diff --git a/kernels/compiler_basic_arithmetic.cl b/kernels/compiler_basic_arithmetic.cl
new file mode 100644
index 0000000..3e145d8
--- /dev/null
+++ b/kernels/compiler_basic_arithmetic.cl
@@ -0,0 +1,53 @@
+#define DECL_KERNEL_SUB(type)\
+__kernel void \
+compiler_sub_##type(__global type *src0, __global type *src1, __global type *dst) \
+{ \
+  int id = (int)get_global_id(0); \
+  dst[id] = src0[id] - src1[id]; \
+}
+
+#define DECL_KERNEL_ADD(type)\
+__kernel void \
+compiler_add_##type(__global type *src0, __global type *src1, __global type *dst) \
+{ \
+  int id = (int)get_global_id(0); \
+  dst[id] = src0[id] + src1[id]; \
+}
+
+#define DECL_KERNEL_MUL(type)\
+__kernel void \
+compiler_mul_##type(__global type *src0, __global type *src1, __global type *dst) \
+{ \
+  int id = (int)get_global_id(0); \
+  dst[id] = src0[id] * src1[id]; \
+}
+
+#define DECL_KERNEL_DIV(type)\
+__kernel void \
+compiler_div_##type(__global type *src0, __global type *src1, __global type *dst) \
+{ \
+  int id = (int)get_global_id(0); \
+  dst[id] = src0[id] / src1[id]; \
+}
+
+#define DECL_KERNEL_REM(type)\
+__kernel void \
+compiler_rem_##type(__global type *src0, __global type *src1, __global type *dst) \
+{ \
+  int id = (int)get_global_id(0); \
+  dst[id] = src0[id] % src1[id]; \
+}
+
+#define DECL_KERNEL_FOR_ALL_TYPE(op) \
+DECL_KERNEL_##op(char)               \
+DECL_KERNEL_##op(uchar)              \
+DECL_KERNEL_##op(short)              \
+DECL_KERNEL_##op(ushort)             \
+DECL_KERNEL_##op(int)                \
+DECL_KERNEL_##op(uint)
+
+DECL_KERNEL_FOR_ALL_TYPE(SUB)
+DECL_KERNEL_FOR_ALL_TYPE(ADD)
+DECL_KERNEL_FOR_ALL_TYPE(MUL)
+DECL_KERNEL_FOR_ALL_TYPE(DIV)
+DECL_KERNEL_FOR_ALL_TYPE(REM)
diff --git a/kernels/compiler_bool_cross_basic_block.cl b/kernels/compiler_bool_cross_basic_block.cl
new file mode 100644
index 0000000..9aeb16d
--- /dev/null
+++ b/kernels/compiler_bool_cross_basic_block.cl
@@ -0,0 +1,21 @@
+__kernel
+void compiler_bool_cross_basic_block(__global int *src,
+				     __global int *dst,
+				     int scale){
+  int id = (int)get_global_id(0);
+
+  bool isRedRow = false;
+  bool isRed;
+  int val = src[id];
+  for (unsigned int i=0; i<scale; i++, isRedRow = !isRedRow) {
+    if (isRedRow) {
+      isRed= false;
+      for (unsigned int j=0; j < scale; j++, isRed=!isRed) {
+        if (isRed) {
+	  val++;
+        }
+      }
+    }
+  }
+  dst[id] = val;
+}
diff --git a/kernels/compiler_box_blur.cl b/kernels/compiler_box_blur.cl
new file mode 100644
index 0000000..26936e0
--- /dev/null
+++ b/kernels/compiler_box_blur.cl
@@ -0,0 +1,80 @@
+inline float3 unpack_fp3(uint u) {
+  float3 u3;
+  u3.x = (float) (u & 0xff); u >>= 8;
+  u3.y = (float) (u & 0xff); u >>= 8;
+  u3.z = (float) (u & 0xff);
+  return u3;
+}
+
+inline uint pack_fp3(float3 u3) {
+  uint u;
+  u = (((uint) u3.x)) | (((uint) u3.y) << 8) | (((uint) u3.z) << 16);
+  return u;
+}
+
+#define HFILTER3(C0, C1, C2, C3, CURR, LEFT, RIGHT)\
+  float3 C0, C1, C2, C3;\
+  do {\
+    const uint4 from = vload4(CURR, src);\
+    const float3 from0 = unpack_fp3(from.x);\
+    const float3 from1 = unpack_fp3(from.y);\
+    const float3 from2 = unpack_fp3(from.z);\
+    const float3 from3 = unpack_fp3(from.w);\
+    const float3 l = unpack_fp3(src[LEFT]);\
+    const float3 r = unpack_fp3(src[RIGHT]);\
+    C0 = (l+from0+from1);\
+    C1 = (from0+from1+from2);\
+    C2 = (from1+from2+from3);\
+    C3 = (from2+from3+r);\
+  } while(0)
+
+__kernel void compiler_box_blur(__global const uint *src,
+                                __global uint *dst,
+                                int w,
+                                int h,
+                                int chunk)
+{
+  const int x = get_global_id(0);
+  int y = get_global_id(1)*chunk;
+  const int yend = min(y + chunk, h); /* we process a tile in the image */
+
+  /* Current line (left (1 pixel), center (4 pixels), right (1 pixel)) */
+  const int left = max(4*x-1, 0) + y*w;
+  const int right = min(4*x+4, w-1) + y*w;
+  int curr = x + y*(w>>2);
+  HFILTER3(curr0, curr1, curr2, curr3, curr, left, right);
+
+  /* Top line (left (1 pixel), center (4 pixels), right (1 pixel)) */
+  const int ytop = max(y-1,0);
+  const int topLeft = max(4*x-1, 0) + ytop*w;
+  const int topRight = min(4*x+4, w-1) + ytop*w;
+  const int top = x + ytop*(w>>2);
+  HFILTER3(top0, top1, top2, top3, top, topLeft, topRight);
+
+  /* To guard bottom line */
+  const int maxBottom = x + (h-1)*(w>>2);
+  const int maxBottomLeft = max(4*x-1,0) + (h-1)*w;
+  const int maxBottomRight = min(4*x+4,w-1) + (h-1)*w;
+
+  /* We use a short 3 pixel sliding window */
+  const int ybottom = min(y+1,h-1);
+  int bottomLeft = max(4*x-1, 0) + ybottom*w;
+  int bottomRight = min(4*x+4, w-1) + ybottom*w;
+  int bottom = x + ybottom*(w>>2);
+
+  /* Top down sliding window */
+  for (; y < yend; ++y, curr += (w>>2), bottom += (w>>2), bottomLeft += w, bottomRight += w) {
+    const int center = min(bottom, maxBottom);
+    const int left = min(bottomLeft, maxBottomLeft);
+    const int right = min(bottomRight, maxBottomRight);
+    HFILTER3(bottom0, bottom1, bottom2, bottom3, center, left, right);
+    const float3 to0 = (top0+curr0+bottom0)*(1.f/9.f);
+    const float3 to1 = (top1+curr1+bottom1)*(1.f/9.f);
+    const float3 to2 = (top2+curr2+bottom2)*(1.f/9.f);
+    const float3 to3 = (top3+curr3+bottom3)*(1.f/9.f);
+    const uint4 to = (uint4)(pack_fp3(to0),pack_fp3(to1),pack_fp3(to2),pack_fp3(to3));
+    vstore4(to, curr, dst);
+    top0 = curr0; top1 = curr1; top2 = curr2; top3 = curr3;
+    curr0 = bottom0; curr1 = bottom1; curr2 = bottom2; curr3 = bottom3;
+  }
+}
diff --git a/kernels/compiler_box_blur_float.cl b/kernels/compiler_box_blur_float.cl
new file mode 100644
index 0000000..6f4e1b9
--- /dev/null
+++ b/kernels/compiler_box_blur_float.cl
@@ -0,0 +1,48 @@
+__kernel void compiler_box_blur_float(__global const float4 *src,
+                                      __global float4 *dst,
+                                      int w,
+                                      int h,
+                                      int chunk)
+{
+  const int x = get_global_id(0);
+  int y = get_global_id(1)*chunk;
+  const int yend = min(y+chunk, h); /* we process a tile in the image */
+
+  /* Current line (left (1 pixel), center (4 pixels), right (1 pixel)) */
+  const int left = max(x-1,0) + y*w;
+  const int right = min(x+1,w-1) + y*w;
+  int curr = x + y*w;
+  float4 currPixel = src[left] + src[curr] + src[right];
+
+  /* Top line (left (1 pixel), center (4 pixels), right (1 pixel)) */
+  const int ytop = max(y-1,0);
+  const int topLeft = max(x-1,0) + ytop*w;
+  const int topRight = min(x+1,w-1) + ytop*w;
+  const int top = x + ytop*w;
+  float4 topPixel = src[topLeft] + src[top] + src[topRight];
+
+  /* To guard bottom line */
+  const int maxBottom = x + (h-1)*w;
+  const int maxBottomLeft = max(x-1,0) + (h-1)*w;
+  const int maxBottomRight = min(x+1,w-1) + (h-1)*w;
+
+  /* We use a short 4 pixel sliding window */
+  const int ybottom = min(y+1,h-1);
+  int bottomLeft = max(x-1 + ybottom*w, ybottom*w);
+  int bottomRight = min(x+1 + ybottom*w, ybottom*w+w-1);
+  int bottom = x + ybottom*w;
+
+
+  /* Top down sliding window */
+  for (; y < yend; ++y, curr += w, bottom += w, bottomLeft += w, bottomRight += w) {
+    const int center = min(bottom, maxBottom);
+    const int left = min(bottomLeft, maxBottomLeft);
+    const int right = min(bottomRight, maxBottomRight);
+    const float4 bottomPixel = src[left] + src[center] + src[right];
+    const float4 to = (bottomPixel + currPixel + topPixel) * (1.f/9.f);
+    dst[curr] = to;
+    topPixel = currPixel;
+    currPixel = bottomPixel;
+  }
+}
+
diff --git a/kernels/compiler_box_blur_image.cl b/kernels/compiler_box_blur_image.cl
new file mode 100644
index 0000000..42f463b
--- /dev/null
+++ b/kernels/compiler_box_blur_image.cl
@@ -0,0 +1,18 @@
+__kernel void compiler_box_blur_image(__read_only image2d_t src,
+                                      __write_only image2d_t dst)
+{
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
+                            CLK_ADDRESS_CLAMP_TO_EDGE |
+                            CLK_FILTER_NEAREST;
+  const int2 coord = (int2)(get_global_id(0), get_global_id(1));
+  int2 offset;
+  float4 sum = 0;
+
+  for (offset.y = -1; offset.y <= 1; offset.y++) {
+    for (offset.x = -1; offset.x <= 1; offset.x++) {
+      sum +=  read_imagef(src, sampler, coord + offset);
+    }
+  }
+
+  write_imagef(dst, coord, (1.0f/9.0f)*sum);
+}
diff --git a/kernels/compiler_byte_scatter.cl b/kernels/compiler_byte_scatter.cl
new file mode 100644
index 0000000..ab56ba8
--- /dev/null
+++ b/kernels/compiler_byte_scatter.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_byte_scatter(__global char *dst)
+{
+  int id = (int) get_global_id(0);
+  dst[id] = (char) id;
+}
+
diff --git a/kernels/compiler_ceil.cl b/kernels/compiler_ceil.cl
new file mode 100644
index 0000000..cf27483
--- /dev/null
+++ b/kernels/compiler_ceil.cl
@@ -0,0 +1,4 @@
+kernel void compiler_ceil(global float *src, global float *dst) {
+  int i = get_global_id(0);
+  dst[i] = ceil(src[i]);
+}
diff --git a/kernels/compiler_clz_int.cl b/kernels/compiler_clz_int.cl
new file mode 100644
index 0000000..0f17f86
--- /dev/null
+++ b/kernels/compiler_clz_int.cl
@@ -0,0 +1,5 @@
+kernel void compiler_clz_int(global int *src, global int *dst) {
+  int i = get_global_id(0);
+  dst[i] = clz(src[i]);
+}
+
diff --git a/kernels/compiler_clz_short.cl b/kernels/compiler_clz_short.cl
new file mode 100644
index 0000000..1ecf7a9
--- /dev/null
+++ b/kernels/compiler_clz_short.cl
@@ -0,0 +1,5 @@
+kernel void compiler_clz_short(global short *src, global short *dst) {
+  int i = get_global_id(0);
+  dst[i] = clz(src[i]);
+}
+
diff --git a/kernels/compiler_constant_expr.cl b/kernels/compiler_constant_expr.cl
new file mode 100644
index 0000000..d40cead
--- /dev/null
+++ b/kernels/compiler_constant_expr.cl
@@ -0,0 +1,23 @@
+float3 foo_pow3(float3 src0, float3 src1)
+{
+  union {
+    float3 f3;
+    float   farray[4];
+  } s0, s1, dst;
+  s0.f3 = src0;
+  s1.f3 = src1;
+  int i;
+  for(i = 0; i < 3; i++)
+    dst.farray[i] = pow(s0.farray[i], s1.farray[i]);
+  return dst.f3;
+}
+
+__kernel void
+compiler_constant_expr(__global float* src, __global float *dst)
+{
+  int gid = get_global_id(0);
+  float3 f3 = vload3(gid, src);
+  float3 cf3 = (float3)(1.f, 2.f, 3.f);
+  float3 result = foo_pow3(f3, cf3);
+  vstore3(result, gid, dst); 
+} 
diff --git a/kernels/compiler_convert_uchar_sat.cl b/kernels/compiler_convert_uchar_sat.cl
new file mode 100644
index 0000000..0c81ecc
--- /dev/null
+++ b/kernels/compiler_convert_uchar_sat.cl
@@ -0,0 +1,4 @@
+kernel void compiler_convert_uchar_sat(global float *src, global uint *dst) {
+  int i = get_global_id(0);
+  dst[i] = convert_uchar_sat(src[i]);
+}
diff --git a/kernels/compiler_data_types.cl b/kernels/compiler_data_types.cl
new file mode 100644
index 0000000..79b06f3
--- /dev/null
+++ b/kernels/compiler_data_types.cl
@@ -0,0 +1,80 @@
+/* OpenCL 1.1 Supported Data Types */
+__kernel void compiler_data_types()
+{
+  // built-in scalar data types (section 6.1.1)
+  bool b;
+  b = true;
+  b = false;
+  char c;
+  unsigned char uc;
+  uchar uc_2;
+  short s;
+  unsigned short us;
+  ushort us_2;
+  int i;
+  unsigned int ui;
+  uint ui_2;
+  long l;
+  unsigned long ul;
+  ulong ul_2;
+  float f;
+  half h;
+  size_t sz;
+  ptrdiff_t pt;
+  intptr_t it;
+  uintptr_t uit;
+  
+  // built-in vector data types (section 6.1.2)
+  // supported values of $n$ are 2, 3, 4, 8, 16 for all vector data types
+#define VEC(sz) char##sz c##sz;   \
+                uchar##sz uc##sz; \
+                short##sz s##sz;  \
+                ushort##sz us##sz;\
+                int##sz i##sz;    \
+                uint##sz ui##sz;  \
+                long##sz l##sz;   \
+                ulong##sz ul##sz; \
+                float##sz f##sz;
+#if 1
+   VEC(2);
+   VEC(3);
+   VEC(4);
+   VEC(8);
+   VEC(16);
+#endif
+   float16 f_16 = (float16)(1.0f);
+   f_16.s0 += 1;
+   f_16.s1 += 1;
+   f_16.s2 += 1;
+   f_16.s3 += 1;
+   f_16.s4 += 1;
+   f_16.s5 += 1;
+   f_16.s6 += 1;
+   f_16.s7 += 1;
+   f_16.s8 += 1;
+   f_16.s9 += 1;
+   f_16.sa += 1;
+   f_16.sb += 1;
+   f_16.sc += 1;
+   f_16.sd += 1;
+   f_16.se += 1;
+   f_16.sf += 1;
+   f_16.sA += 1;
+   f_16.sB += 1;
+   f_16.sC += 1;
+   f_16.sD += 1;
+   f_16.sE += 1;
+   f_16.sF += 1;
+   float8 f_8;
+   f_8 = f_16.lo;
+   f_8 = f_16.hi;
+   f_8 = f_16.odd;
+   f_8 = f_16.even;
+   uint4 u_4 = (uint4)(1);
+
+   // Other built-in data types (section 6.1.3)
+   image2d_t i2dt;
+   image3d_t i3dt;
+   sampler_t st;
+   event_t et;
+}
diff --git a/kernels/compiler_degrees.cl b/kernels/compiler_degrees.cl
new file mode 100644
index 0000000..5fad995
--- /dev/null
+++ b/kernels/compiler_degrees.cl
@@ -0,0 +1,4 @@
+kernel void compiler_degrees(global float *src, global float *dst) {
+  int i = get_global_id(0);
+  dst[i] = degrees(src[i]);
+}
diff --git a/kernels/compiler_displacement_map_element.cl b/kernels/compiler_displacement_map_element.cl
new file mode 100644
index 0000000..ee40ad5
--- /dev/null
+++ b/kernels/compiler_displacement_map_element.cl
@@ -0,0 +1,11 @@
+kernel void compiler_displacement_map_element(const global uint *in, const global uint *offset, int w, int h, global uint *out) {
+    const int cx = get_global_id(0);
+    const int cy = get_global_id(1);
+    uint c = offset[cy * w + cx];
+    int x_pos = cx + c;
+    int y_pos = cy + c;
+    if(0 <= x_pos && x_pos < w && 0 <= y_pos && y_pos < h)
+        out[cy * w + cx] = in[y_pos * w + x_pos];
+    else
+        out[cy * w + cx] = 0;
+}
diff --git a/kernels/compiler_double.cl b/kernels/compiler_double.cl
new file mode 100644
index 0000000..a84f142
--- /dev/null
+++ b/kernels/compiler_double.cl
@@ -0,0 +1,9 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_double(global double *src, global double *dst) {
+  int i = get_global_id(0);
+  double d = 1.234567890123456789;
+  if (i < 14)
+    dst[i] = d * (src[i] + d);
+  else
+    dst[i] = 14;
+}
diff --git a/kernels/compiler_double_2.cl b/kernels/compiler_double_2.cl
new file mode 100644
index 0000000..20ee614
--- /dev/null
+++ b/kernels/compiler_double_2.cl
@@ -0,0 +1,9 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_double_2(global float *src, global double *dst) {
+  int i = get_global_id(0);
+  float d = 1.234567890123456789f;
+  if (i < 14)
+    dst[i] = d * (d + src[i]);
+  else
+    dst[i] = 14;
+}
diff --git a/kernels/compiler_double_3.cl b/kernels/compiler_double_3.cl
new file mode 100644
index 0000000..8b32404
--- /dev/null
+++ b/kernels/compiler_double_3.cl
@@ -0,0 +1,6 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_double_3(global float *src, global double *dst) {
+  int i = get_global_id(0);
+  float d = 1.234567890123456789f;
+  dst[i] = i < 14 ? d : 14;
+}
diff --git a/kernels/compiler_double_4.cl b/kernels/compiler_double_4.cl
new file mode 100644
index 0000000..e5e46f9
--- /dev/null
+++ b/kernels/compiler_double_4.cl
@@ -0,0 +1,5 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_double_4(global double *src1, global double *src2, global double *dst) {
+  int i = get_global_id(0);
+  dst[i] = src1[i] + src2[i];
+}
diff --git a/kernels/compiler_event.cl b/kernels/compiler_event.cl
new file mode 100644
index 0000000..a901b05
--- /dev/null
+++ b/kernels/compiler_event.cl
@@ -0,0 +1,6 @@
+__kernel void
+compiler_event(__global int *dst, int value)
+{
+  int id = (int)get_global_id(0);
+  dst[id] += value;
+}
diff --git a/kernels/compiler_fabs.cl b/kernels/compiler_fabs.cl
new file mode 100644
index 0000000..016deb8
--- /dev/null
+++ b/kernels/compiler_fabs.cl
@@ -0,0 +1,5 @@
+kernel void compiler_fabs(global float *src, global float *dst) {
+  int i = get_global_id(0);
+  dst[i] = fabs(src[i]);
+}
+
diff --git a/kernels/compiler_function_argument.cl b/kernels/compiler_function_argument.cl
new file mode 100644
index 0000000..fe6de28
--- /dev/null
+++ b/kernels/compiler_function_argument.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_function_argument(__global int *dst, int value)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = value;
+}
+
diff --git a/kernels/compiler_function_argument0.cl b/kernels/compiler_function_argument0.cl
new file mode 100644
index 0000000..6bc2e92
--- /dev/null
+++ b/kernels/compiler_function_argument0.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_function_argument0(__global int *dst, short value)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = value;
+}
+
diff --git a/kernels/compiler_function_argument1.cl b/kernels/compiler_function_argument1.cl
new file mode 100644
index 0000000..8842b0b
--- /dev/null
+++ b/kernels/compiler_function_argument1.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_function_argument1(__global int *dst, char value, short value0, int value1)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = value + value0 + value1;
+}
+
diff --git a/kernels/compiler_function_argument2.cl b/kernels/compiler_function_argument2.cl
new file mode 100644
index 0000000..24e5795
--- /dev/null
+++ b/kernels/compiler_function_argument2.cl
@@ -0,0 +1,12 @@
+__kernel void compiler_function_argument2(
+char8 c, uchar8 uc, short8 s, ushort8 us, int8 i, uint8 ui, float8 f,
+__global float8 *result)
+{
+  result[0] = convert_float8(c);
+  result[1] = convert_float8(uc);
+  result[2] = convert_float8(s);
+  result[3] = convert_float8(us);
+  result[4] = convert_float8(i);
+  result[5] = convert_float8(ui);
+  result[6] = f;
+}
diff --git a/kernels/compiler_function_argument3.cl b/kernels/compiler_function_argument3.cl
new file mode 100644
index 0000000..9395cd7
--- /dev/null
+++ b/kernels/compiler_function_argument3.cl
@@ -0,0 +1,71 @@
+struct sfloat8 {
+    float a;
+    float b;
+    float c;
+    float d;
+    float e;
+    float f;
+    float g;
+    float h;
+};
+
+
+__kernel void compiler_function_argument3(
+struct sfloat8 f, __global struct sfloat8 *result)
+{
+  result[0].a = f.a;
+  result[0].b = 12.0f;
+  result[0].c = 12.0f;
+  result[0].d = 12.0f;
+  result[0].e = 12.0f;
+  result[0].f = 12.0f;
+  result[0].g = 12.0f;
+  result[0].h = f.a + f.h;
+
+  result[1].a = f.a;
+  result[1].b = 12.0f;
+  result[1].c = 12.0f;
+  result[1].d = 12.0f;
+  result[1].e = 12.0f;
+  result[1].f = 12.0f;
+  result[1].g = 12.0f;
+  result[1].h = f.a + f.h;
+
+  result[2].a = f.a;
+  result[2].b = 12.0f;
+  result[2].c = 12.0f;
+  result[2].d = 12.0f;
+  result[2].e = 12.0f;
+  result[2].f = 12.0f;
+  result[2].g = 12.0f;
+  result[2].h = f.a + f.h;
+
+  result[3].a = f.a;
+  result[3].b = 12.0f;
+  result[3].c = 12.0f;
+  result[3].d = 12.0f;
+  result[3].e = 12.0f;
+  result[3].f = 12.0f;
+  result[3].g = 12.0f;
+  result[3].h = f.a + f.h;
+
+  result[4].a = f.a;
+  result[4].b = 12.0f;
+  result[4].c = 12.0f;
+  result[4].d = 12.0f;
+  result[4].e = 12.0f;
+  result[4].f = 12.0f;
+  result[4].g = 12.0f;
+  result[4].h = f.a + f.h;
+
+  result[5].a = f.a;
+  result[5].b = 12.0f;
+  result[5].c = 12.0f;
+  result[5].d = 12.0f;
+  result[5].e = 12.0f;
+  result[5].f = 12.0f;
+  result[5].g = 12.0f;
+  result[5].h = f.a + f.h;
+
+  result[6] = result[0];
+}
diff --git a/kernels/compiler_function_constant.cl b/kernels/compiler_function_constant.cl
new file mode 100644
index 0000000..ca7e874
--- /dev/null
+++ b/kernels/compiler_function_constant.cl
@@ -0,0 +1,6 @@
+__kernel void
+compiler_function_constant(__constant short *c, __global int *dst, int value)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = value + c[id%69];
+}
diff --git a/kernels/compiler_function_constant0.cl b/kernels/compiler_function_constant0.cl
new file mode 100644
index 0000000..5340352
--- /dev/null
+++ b/kernels/compiler_function_constant0.cl
@@ -0,0 +1,6 @@
+__kernel void
+compiler_function_constant0(__constant int *c0, __constant char *c1, __global int *dst, int value)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = value + c0[id%69] + c1[0];
+}
diff --git a/kernels/compiler_function_qualifiers.cl b/kernels/compiler_function_qualifiers.cl
new file mode 100644
index 0000000..c904c84
--- /dev/null
+++ b/kernels/compiler_function_qualifiers.cl
@@ -0,0 +1,9 @@
+/* test OpenCL 1.1 Function Qualifiers (section 6.7) */
+kernel void compiler_function_qualifiers()
+__attribute__((vec_type_hint(float)))
+__attribute__((work_group_size_hint(4,1,1)))
+__attribute__((reqd_work_group_size(4,1,1)));
+
+kernel void compiler_function_qualifiers()
+{
+}
diff --git a/kernels/compiler_gather_register_file.cl b/kernels/compiler_gather_register_file.cl
new file mode 100644
index 0000000..773797d
--- /dev/null
+++ b/kernels/compiler_gather_register_file.cl
@@ -0,0 +1,10 @@
+__kernel void
+compiler_gather_register_file(__global uint *src, __global uint *dst)
+{
+  __gen_ocl_force_simd16();
+  int id = (int)get_global_id(0);
+  const int x0 = src[id];
+  const unsigned short index = get_global_id(0);
+  dst[id] = __gen_ocl_rgather(index, x0);
+}
+
diff --git a/kernels/compiler_gather_register_file0.cl b/kernels/compiler_gather_register_file0.cl
new file mode 100644
index 0000000..0e6d487
--- /dev/null
+++ b/kernels/compiler_gather_register_file0.cl
@@ -0,0 +1,10 @@
+__kernel void
+compiler_gather_register_file0(__global uint *src, __global uint *dst)
+{
+  __gen_ocl_force_simd16();
+  int id = (int)get_global_id(0);
+  const int x0 = src[id];
+  const unsigned short index = 15 - get_global_id(0);
+  dst[id] = __gen_ocl_rgather(index, x0);
+}
+
diff --git a/kernels/compiler_gather_register_file1.cl b/kernels/compiler_gather_register_file1.cl
new file mode 100644
index 0000000..184202c
--- /dev/null
+++ b/kernels/compiler_gather_register_file1.cl
@@ -0,0 +1,11 @@
+__kernel void
+compiler_gather_register_file1(__global uint *src, __global uint *dst)
+{
+  __gen_ocl_force_simd16();
+  int id = (int)get_global_id(0);
+  const int x0 = src[id];
+  const int x1 = src[id+16];
+  const unsigned short index = 2*get_global_id(0);
+  dst[id] = __gen_ocl_rgather(index, x0, x1);
+}
+
diff --git a/kernels/compiler_geometric_builtin.cl b/kernels/compiler_geometric_builtin.cl
new file mode 100644
index 0000000..34ff761
--- /dev/null
+++ b/kernels/compiler_geometric_builtin.cl
@@ -0,0 +1,11 @@
+kernel void compiler_geometric_builtin() {
+  float x = 1, y = 2, z = 3;
+  z = dot(x, y);
+  z = cross(x, y);
+  z = distance(x, y);
+  z = length(x);
+  z = normalize(x);
+  z = fast_distance(x, y);
+  z = fast_length(x, y);
+  z = fast_normalize(x);
+}
diff --git a/kernels/compiler_getelementptr_bitcast.cl b/kernels/compiler_getelementptr_bitcast.cl
new file mode 100644
index 0000000..0320abf
--- /dev/null
+++ b/kernels/compiler_getelementptr_bitcast.cl
@@ -0,0 +1,18 @@
+__kernel void compiler_getelementptr_bitcast(global float *src, global float *dst)
+{
+  int i = get_global_id(0);
+
+  __local  float ldata[256];
+  ldata[get_local_id(0)] = src[i];
+
+  // If get_local_id(0) is used to index ldata, the issue is not reproduced,
+  // so the work-group size is simply set to 1 in the application.
+  __local uchar *  pldata = (__local uchar *)&ldata[0];
+  uchar data;
+  for(int k = 0; k < 3; k++){
+    data = *pldata;
+    pldata++;
+  }
+
+  dst[i] = data;
+}
diff --git a/kernels/compiler_global_constant.cl b/kernels/compiler_global_constant.cl
new file mode 100644
index 0000000..c0e23d1
--- /dev/null
+++ b/kernels/compiler_global_constant.cl
@@ -0,0 +1,76 @@
+constant int m[3] = {71,72,73};
+const constant int n = 1;
+constant int o[3] = {3, 2, 1};
+
+constant int4 a= {1, 2, 3, 4};
+constant int4 b = {0, -1, -2, -3};
+
+struct Person {
+  char name[7];
+  int3 idNumber;
+};
+
+struct Test1 {
+  int a0;
+  char a1;
+};
+
+struct Test2 {
+  char a0;
+  int a1;
+};
+struct Test3 {
+  int a0;
+  int a1;
+};
+struct Test4 {
+  float a0;
+  float a1;
+};
+
+constant struct Person james= {{"james"}, (int3)(1, 2, 3)};
+constant struct Test1 t0 = {1, 2};
+constant struct Test2 t1 = {1, 2};
+
+constant int3 c[3] = {(int3)(0, 1, 2), (int3)(3, 4, 5), (int3)(6,7,8) };
+constant char4 d[3] = {(char4)(0, 1, 2, 3), (char4)(4, 5, 6, 7), (char4)(8, 9, 10, 11)};
+
+constant struct Person members[3] = {{{"abc"}, (int3)(1, 2, 3)}, { {"defg"}, (int3)(4,5,6)}, { {"hijk"}, (int3)(7,8,9)} };
+constant struct Test3 zero_struct = {0, 0};
+constant int3 zero_vec = {0,0,0};
+constant int zero_arr[3] = {0,0,0};
+constant float zero_flt[3] = {0.0f, 0.0f, 0.0f};
+
+__kernel void
+compiler_global_constant(__global int *dst, int e, int r)
+{
+  int id = (int)get_global_id(0);
+
+  int4 x = a + b;
+  dst[id] = m[id%3] * n * o[2] + e + r *x.y * a.x + zero_struct.a0 + zero_vec.x + zero_arr[1] + (int)zero_flt[2];
+}
+// array of vectors
+__kernel void
+compiler_global_constant1(__global int *dst)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = c[id%3].y + d[id%3].w;
+}
+
+// structure
+__kernel void
+compiler_global_constant2(__global int *dst)
+{
+  int id = (int)get_global_id(0);
+
+  dst[id] = james.idNumber.y + t0.a1 + t1.a1;
+}
+
+//array of structure
+__kernel void
+compiler_global_constant3(__global int *dst)
+{
+  int id = (int)get_global_id(0);
+
+  dst[id] = members[id%3].idNumber.z + members[id%3].name[2];
+}
diff --git a/kernels/compiler_global_constant_2.cl b/kernels/compiler_global_constant_2.cl
new file mode 100644
index 0000000..04536c7
--- /dev/null
+++ b/kernels/compiler_global_constant_2.cl
@@ -0,0 +1,20 @@
+constant int m[3] = {0x15b,0x25b,0x35b};
+constant short t[5] = {0x45b,0x55b,0x65b,0x75b,0x85b};
+constant long n[3] = {0x15b,0x25b,0xFFFFFFFFF};
+constant long p[3] = {1,1,1};
+constant long s = 1;
+
+
+__kernel void
+compiler_global_constant_2(__global int *dst, int e, int r)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = m[id%3] + t[id%5] + e + r;
+}
+
+__kernel void
+compiler_global_constant_2_long(__global long *dst, int e, int r)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = n[id%3]*p[1] + e*s + r;
+}
diff --git a/kernels/compiler_global_memory_barrier.cl b/kernels/compiler_global_memory_barrier.cl
new file mode 100644
index 0000000..99bb940
--- /dev/null
+++ b/kernels/compiler_global_memory_barrier.cl
@@ -0,0 +1,7 @@
+__kernel void compiler_global_memory_barrier(__global int *dst, __global int *src) {
+  src[get_local_size(0) * (2 * get_group_id(0)) + get_local_id(0)] = get_local_id(0);
+  src[get_local_size(0) * (2 * get_group_id(0) + 1) + get_local_id(0)] = get_local_id(0);
+  barrier(CLK_GLOBAL_MEM_FENCE);
+  dst[get_local_size(0) * (2 * get_group_id(0)) + get_local_id(0)] = src[get_local_size(0) * 2 * get_group_id(0) + get_local_size(0) - (get_local_id(0) + 1)];
+  dst[get_local_size(0) * (2 * get_group_id(0) + 1) + get_local_id(0)] = src[get_local_size(0) * (2 * get_group_id(0) + 1) + get_local_size(0) - (get_local_id(0) + 1)];
+}
diff --git a/kernels/compiler_group_size.cl b/kernels/compiler_group_size.cl
new file mode 100644
index 0000000..4e2c333
--- /dev/null
+++ b/kernels/compiler_group_size.cl
@@ -0,0 +1,29 @@
+__kernel void
+compiler_group_size(__global unsigned int *dst)
+{
+  uint idx = (uint)get_global_id(0);
+  uint idy = (uint)get_global_id(1);
+  uint idz = (uint)get_global_id(2);
+  uint size_x = (uint)get_global_size(0);
+  uint size_y = (uint)get_global_size(1);
+
+  dst[idz*size_x*size_y + idy*size_x + idx] = idz*size_x*size_y + idy*size_x +idx;
+}
+
+struct xyz{
+  unsigned short b;
+  unsigned short e;
+  unsigned int o;
+};
+
+__kernel void
+compiler_group_size4(__global struct xyz *src, __global unsigned int *dst, unsigned int num, unsigned int c)
+{
+  uint idx = (uint)get_global_id(0);
+  if(idx>=num)
+    return;
+  struct xyz td = src[idx];
+  for(unsigned x = td.b;x<=td.e;x++)
+    dst[td.o+x] = c;
+}
+
diff --git a/kernels/compiler_hadd.cl b/kernels/compiler_hadd.cl
new file mode 100644
index 0000000..fe50195
--- /dev/null
+++ b/kernels/compiler_hadd.cl
@@ -0,0 +1,4 @@
+kernel void compiler_hadd(global int *src1, global int *src2, global int *dst) {
+  int i = get_global_id(0);
+  dst[i] = hadd(src1[i], src2[i]);
+}
diff --git a/kernels/compiler_if_else.cl b/kernels/compiler_if_else.cl
new file mode 100644
index 0000000..7ae8f99
--- /dev/null
+++ b/kernels/compiler_if_else.cl
@@ -0,0 +1,14 @@
+__kernel void
+compiler_if_else(__global int *src, __global int *dst)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = src[id];
+  if (dst[id] >= 0) {
+    dst[id] = src[id+1];
+    src[id] = 1;
+  } else {
+    dst[id]--;
+    src[id] = 2;
+  }
+}
+
diff --git a/kernels/compiler_insert_to_constant.cl b/kernels/compiler_insert_to_constant.cl
new file mode 100644
index 0000000..f94c5c3
--- /dev/null
+++ b/kernels/compiler_insert_to_constant.cl
@@ -0,0 +1,6 @@
+__kernel void compiler_insert_to_constant(__global int4 *dst) {
+  int4 value = (int4)(0,1,2,3);
+  value.z = get_global_id(0);
+  dst[get_global_id(0)] = value;
+}
+
diff --git a/kernels/compiler_insert_vector.cl b/kernels/compiler_insert_vector.cl
new file mode 100644
index 0000000..0f0e20f
--- /dev/null
+++ b/kernels/compiler_insert_vector.cl
@@ -0,0 +1,11 @@
+__kernel void
+compiler_insert_vector(__global int4 *out )
+{
+    int tid = get_global_id(0);
+    int4 output = (int4)(0, 0, 0, 1); //black
+    if (tid > 16)
+    {
+        output = (int4)(tid, tid, 1, 1);
+    }
+    out[tid] = output;
+}
diff --git a/kernels/compiler_insn_selection_masked_min_max.cl b/kernels/compiler_insn_selection_masked_min_max.cl
new file mode 100644
index 0000000..5b4be57
--- /dev/null
+++ b/kernels/compiler_insn_selection_masked_min_max.cl
@@ -0,0 +1,11 @@
+__kernel void
+compiler_insn_selection_masked_min_max(__global float* src, __global float* dst)
+{
+  int id = (int)get_global_id(0);
+  if (get_local_id(0) > 5)
+    dst[id] = max(src[id], src[7]);
+  else
+    dst[id] = min(src[id], src[10]);
+}
+
+
diff --git a/kernels/compiler_insn_selection_max.cl b/kernels/compiler_insn_selection_max.cl
new file mode 100644
index 0000000..762de2b
--- /dev/null
+++ b/kernels/compiler_insn_selection_max.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_insn_selection_max(__global float* src, __global float* dst)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = max(src[id], src[0]);
+}
+
diff --git a/kernels/compiler_insn_selection_min.cl b/kernels/compiler_insn_selection_min.cl
new file mode 100644
index 0000000..6800eaf
--- /dev/null
+++ b/kernels/compiler_insn_selection_min.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_insn_selection_min(__global float* src, __global float* dst)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = min(src[id], src[0]);
+}
+
diff --git a/kernels/compiler_integer_builtin.cl b/kernels/compiler_integer_builtin.cl
new file mode 100644
index 0000000..4faacd6
--- /dev/null
+++ b/kernels/compiler_integer_builtin.cl
@@ -0,0 +1,23 @@
+/* test OpenCL 1.1 Integer Built-in Functions (section 6.11.3) */
+__kernel void compiler_integer_builtin() {
+  int i = 0, i1 = -1, i2 = -2;
+  unsigned u = 1, u1 = 2, u2 = 3;
+  i = CHAR_MAX;
+  i = abs(u);
+  i = abs_diff(u1, u2);
+  i = add_sat(i1, i2);
+  i = hadd(i1, i2);
+  i = rhadd(i1, i2);
+  i = clz(i);
+  i = clamp(i, i1, i2);
+  i = mad_hi(i, i1, i2);
+  i = mad_sat(i, i1, i2);
+  i = max(i1, i2);
+  i = min(i1, i2);
+  i = mul_hi(i1, i2);
+  i = rotate(i1, i2);
+  i = sub_sat(i1, i2);
+  long l = upsample(i, u);
+  i = mad24(i, i1, i2);
+  i = mul24(i1, i2);
+}
diff --git a/kernels/compiler_integer_division.cl b/kernels/compiler_integer_division.cl
new file mode 100644
index 0000000..146daa0
--- /dev/null
+++ b/kernels/compiler_integer_division.cl
@@ -0,0 +1,6 @@
+__kernel void
+compiler_integer_division(__global int *src, __global int *dst, int x)
+{
+  dst[get_global_id(0)] = src[get_global_id(0)] / x;
+}
+
diff --git a/kernels/compiler_integer_remainder.cl b/kernels/compiler_integer_remainder.cl
new file mode 100644
index 0000000..73558cb
--- /dev/null
+++ b/kernels/compiler_integer_remainder.cl
@@ -0,0 +1,6 @@
+__kernel void
+compiler_integer_remainder(__global int *src, __global int *dst, int x)
+{
+  dst[get_global_id(0)] = src[get_global_id(0)] % x;
+}
+
diff --git a/kernels/compiler_load_bool_imm.cl b/kernels/compiler_load_bool_imm.cl
new file mode 100644
index 0000000..fda49b9
--- /dev/null
+++ b/kernels/compiler_load_bool_imm.cl
@@ -0,0 +1,12 @@
+__kernel void
+compiler_load_bool_imm(__global int *dst, __local int *localBuffer, int copiesPerWorkItem )
+{
+  int i;
+  for(i=0; i<copiesPerWorkItem; i++)
+    localBuffer[get_local_id(0)*copiesPerWorkItem+i] = copiesPerWorkItem;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(i=0; i<copiesPerWorkItem; i++)
+    dst[get_global_id(0)*copiesPerWorkItem + i] = localBuffer[get_local_id(0)*copiesPerWorkItem+i];
+  barrier(CLK_LOCAL_MEM_FENCE);
+}
diff --git a/kernels/compiler_local_memory_barrier.cl b/kernels/compiler_local_memory_barrier.cl
new file mode 100644
index 0000000..39a94b8
--- /dev/null
+++ b/kernels/compiler_local_memory_barrier.cl
@@ -0,0 +1,6 @@
+__kernel void compiler_local_memory_barrier(__global int *dst, __local int *src) {
+  src[get_local_id(0)] = get_local_id(0);
+  barrier(CLK_LOCAL_MEM_FENCE);
+  dst[get_global_id(0)] = src[15 - get_local_id(0)];
+}
+
diff --git a/kernels/compiler_local_memory_barrier_2.cl b/kernels/compiler_local_memory_barrier_2.cl
new file mode 100644
index 0000000..dca4a9c
--- /dev/null
+++ b/kernels/compiler_local_memory_barrier_2.cl
@@ -0,0 +1,7 @@
+__kernel void compiler_local_memory_barrier_2(__global int *dst, __local int *src) {
+  src[get_local_id(0)] = get_local_id(0);
+  src[get_local_size(0) + get_local_id(0)] = get_local_id(0);
+  barrier(CLK_LOCAL_MEM_FENCE);
+  dst[get_local_size(0) * (2 * get_group_id(0)) + get_local_id(0)] = src[get_local_size(0) - (get_local_id(0) + 1)];
+  dst[get_local_size(0) * (2 * get_group_id(0) + 1) + get_local_id(0)] = src[get_local_size(0) + get_local_size(0) - (get_local_id(0) + 1)];
+}
diff --git a/kernels/compiler_local_memory_barrier_wg64.cl b/kernels/compiler_local_memory_barrier_wg64.cl
new file mode 100644
index 0000000..b2ea906
--- /dev/null
+++ b/kernels/compiler_local_memory_barrier_wg64.cl
@@ -0,0 +1,6 @@
+__kernel void compiler_local_memory_barrier_wg64(__global int *dst, __local int *src) {
+  src[get_local_id(0)] = get_local_id(0);
+  barrier(CLK_LOCAL_MEM_FENCE);
+  dst[get_global_id(0)] = src[63 - get_local_id(0)];
+}
+
diff --git a/kernels/compiler_local_memory_two_ptr.cl b/kernels/compiler_local_memory_two_ptr.cl
new file mode 100644
index 0000000..46589ba
--- /dev/null
+++ b/kernels/compiler_local_memory_two_ptr.cl
@@ -0,0 +1,10 @@
+__kernel void compiler_local_memory_two_ptr(__global int *dst,
+                                            __local int *src0,
+                                            __local int *src1)
+{
+  src0[get_local_id(0)] = get_local_id(0);
+  src1[get_local_id(0)] = get_global_id(0);
+  barrier(CLK_LOCAL_MEM_FENCE);
+  dst[get_global_id(0)] = src0[15 - get_local_id(0)] + src1[15 - get_local_id(0)];
+}
+
diff --git a/kernels/compiler_local_slm.cl b/kernels/compiler_local_slm.cl
new file mode 100644
index 0000000..52c078c
--- /dev/null
+++ b/kernels/compiler_local_slm.cl
@@ -0,0 +1,24 @@
+struct Test{
+  char t0;
+  int t1;
+};
+
+constant int two= 2;
+
+__kernel void compiler_local_slm(__global int *dst) {
+  __local int hop[16];
+  __local char a;
+  __local struct Test c;
+
+  c.t1 = get_group_id(0);
+  a = two; // clang currently seems to have a bug if I write 'a=2;', so work around it for now.
+  hop[get_local_id(0)] = get_local_id(0);
+  barrier(CLK_LOCAL_MEM_FENCE);
+  dst[get_global_id(0)] = hop[get_local_id(0)] + (int)a + hop[1] + c.t1;
+}
+
+__kernel void compiler_local_slm1(__global ulong *dst) {
+  __local int hop[16];
+  dst[1] = (ulong)&hop[1];
+  dst[0] = (ulong)&hop[0];
+}
diff --git a/kernels/compiler_long.cl b/kernels/compiler_long.cl
new file mode 100644
index 0000000..e69c5bf
--- /dev/null
+++ b/kernels/compiler_long.cl
@@ -0,0 +1,8 @@
+kernel void compiler_long(global long *src1, global long *src2, global long *dst, long zero) {
+  int i = get_global_id(0);
+
+  if(i < 5)
+    dst[i] = src1[i] + src2[i] + src2[i]*zero;
+  if(i > 5)
+    dst[i] = src1[i] - src2[i] - zero;
+}
diff --git a/kernels/compiler_long_2.cl b/kernels/compiler_long_2.cl
new file mode 100644
index 0000000..92be93a
--- /dev/null
+++ b/kernels/compiler_long_2.cl
@@ -0,0 +1,20 @@
+kernel void compiler_long_2(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  switch(i) {
+    case 0:
+      dst[i] = 0xFEDCBA9876543210UL;
+      break;
+    case 1:
+      dst[i] = src1[i] & src2[i];
+      break;
+    case 2:
+      dst[i] = src1[i] | src2[i];
+      break;
+    case 3:
+      dst[i] = src1[i] ^ src2[i];
+      break;
+    case 4:
+      dst[i] = src1[i] ? 0x1122334455667788L : 0x8877665544332211UL;
+      break;
+  }
+}
diff --git a/kernels/compiler_long_asr.cl b/kernels/compiler_long_asr.cl
new file mode 100644
index 0000000..901630b
--- /dev/null
+++ b/kernels/compiler_long_asr.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long_asr(global long *src, global long *dst) {
+  int i = get_global_id(0);
+  if(i > 7)
+    dst[i] = src[i] >> i;
+  else
+    dst[i] = src[i] + 1;
+}
diff --git a/kernels/compiler_long_cmp.cl b/kernels/compiler_long_cmp.cl
new file mode 100644
index 0000000..90dfb60
--- /dev/null
+++ b/kernels/compiler_long_cmp.cl
@@ -0,0 +1,29 @@
+kernel void compiler_long_cmp_l(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  dst[i] = (src1[i] < src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_le(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  dst[i] = (src1[i] <= src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_g(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  dst[i] = (src1[i] > src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_ge(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  dst[i] = (src1[i] >= src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_eq(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  dst[i] = (src1[i] == src2[i]) ? 3 : 4;
+}
+
+kernel void compiler_long_cmp_neq(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  dst[i] = (src1[i] != src2[i]) ? 3 : 4;
+}
diff --git a/kernels/compiler_long_convert.cl b/kernels/compiler_long_convert.cl
new file mode 100644
index 0000000..e5f7939
--- /dev/null
+++ b/kernels/compiler_long_convert.cl
@@ -0,0 +1,19 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+kernel void compiler_long_convert(global char *src1, global short *src2, global int *src3, global long *dst1, global long *dst2, global long *dst3) {
+  int i = get_global_id(0);
+  dst1[i] = src1[i];
+  dst2[i] = src2[i];
+  dst3[i] = src3[i];
+}
+
+kernel void compiler_long_convert_2(global char *dst1, global short *dst2, global int *dst3, global long *src) {
+  int i = get_global_id(0);
+  dst1[i] = src[i];
+  dst2[i] = src[i];
+  dst3[i] = src[i];
+}
+
+kernel void compiler_long_convert_to_float(global float *dst, global long *src) {
+  int i = get_global_id(0);
+  dst[i] = src[i];
+}
diff --git a/kernels/compiler_long_mult.cl b/kernels/compiler_long_mult.cl
new file mode 100644
index 0000000..5b96d74
--- /dev/null
+++ b/kernels/compiler_long_mult.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long_mult(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  if(i < 3)
+    dst[i] = src1[i] + src2[i];
+  else
+    dst[i] = src1[i] * src2[i];
+}
diff --git a/kernels/compiler_long_shl.cl b/kernels/compiler_long_shl.cl
new file mode 100644
index 0000000..3786b77
--- /dev/null
+++ b/kernels/compiler_long_shl.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long_shl(global long *src, global long *dst) {
+  int i = get_global_id(0);
+  if(i > 7)
+    dst[i] = src[i] << i;
+  else
+    dst[i] = src[i] + 1;
+}
diff --git a/kernels/compiler_long_shr.cl b/kernels/compiler_long_shr.cl
new file mode 100644
index 0000000..d4e859c
--- /dev/null
+++ b/kernels/compiler_long_shr.cl
@@ -0,0 +1,7 @@
+kernel void compiler_long_shr(global ulong *src, global ulong *dst) {
+  int i = get_global_id(0);
+  if(i > 7)
+    dst[i] = src[i] >> i;
+  else
+    dst[i] = src[i] + 1;
+}
diff --git a/kernels/compiler_lower_return0.cl b/kernels/compiler_lower_return0.cl
new file mode 100644
index 0000000..fd9846e
--- /dev/null
+++ b/kernels/compiler_lower_return0.cl
@@ -0,0 +1,8 @@
+__kernel void
+compiler_lower_return0(__global int *src, __global int *dst) {
+  const int id = get_global_id(0);
+  dst[id] = id;
+  if (src[id] > 0) return;
+  dst[id] = src[id];
+}
+
diff --git a/kernels/compiler_lower_return1.cl b/kernels/compiler_lower_return1.cl
new file mode 100644
index 0000000..bcb6b7f
--- /dev/null
+++ b/kernels/compiler_lower_return1.cl
@@ -0,0 +1,8 @@
+__kernel void
+compiler_lower_return1(__global int *src, __global int *dst) {
+  const int id = get_global_id(0);
+  dst[id] = id;
+  if (id < 11 && (src[id] > 0 || src[id+16] < 2)) return;
+  dst[id] = src[id];
+}
+
diff --git a/kernels/compiler_lower_return2.cl b/kernels/compiler_lower_return2.cl
new file mode 100644
index 0000000..9fa8ad6
--- /dev/null
+++ b/kernels/compiler_lower_return2.cl
@@ -0,0 +1,11 @@
+__kernel void
+compiler_lower_return2(__global int *src, __global int *dst) {
+  const int id = get_global_id(0);
+  dst[id] = id;
+  while (dst[id] > src[id]) {
+    if (dst[id] > 10) return;
+    dst[id]--;
+  }
+  dst[id] += 2;
+}
+
diff --git a/kernels/compiler_mad24.cl b/kernels/compiler_mad24.cl
new file mode 100644
index 0000000..04bb2c5
--- /dev/null
+++ b/kernels/compiler_mad24.cl
@@ -0,0 +1,4 @@
+kernel void compiler_mad24(global int *src1, global int *src2, global int *src3, global int *dst) {
+  int i = get_global_id(0);
+  dst[i] = mad24(src1[i], src2[i], src3[i]);
+}
diff --git a/kernels/compiler_mad_hi.cl b/kernels/compiler_mad_hi.cl
new file mode 100644
index 0000000..82b09c7
--- /dev/null
+++ b/kernels/compiler_mad_hi.cl
@@ -0,0 +1,4 @@
+kernel void compiler_mad_hi(global int *src1, global int *src2, global int *src3, global int *dst) {
+  int i = get_global_id(0);
+  dst[i] = mad_hi(src1[i], src2[i], src3[i]);
+}
diff --git a/kernels/compiler_mandelbrot.cl b/kernels/compiler_mandelbrot.cl
new file mode 100644
index 0000000..d15ccd0
--- /dev/null
+++ b/kernels/compiler_mandelbrot.cl
@@ -0,0 +1,47 @@
+// Used to index into the 1D array, so that we can use
+// it effectively as a 2D array
+inline int ID(int x, int y, int width) { return 4*width*y + x*4; }
+inline float mapX(float x) { return x*3.25f - 2.f; }
+inline float mapY(float y) { return y*2.5f - 1.25f; }
+
+__kernel void compiler_mandelbrot(__global char *out) {
+  int x_dim = get_global_id(0);
+  int y_dim = get_global_id(1);
+  int width = get_global_size(0);
+  int height = get_global_size(1);
+  int idx = ID(x_dim, y_dim, width);
+
+  float x_origin = mapX((float) x_dim / (float) width);
+  float y_origin = mapY((float) y_dim / (float) height);
+
+  // The escape time algorithm; it follows the pseudocode from Wikipedia
+  // _very_ closely
+  float x = 0.0f;
+  float y = 0.0f;
+
+  int iteration = 0;
+
+  // This can be changed to make the result more or less precise
+  int max_iteration = 256;
+  while(x*x + y*y <= 4 && iteration < max_iteration) {
+    float xtemp = x*x - y*y + x_origin;
+    y = 2*x*y + y_origin;
+    x = xtemp;
+    iteration++;
+  }
+
+  if(iteration == max_iteration) {
+    // This coordinate did not escape, so it is in the Mandelbrot set
+    out[idx] = 0;
+    out[idx + 1] = 0;
+    out[idx + 2] = 0;
+    out[idx + 3] = 255;
+  } else {
+    // This coordinate did escape, so color it based on how quickly it escaped
+    out[idx] = iteration;
+    out[idx + 1] = iteration;
+    out[idx + 2] = iteration;
+    out[idx + 3] = 255;
+  }
+
+}
diff --git a/kernels/compiler_mandelbrot_alternate.cl b/kernels/compiler_mandelbrot_alternate.cl
new file mode 100644
index 0000000..ab6fb07
--- /dev/null
+++ b/kernels/compiler_mandelbrot_alternate.cl
@@ -0,0 +1,38 @@
+inline int offset(int x, int y, int width) { return width*y + x; }
+inline float mapX(float x) {return x*3.25f - 2.f;}
+inline float mapY(float y) {return y*2.5f - 1.25f;}
+
+__kernel void compiler_mandelbrot_alternate(__global uint *out,
+                                            float rcpWidth,
+                                            float rcpHeight,
+                                            float criterium)
+{
+  int xDim = get_global_id(0);
+  int yDim = get_global_id(1);
+  int width = get_global_size(0);
+  int height = get_global_size(1);
+  int idx = offset(xDim, yDim, width);
+
+  float xOrigin = mapX((float) xDim * rcpWidth);
+  float yOrigin = mapY((float) yDim * rcpHeight);
+  float x = 0.0f;
+  float y = 0.0f;
+
+  float iteration = 256.f;
+
+  bool breakCond = true;
+  while (breakCond) {
+    const float xtemp = mad(-y,y,mad(x,x,xOrigin));
+    y = mad(2.f*x, y, yOrigin);
+    x = xtemp;
+    iteration -= 1.f;
+    breakCond = -mad(y,y,mad(x,x, -criterium)) * iteration > 0.f;
+  }
+
+  const uint iIteration = 256 - (uint) iteration;
+  const uint isBlack = (iIteration == 256);
+  const uint black = 255 << 24;
+  const uint nonBlack = iIteration | (iIteration << 8) | (iIteration << 16) | (255 << 24);
+  out[idx] = select(nonBlack, black, isBlack);
+}
+
diff --git a/kernels/compiler_mandelbrot_alternate_ref.bmp b/kernels/compiler_mandelbrot_alternate_ref.bmp
new file mode 100644
index 0000000..011d583
Binary files /dev/null and b/kernels/compiler_mandelbrot_alternate_ref.bmp differ
diff --git a/kernels/compiler_mandelbrot_ref.bmp b/kernels/compiler_mandelbrot_ref.bmp
new file mode 100644
index 0000000..494bf8b
Binary files /dev/null and b/kernels/compiler_mandelbrot_ref.bmp differ
diff --git a/kernels/compiler_math.cl b/kernels/compiler_math.cl
new file mode 100644
index 0000000..695fc2c
--- /dev/null
+++ b/kernels/compiler_math.cl
@@ -0,0 +1,40 @@
+__kernel void compiler_math(__global float *dst, __global float *src) {
+  int i = get_global_id(0);
+  const float x = src[i];
+  switch (i) {
+    case 0: dst[i] = cos(x); break;
+    case 1: dst[i] = sin(x); break;
+    case 2: dst[i] = log2(x); break;
+    case 3: dst[i] = sqrt(x); break;
+    case 4: dst[i] = rsqrt(x); break;
+    case 5: dst[i] = native_recip(x); break;
+    case 6: dst[i] = tan(x); break;
+    case 7: dst[i] = cbrt(x); break;
+    case 8: dst[i] = ceil(x); break;
+    case 9: dst[i] = cospi(x); break;
+    case 10: dst[i] = exp2(x); break;
+    case 11: dst[i] = exp10(x); break;
+    case 12: dst[i] = expm1(x); break;
+    case 13: dst[i] = log1p(x); break;
+    case 14: dst[i] = logb(x); break;
+    case 15: dst[i] = sinpi(x); break;
+    case 16: dst[i] = tanpi(x); break;
+    case 17: dst[i] = rint(x); break;
+    case 18: dst[i] = sinh(x); break;
+    case 19: dst[i] = cosh(x); break;
+    case 20: dst[i] = tanh(x); break;
+    case 21: dst[i] = asinh(x); break;
+    case 22: dst[i] = acosh(x); break;
+    case 23: dst[i] = atanh(x); break;
+    case 24: dst[i] = asin(x); break;
+    case 25: dst[i] = acos(x); break;
+    case 26: dst[i] = atan(x); break;
+    case 27: dst[i] = asinpi(x); break;
+    case 28: dst[i] = acospi(x); break;
+    case 29: dst[i] = atanpi(x); break;
+    case 30: dst[i] = erf(x); break;
+    case 31: dst[i] = nan((uint)x); break;
+    default: dst[i] = 1.f; break;
+  };
+}
+
diff --git a/kernels/compiler_math_2op.cl b/kernels/compiler_math_2op.cl
new file mode 100644
index 0000000..6e970b8
--- /dev/null
+++ b/kernels/compiler_math_2op.cl
@@ -0,0 +1,19 @@
+kernel void compiler_math_2op(global float *dst, global float *src1, global float *src2) {
+  int i = get_global_id(0);
+  const float x = src1[i], y = src2[i];
+  float z;
+  switch (i) {
+    case 0: dst[i] = native_divide(x, y); break;
+    case 1: dst[i] = fdim(x, y); break;
+    case 2: dst[i] = fract(x, &z); break;
+    case 3: dst[i] = hypot(x, y); break;
+    case 4: dst[i] = ldexp(x, y); break;
+    case 5: dst[i] = pown(x, (int)y); break;
+    case 6: dst[i] = remainder(x, y); break;
+    case 7: dst[i] = rootn(x, (int)(y+1)); break;
+    case 8: dst[i] = copysign(x, y); break;
+    case 9: dst[i] = maxmag(x, y); break;
+    case 10: dst[i] = minmag(x, y); break;
+    default: dst[i] = 1.f; break;
+  };
+}
diff --git a/kernels/compiler_math_3op.cl b/kernels/compiler_math_3op.cl
new file mode 100644
index 0000000..95b0398
--- /dev/null
+++ b/kernels/compiler_math_3op.cl
@@ -0,0 +1,9 @@
+kernel void compiler_math_3op(global float *dst, global float *src1, global float *src2, global float *src3) {
+  int i = get_global_id(0);
+  const float x = src1[i], y = src2[i], z = src3[i];
+  switch (i) {
+    case 0: dst[i] = mad(x, y, z); break;
+    case 1: dst[i] = fma(x, y, z); break;
+    default: dst[i] = 1.f; break;
+  };
+}
diff --git a/kernels/compiler_math_builtin.cl b/kernels/compiler_math_builtin.cl
new file mode 100644
index 0000000..d5c8392
--- /dev/null
+++ b/kernels/compiler_math_builtin.cl
@@ -0,0 +1,82 @@
+/* OpenCL 1.1 Math Built-in Functions (section 6.11.2) */
+__kernel void
+compiler_array0(__global float *src, __global float *dst)
+{
+  int p = get_global_id(0);
+  dst[p] = acos(src[p]);
+  dst[p+1] = acosh(src[p]);
+  dst[p+2] = acospi(src[p]);
+  dst[p+3] = asin(src[p]);
+  dst[p+4] = asinh(src[p]);
+  dst[p+5] = asinpi(src[p]);
+  dst[p+6] = atan(src[p]);
+  dst[p+7] = atan2(src[p], src[p+1]);
+  dst[p+8] = atanh(src[p]);
+  dst[p+9] = atanpi(src[p]);
+  dst[p+10] = atan2pi(src[p], src[p+1]);
+  dst[p+11] = cbrt(src[p]);
+  dst[p+12] = ceil(src[p]);
+  dst[p+13] = copysign(src[p], src[p+1]);
+  dst[p+14] = cos(src[p]);
+  dst[p+15] = cosh(src[p]);
+  dst[p+16] = cospi(src[p]);
+  dst[p+17] = half_divide(src[p], src[p+1]);
+  dst[p+18] = native_divide(src[p], src[p+1]);
+  dst[p+19] = erfc(src[p]);
+  dst[p+20] = erf(src[p]);
+  dst[p+21] = exp(src[p]);
+  dst[p+22] = exp2(src[p]);
+  dst[p+23] = exp10(src[p]);
+  dst[p+24] = expm1(src[p]);
+  dst[p+25] = fabs(src[p]);
+  dst[p+26] = fdim(src[p], src[p+1]);
+  dst[p+27] = floor(src[p]);
+  dst[p+28] = fma(src[p], src[p+1], src[p+2]);
+  dst[p+29] = fmax(src[p], src[p+1]);
+  dst[p+30] = fmin(src[p]);
+  dst[p+31] = fmod(src[p], src[p+1]);
+  __local float iptr[4];
+  dst[p+32] = fract(src[p], iptr);
+  __private int exps[4];
+  dst[p+33] = frexp(src[p], exps);
+  dst[p+34] = hypot(src[p], src[p+1]);
+  dst[p+35] = (float)ilogb(src[p]);
+  dst[p+36] = ldexp(src[p], 10);
+  dst[p+37] = lgamma(src[p]);
+  __local int signp[4];
+  dst[p+38] = lgamma_r(src[p], signp);
+  dst[p+39] = log(src[p]);
+  dst[p+40] = log2(src[p]);
+  dst[p+41] = log10(src[p]);
+  dst[p+42] = log1p(src[p]);
+  dst[p+43] = logb(src[p]);
+  dst[p+44] = mad(src[p], src[p+1], src[p+2]);
+  dst[p+45] = maxmag(src[p], src[p+1]);
+  dst[p+46] = minmag(src[p], src[p+1]);
+  dst[p+47] = modf(src[p], iptr);
+  dst[p+48] = nan((ulong)src[p]);
+  dst[p+49] = nextafter(src[p], src[p+1]);
+  dst[p+50] = pow(src[p], src[p+1]);
+  dst[p+51] = pown(src[p], (int)src[p+1]);
+  dst[p+52] = powr(src[p], src[p+1]);
+  dst[p+53] = half_recip((half)src[p]) + native_recip(src[p]);
+  dst[p+54] = remainder(src[p], src[p+1]);
+  __private int quo[4];
+  dst[p+55] = remquo(src[p], quo);
+  dst[p+56] = rint(src[p]);
+  dst[p+57] = rootn(src[p], 10);
+  dst[p+58] = round(src[p]);
+  dst[p+59] = rsqrt(src[p]);
+  dst[p+60] = sin(src[p]);
+  __local float cosval;
+  dst[p+61] = sincos(src[p], &cosval);
+  dst[p+62] = sinh(src[p]);
+  dst[p+63] = sinpi(src[p]);
+  dst[p+64] = sqrt(src[p]);
+  dst[p+65] = tan(src[p]);
+  dst[p+66] = tanh(src[p]);
+  dst[p+67] = tanpi(src[p]);
+  dst[p+68] = tgamma(src[p]);
+  dst[p+69] = trunc(src[p]);
+}
+
diff --git a/kernels/compiler_math_constants.cl b/kernels/compiler_math_constants.cl
new file mode 100644
index 0000000..4979cf2
--- /dev/null
+++ b/kernels/compiler_math_constants.cl
@@ -0,0 +1,23 @@
+/* test case for OpenCL 1.1 Math Constants (section 6.11.2) */
+__kernel void compiler_math_constants()
+{
+  float f;
+  f = MAXFLOAT;
+  f = HUGE_VALF;
+  f = HUGE_VAL;
+  f = INFINITY;
+  f = NAN;
+  f = M_E_F;
+  f = M_LOG2E_F;
+  f = M_LOG10E_F;
+  f = M_LN2_F;
+  f = M_LN10_F;
+  f = M_PI_F;
+  f = M_PI_2_F;
+  f = M_PI_4_F;
+  f = M_1_PI_F;
+  f = M_2_PI_F;
+  f = M_2_SQRTPI_F;
+  f = M_SQRT2_F;
+  f = M_SQRT1_2_F;
+}
diff --git a/kernels/compiler_mem_fence.cl b/kernels/compiler_mem_fence.cl
new file mode 100644
index 0000000..c17985e
--- /dev/null
+++ b/kernels/compiler_mem_fence.cl
@@ -0,0 +1,10 @@
+kernel void compiler_mem_fence() {
+  barrier(CLK_LOCAL_MEM_FENCE);
+  barrier(CLK_GLOBAL_MEM_FENCE);
+  mem_fence(CLK_LOCAL_MEM_FENCE);
+  mem_fence(CLK_GLOBAL_MEM_FENCE);
+  read_mem_fence(CLK_LOCAL_MEM_FENCE);
+  read_mem_fence(CLK_GLOBAL_MEM_FENCE);
+  write_mem_fence(CLK_LOCAL_MEM_FENCE);
+  write_mem_fence(CLK_GLOBAL_MEM_FENCE);
+}
diff --git a/kernels/compiler_mixed_pointer.cl b/kernels/compiler_mixed_pointer.cl
new file mode 100644
index 0000000..78c5783
--- /dev/null
+++ b/kernels/compiler_mixed_pointer.cl
@@ -0,0 +1,23 @@
+
+kernel void compiler_mixed_pointer(__global uint* src1, __global uint *src2, __global uint *dst) {
+  int x = get_global_id(0);
+  global uint * tmp = NULL;
+
+  switch(x) {
+    case 0:
+    case 1:
+    case 4:
+      tmp = src1;
+      break;
+    default:
+      tmp = src2;
+      break;
+  }
+  dst[x] = tmp[x];
+}
+
+kernel void compiler_mixed_pointer1(__global uint* src, __global uint *dst1, __global uint *dst2) {
+  int x = get_global_id(0);
+  global uint * tmp = x < 5 ? dst1 : dst2;
+  tmp[x] = src[x];
+}
diff --git a/kernels/compiler_mul24.cl b/kernels/compiler_mul24.cl
new file mode 100644
index 0000000..b69dda0
--- /dev/null
+++ b/kernels/compiler_mul24.cl
@@ -0,0 +1,4 @@
+kernel void compiler_mul24(global int *src1, global int *src2, global int *dst) {
+  int i = get_global_id(0);
+  dst[i] = mul24(src1[i], src2[i]);
+}
diff --git a/kernels/compiler_mul_hi.cl b/kernels/compiler_mul_hi.cl
new file mode 100644
index 0000000..28ce0a5
--- /dev/null
+++ b/kernels/compiler_mul_hi.cl
@@ -0,0 +1,4 @@
+kernel void compiler_mul_hi(global int *src1, global int *src2, global int *dst) {
+  int i = get_global_id(0);
+  dst[i] = mul_hi(src1[i], src2[i]);
+}
diff --git a/kernels/compiler_multiple_kernels.cl b/kernels/compiler_multiple_kernels.cl
new file mode 100644
index 0000000..d5cea68
--- /dev/null
+++ b/kernels/compiler_multiple_kernels.cl
@@ -0,0 +1,7 @@
+__kernel void first_kernel(void)
+{
+}
+
+__kernel void second_kernel(void)
+{
+}
\ No newline at end of file
diff --git a/kernels/compiler_obread.cl b/kernels/compiler_obread.cl
new file mode 100644
index 0000000..14658d9
--- /dev/null
+++ b/kernels/compiler_obread.cl
@@ -0,0 +1,8 @@
+__kernel void
+compiler_obread(__global uint *src, __global uint *dst)
+{
+  int id = (int)get_global_id(0);
+  const int to =  __gen_ocl_obread(src+id);
+  dst[id] = to;
+}
+
diff --git a/kernels/compiler_obwrite.cl b/kernels/compiler_obwrite.cl
new file mode 100644
index 0000000..50e55a1
--- /dev/null
+++ b/kernels/compiler_obwrite.cl
@@ -0,0 +1,8 @@
+__kernel void
+compiler_obwrite(__global uint *src, __global uint *dst)
+{
+  int id = (int)get_global_id(0);
+  const int to =  src[id];
+  __gen_ocl_obwrite(dst+id,to);
+}
+
diff --git a/kernels/compiler_preprocessor_macros.cl b/kernels/compiler_preprocessor_macros.cl
new file mode 100644
index 0000000..0f23b3f
--- /dev/null
+++ b/kernels/compiler_preprocessor_macros.cl
@@ -0,0 +1,13 @@
+/* test case for OpenCL 1.1 Preprocessor Directives & Macros (section 6.9) */
+__kernel_exec(1, float4) void compiler_preprocessor_macros()
+{
+#pragma OPENCL FP_CONTRACT ON
+#pragma OPENCL FP_CONTRACT OFF
+#pragma OPENCL FP_CONTRACT DEFAULT
+  int i = __OPENCL_VERSION__;
+  i = __CL_VERSION_1_0__;
+  i = __CL_VERSION_1_1__;
+  i = __ENDIAN_LITTLE__;
+  i = __IMAGE_SUPPORT__;
+  i = __FAST_RELAXED_MATH__;
+}
diff --git a/kernels/compiler_private_data_overflow.cl b/kernels/compiler_private_data_overflow.cl
new file mode 100644
index 0000000..d0f557d
--- /dev/null
+++ b/kernels/compiler_private_data_overflow.cl
@@ -0,0 +1,10 @@
+kernel void compiler_private_data_overflow( __global int4 *output )
+{
+	int4 data[65];
+	for( int i=0; i<65; ++i )
+	{
+		data[i] = (int4)i;
+	}
+	if( get_global_id(0) == 1 )
+		*output = data[0];
+}
diff --git a/kernels/compiler_radians.cl b/kernels/compiler_radians.cl
new file mode 100644
index 0000000..1f79481
--- /dev/null
+++ b/kernels/compiler_radians.cl
@@ -0,0 +1,4 @@
+kernel void compiler_radians(global float *src, global float *dst) {
+  int i = get_global_id(0);
+  dst[i] = radians(src[i]);
+}
diff --git a/kernels/compiler_region.cl b/kernels/compiler_region.cl
new file mode 100644
index 0000000..d74ac7d
--- /dev/null
+++ b/kernels/compiler_region.cl
@@ -0,0 +1,10 @@
+__kernel void
+compiler_region(__global uint *src, __global uint *dst)
+{
+  __gen_ocl_force_simd16();
+  int id = (int)get_global_id(0);
+  const int x0 = src[id];
+  const int x1 = src[id+16];
+  dst[id] = __gen_ocl_region(0, 16, 8, 2, x0, x1);
+}
+
diff --git a/kernels/compiler_region0.cl b/kernels/compiler_region0.cl
new file mode 100644
index 0000000..5bd57c0
--- /dev/null
+++ b/kernels/compiler_region0.cl
@@ -0,0 +1,11 @@
+__kernel void
+compiler_region0(__global uint *src, __global uint *dst)
+{
+  __gen_ocl_force_simd16();
+  int id = (int)get_global_id(0);
+  const int x0 = src[id];
+  const int x1 = src[id+16];
+  const int x2 = src[id+32];
+  dst[id] = __gen_ocl_region(1, 16, 8, 2, x0, x1, x2);
+}
+
diff --git a/kernels/compiler_region1.cl b/kernels/compiler_region1.cl
new file mode 100644
index 0000000..9deb63c
--- /dev/null
+++ b/kernels/compiler_region1.cl
@@ -0,0 +1,9 @@
+__kernel void
+compiler_region1(__global uint *src, __global uint *dst)
+{
+  __gen_ocl_force_simd16();
+  int id = (int)get_global_id(0);
+  const int x0 = src[id];
+  dst[id] = __gen_ocl_region(0, 16, 8, 2, x0);
+}
+
diff --git a/kernels/compiler_relational_builtin.cl b/kernels/compiler_relational_builtin.cl
new file mode 100644
index 0000000..8b195ca
--- /dev/null
+++ b/kernels/compiler_relational_builtin.cl
@@ -0,0 +1,24 @@
+/* test OpenCL 1.1 Relational Built-in Functions (section 6.11.6) */
+kernel void compiler_relational_builtin() {
+  float x = 1, y = 2, z = 3;
+  int i;
+  i = isequal(x, y);
+  i = isnotequal(x, y);
+  i = isgreater(x, y);
+  i = isgreaterequal(x, y);
+  i = isless(x, y);
+  i = islessequal(x, y);
+  i = islessgreater(x, y);
+  i = isfinite(x);
+  i = isinf(x);
+  i = isnan(x);
+  i = isnormal(x);
+  i = isordered(x, y);
+  i = isunordered(x, y);
+  i = signbit(x);
+  long l = 12;
+  i = any(l);
+  i = all(l);
+  bitselect(x, y, z);
+  select(x, y, z);
+}
diff --git a/kernels/compiler_rhadd.cl b/kernels/compiler_rhadd.cl
new file mode 100644
index 0000000..4024ace
--- /dev/null
+++ b/kernels/compiler_rhadd.cl
@@ -0,0 +1,4 @@
+kernel void compiler_rhadd(global int *src1, global int *src2, global int *dst) {
+  int i = get_global_id(0);
+  dst[i] = rhadd(src1[i], src2[i]);
+}
diff --git a/kernels/compiler_rotate.cl b/kernels/compiler_rotate.cl
new file mode 100644
index 0000000..8d0dd0f
--- /dev/null
+++ b/kernels/compiler_rotate.cl
@@ -0,0 +1,5 @@
+kernel void compiler_rotate(global int *src, global int *dst, global int *y) {
+  int i = get_global_id(0);
+  dst[i] = rotate(src[i], y[i]);
+}
+
diff --git a/kernels/compiler_sampler.cl b/kernels/compiler_sampler.cl
new file mode 100644
index 0000000..149bbf1
--- /dev/null
+++ b/kernels/compiler_sampler.cl
@@ -0,0 +1,25 @@
+/* test OpenCL 1.1 sampler declaration */
+__kernel void compiler_sampler () {
+#define S(A,B,C) CLK_NORMALIZED_COORDS_##A | CLK_ADDRESS_##B | CLK_FILTER_##C
+  const sampler_t \
+    s0 = S(TRUE,REPEAT,NEAREST),
+    s1 = S(TRUE,REPEAT,LINEAR),
+    s2 = S(TRUE,CLAMP,NEAREST),
+    s3 = S(TRUE,CLAMP,LINEAR),
+    s4 = S(TRUE,NONE,NEAREST),
+    s5 = S(TRUE,NONE,LINEAR),
+    s6 = S(TRUE,CLAMP_TO_EDGE,NEAREST),
+    s7 = S(TRUE,CLAMP_TO_EDGE,LINEAR),
+    s8 = S(TRUE,MIRRORED_REPEAT,NEAREST),
+    s9 = S(TRUE,MIRRORED_REPEAT,LINEAR),
+    s10 = S(FALSE,REPEAT,NEAREST),
+    s11 = S(FALSE,REPEAT,LINEAR),
+    s12 = S(FALSE,CLAMP,NEAREST),
+    s13 = S(FALSE,CLAMP,LINEAR),
+    s14 = S(FALSE,NONE,NEAREST),
+    s15 = S(FALSE,NONE,LINEAR),
+    s16 = S(FALSE,CLAMP_TO_EDGE,NEAREST),
+    s17 = S(FALSE,CLAMP_TO_EDGE,LINEAR),
+    s18 = S(FALSE,MIRRORED_REPEAT,NEAREST),
+    s19 = S(FALSE,MIRRORED_REPEAT,LINEAR);
+}
diff --git a/kernels/compiler_saturate.cl b/kernels/compiler_saturate.cl
new file mode 100644
index 0000000..e9ffc4b
--- /dev/null
+++ b/kernels/compiler_saturate.cl
@@ -0,0 +1,16 @@
+#define TEST_TYPE(TYPE)                                                           \
+__kernel void test_##TYPE(__global TYPE *C, __global TYPE *A, __global TYPE *B) { \
+  int id = get_global_id(0);                                                      \
+  C[id] = add_sat(A[id], B[id]);                                                  \
+}
+
+TEST_TYPE(char)
+TEST_TYPE(uchar)
+TEST_TYPE(short)
+TEST_TYPE(ushort)
+TEST_TYPE(int)
+TEST_TYPE(uint)
+//TEST_TYPE(long)
+//TEST_TYPE(ulong)
+
+#undef TEST_TYPE
diff --git a/kernels/compiler_saturate_sub.cl b/kernels/compiler_saturate_sub.cl
new file mode 100644
index 0000000..e20a76f
--- /dev/null
+++ b/kernels/compiler_saturate_sub.cl
@@ -0,0 +1,16 @@
+#define TEST_TYPE(TYPE)                                                           \
+__kernel void test_##TYPE(__global TYPE *C, __global TYPE *A, __global TYPE *B) { \
+  int id = get_global_id(0);                                                      \
+  C[id] = sub_sat(A[id], B[id]);                                                  \
+}
+
+TEST_TYPE(char)
+TEST_TYPE(uchar)
+TEST_TYPE(short)
+TEST_TYPE(ushort)
+TEST_TYPE(int)
+TEST_TYPE(uint)
+//TEST_TYPE(long)
+//TEST_TYPE(ulong)
+
+#undef TEST_TYPE
diff --git a/kernels/compiler_shift_right.cl b/kernels/compiler_shift_right.cl
new file mode 100644
index 0000000..c109170
--- /dev/null
+++ b/kernels/compiler_shift_right.cl
@@ -0,0 +1,4 @@
+kernel void compiler_shift_right(global uint *src, global int *dst) {
+    int i = get_global_id(0);
+    dst[i] = src[i] >> 24;
+}
diff --git a/kernels/compiler_short_scatter.cl b/kernels/compiler_short_scatter.cl
new file mode 100644
index 0000000..7dad029
--- /dev/null
+++ b/kernels/compiler_short_scatter.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_short_scatter(__global short *dst)
+{
+  int id = (int) get_global_id(0);
+  dst[id] = (short) id;
+}
+
diff --git a/kernels/compiler_simd_all.cl b/kernels/compiler_simd_all.cl
new file mode 100644
index 0000000..504710b
--- /dev/null
+++ b/kernels/compiler_simd_all.cl
@@ -0,0 +1,12 @@
+__kernel void compiler_simd_all(global int *src, global int *dst)
+{
+  int i = get_global_id(0);
+  if (i % 2 == 1) {
+    if (__gen_ocl_simd_all((src[i] < 12) && (src[i] > 0)))
+      dst[i] = 1;
+    else
+      dst[i] = 2;
+  }
+  else
+    dst[i] = 3;
+}
diff --git a/kernels/compiler_simd_any.cl b/kernels/compiler_simd_any.cl
new file mode 100644
index 0000000..3b04f82
--- /dev/null
+++ b/kernels/compiler_simd_any.cl
@@ -0,0 +1,15 @@
+__kernel void compiler_simd_any(global int *src, global int *dst)
+{
+  int i = get_global_id(0);
+
+  if (i % 2 == 1) {
+    if (__gen_ocl_simd_any(src[i] == 5) || __gen_ocl_simd_any(src[i] == 9))
+      dst[i] = 1;
+    else if (__gen_ocl_simd_any(src[i] == 6))
+      dst[i] = 0;
+    else
+      dst[i] = 2;
+  }
+  else
+    dst[i] = 3;
+}
diff --git a/kernels/compiler_smoothstep.cl b/kernels/compiler_smoothstep.cl
new file mode 100644
index 0000000..d3b7da4
--- /dev/null
+++ b/kernels/compiler_smoothstep.cl
@@ -0,0 +1,4 @@
+kernel void compiler_smoothstep(global float *src1, global float *src2, global float *src3, global float *dst) {
+  int i = get_global_id(0);
+  dst[i] = smoothstep(src1[i], src2[i], src3[i]);
+}
diff --git a/kernels/compiler_step.cl b/kernels/compiler_step.cl
new file mode 100644
index 0000000..ef77f05
--- /dev/null
+++ b/kernels/compiler_step.cl
@@ -0,0 +1,38 @@
+#define COMPILER_STEP_FUNC_N(TYPE, N) \
+    kernel void compiler_step_##TYPE##N ( \
+           global TYPE##N* edge, global TYPE##N* x, global TYPE##N* dst) { \
+        int i = get_global_id(0); \
+        dst[i] = step(edge[i], x[i]);     \
+    }
+
+kernel void compiler_step_float (global float* edge,
+                                 global float* x, global float* dst)
+{
+    int i = get_global_id(0);
+    dst[i] = step(edge[i], x[i]);
+}
+
+COMPILER_STEP_FUNC_N(float, 2)
+COMPILER_STEP_FUNC_N(float, 3)
+COMPILER_STEP_FUNC_N(float, 4)
+COMPILER_STEP_FUNC_N(float, 8)
+COMPILER_STEP_FUNC_N(float, 16)
+
+#define COMPILER_STEPF_FUNC_N(TYPE, N) \
+    kernel void compiler_stepf_##TYPE##N ( \
+           float edge, global TYPE##N* x, global TYPE##N* dst) { \
+        int i = get_global_id(0); \
+        dst[i] = step(edge, x[i]);     \
+    }
+
+kernel void compiler_stepf_float (float edge, global float* x, global float* dst)
+{
+    int i = get_global_id(0);
+    dst[i] = step(edge, x[i]);
+}
+
+COMPILER_STEPF_FUNC_N(float, 2)
+COMPILER_STEPF_FUNC_N(float, 3)
+COMPILER_STEPF_FUNC_N(float, 4)
+COMPILER_STEPF_FUNC_N(float, 8)
+COMPILER_STEPF_FUNC_N(float, 16)
diff --git a/kernels/compiler_structure_attributes.cl b/kernels/compiler_structure_attributes.cl
new file mode 100644
index 0000000..a07dd88
--- /dev/null
+++ b/kernels/compiler_structure_attributes.cl
@@ -0,0 +1,17 @@
+#define X(x, y) x ## y
+#define NAME(x, y) X(x, y)
+#define S struct NAME(s, __LINE__) { \
+  char c;  \
+  int i;   \
+  float f; \
+}
+
+S __attribute__((aligned(16)));
+S __attribute__((aligned));
+S __attribute__((packed));
+S __attribute__((endian(host)));
+S __attribute__((endian(device)));
+S __attribute__((endian));
+
+__kernel void compiler_structure_attributes() {
+}
diff --git a/kernels/compiler_switch.cl b/kernels/compiler_switch.cl
new file mode 100644
index 0000000..c28b431
--- /dev/null
+++ b/kernels/compiler_switch.cl
@@ -0,0 +1,14 @@
+__kernel void compiler_switch(__global int *dst, __global int *src)
+{
+  switch (get_global_id(0)) {
+    case 0: dst[get_global_id(0)] = src[get_global_id(0) + 4]; break;
+    case 1: dst[get_global_id(0)] = src[get_global_id(0) + 14]; break;
+    case 2: dst[get_global_id(0)] = src[get_global_id(0) + 13]; break;
+    case 6: dst[get_global_id(0)] = src[get_global_id(0) + 11]; break;
+    case 7: dst[get_global_id(0)] = src[get_global_id(0) + 10]; break;
+    case 10: dst[get_global_id(0)] = src[get_global_id(0) + 9]; break;
+    case 12: dst[get_global_id(0)] = src[get_global_id(0) + 6]; break;
+    default: dst[get_global_id(0)] = src[get_global_id(0) + 8]; break;
+  }
+}
+
diff --git a/kernels/compiler_type_casting.cl b/kernels/compiler_type_casting.cl
new file mode 100644
index 0000000..3cdb925
--- /dev/null
+++ b/kernels/compiler_type_casting.cl
@@ -0,0 +1,19 @@
+/* test OpenCL 1.1 Conversions & Type Casting Examples (section 6.2) */
+__kernel void compiler_type_casting() {
+  float f = 1.23456789f;
+  float g;
+
+  g = (float)f;
+  g = convert_float(f);
+  g = as_float(f);
+  
+  g = convert_float_rte(f);
+  g = convert_float_rtz(f);
+  g = convert_float_rtp(f);
+  g = convert_float_rtn(f);
+
+  g = convert_float_sat_rte(f);
+  g = convert_float_sat_rtz(f);
+  g = convert_float_sat_rtp(f);
+  g = convert_float_sat_rtn(f);
+}
diff --git a/kernels/compiler_uint16_copy.cl b/kernels/compiler_uint16_copy.cl
new file mode 100644
index 0000000..1072234
--- /dev/null
+++ b/kernels/compiler_uint16_copy.cl
@@ -0,0 +1,8 @@
+__kernel void
+compiler_uint16_copy(__global uint16 *src, __global uint16 *dst)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = src[id];
+}
+
+
diff --git a/kernels/compiler_uint2_copy.cl b/kernels/compiler_uint2_copy.cl
new file mode 100644
index 0000000..7c5c5e3
--- /dev/null
+++ b/kernels/compiler_uint2_copy.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_uint2_copy(__global uint2 *src, __global uint2 *dst)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = src[id];
+}
+
diff --git a/kernels/compiler_uint3_copy.cl b/kernels/compiler_uint3_copy.cl
new file mode 100644
index 0000000..7dc71b2
--- /dev/null
+++ b/kernels/compiler_uint3_copy.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_uint3_copy(__global uint3 *src, __global uint3 *dst)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = src[id];
+}
+
diff --git a/kernels/compiler_uint3_unaligned_copy.cl b/kernels/compiler_uint3_unaligned_copy.cl
new file mode 100644
index 0000000..a50f0ab
--- /dev/null
+++ b/kernels/compiler_uint3_unaligned_copy.cl
@@ -0,0 +1,8 @@
+__kernel void
+compiler_uint3_unaligned_copy(__global uint *src, __global uint *dst)
+{
+  const int id = (int)get_global_id(0);
+  const uint3 from = vload3(id, src);
+  vstore3(from, id, dst);
+}
+
diff --git a/kernels/compiler_uint8_copy.cl b/kernels/compiler_uint8_copy.cl
new file mode 100644
index 0000000..9eee538
--- /dev/null
+++ b/kernels/compiler_uint8_copy.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_uint8_copy(__global uint8 *src, __global uint8 *dst)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = src[id];
+}
+
diff --git a/kernels/compiler_unstructured_branch0.cl b/kernels/compiler_unstructured_branch0.cl
new file mode 100644
index 0000000..66da6e0
--- /dev/null
+++ b/kernels/compiler_unstructured_branch0.cl
@@ -0,0 +1,14 @@
+__kernel void
+compiler_unstructured_branch0(__global int *src, __global int *dst)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = src[id];
+  if (dst[id] >= 0) goto label;
+
+  do {
+    dst[id] = 1;
+  label:
+    id += get_local_size(0);
+  } while (id < 32);
+}
+
diff --git a/kernels/compiler_unstructured_branch1.cl b/kernels/compiler_unstructured_branch1.cl
new file mode 100644
index 0000000..fb937e0
--- /dev/null
+++ b/kernels/compiler_unstructured_branch1.cl
@@ -0,0 +1,14 @@
+__kernel void
+compiler_unstructured_branch1(__global int *src, __global int *dst)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = src[id];
+  if (dst[id] >= 0) goto label1;
+  dst[id] = 1;
+  if (src[id] <= 2) goto label2;
+  label1:
+  dst[id] -= 2;
+  label2:
+  dst[id] += 2;
+}
+
diff --git a/kernels/compiler_unstructured_branch2.cl b/kernels/compiler_unstructured_branch2.cl
new file mode 100644
index 0000000..546f253
--- /dev/null
+++ b/kernels/compiler_unstructured_branch2.cl
@@ -0,0 +1,18 @@
+__kernel void
+compiler_unstructured_branch2(__global int *src, __global int *dst)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = src[id];
+  if (dst[id] < 0) goto label1;
+  dst[id] = 1;
+  if (dst[id] > src[id]) goto label3;
+  dst[id]++;
+  if (src[id] <= 2) goto label2;
+  label1:
+  dst[id] -= 2;
+  label2:
+  dst[id] += 2;
+  label3:
+  dst[id] *= 3;
+}
+
diff --git a/kernels/compiler_unstructured_branch3.cl b/kernels/compiler_unstructured_branch3.cl
new file mode 100644
index 0000000..67b4761
--- /dev/null
+++ b/kernels/compiler_unstructured_branch3.cl
@@ -0,0 +1,16 @@
+__kernel void
+compiler_unstructured_branch3(__global int *src, __global int *dst)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = src[id];
+  if (dst[id] >= 2) goto label1;
+  dst[id] = 1;
+  if (src[id] < 2) goto label2;
+  dst[id]--;
+  label1:
+  dst[id] -= 2;
+  label2:
+  dst[id] += 2;
+}
+
+
diff --git a/kernels/compiler_upsample_int.cl b/kernels/compiler_upsample_int.cl
new file mode 100644
index 0000000..d7945b5
--- /dev/null
+++ b/kernels/compiler_upsample_int.cl
@@ -0,0 +1,4 @@
+kernel void compiler_upsample_int(global short *src1, global ushort *src2, global int *dst) {
+  int i = get_global_id(0);
+  dst[i] = upsample(src1[i], src2[i]);
+}
diff --git a/kernels/compiler_upsample_long.cl b/kernels/compiler_upsample_long.cl
new file mode 100644
index 0000000..8f914e4
--- /dev/null
+++ b/kernels/compiler_upsample_long.cl
@@ -0,0 +1,4 @@
+kernel void compiler_upsample_long(global int *src1, global uint *src2, global long *dst) {
+  int i = get_global_id(0);
+  dst[i] = upsample(src1[i], src2[i]);
+}
diff --git a/kernels/compiler_vect_compare.cl b/kernels/compiler_vect_compare.cl
new file mode 100644
index 0000000..ae43ec6
--- /dev/null
+++ b/kernels/compiler_vect_compare.cl
@@ -0,0 +1,7 @@
+__kernel void
+compiler_vect_compare(__global int4 *src, __global int4 *dst)
+{
+  int4 test = (int4)(0,0,0,0);
+
+  dst[get_global_id(0)] = test < src[get_global_id(0)];
+}
diff --git a/kernels/compiler_vector_inc.cl b/kernels/compiler_vector_inc.cl
new file mode 100644
index 0000000..548dcb4
--- /dev/null
+++ b/kernels/compiler_vector_inc.cl
@@ -0,0 +1,13 @@
+kernel void compiler_vector_inc(global char *dst, global char *src) {
+    size_t i = get_global_id(0);
+    char2 dst2 = vload2(i, dst);
+    if (src[i] == 0)
+      dst2++;
+    else if(src[i] == 1)
+      ++dst2;
+    else if(src[i] == 2)
+      dst2--;
+    else
+      --dst2;
+    vstore2(dst2, i, dst);
+}
diff --git a/kernels/compiler_vector_load_store.cl b/kernels/compiler_vector_load_store.cl
new file mode 100644
index 0000000..aec38b1
--- /dev/null
+++ b/kernels/compiler_vector_load_store.cl
@@ -0,0 +1,40 @@
+/* test OpenCL 1.1 Vector Data Load/Store Functions (section 6.11.7) */
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#define OFFSET2(type)  (type ##2) {(type)1, (type)2}
+#define OFFSET3(type)  (type ##3) {(type)1, (type)2, (type)3}
+#define OFFSET4(type)  (type ##4) {(type)1, (type)2, (type)3, (type)4}
+#define OFFSET8(type)  (type ##8) {(type)1, (type)2, (type)3, (type)4, (type)5, (type)6, (type)7, (type)8}
+#define OFFSET16(type)  (type ##16)  {(type)1, (type)2, (type)3, (type)4, (type)5, (type)6, (type)7, (type)8, (type)9, (type)10, (type)11, (type)12, (type)13, (type)14, (type)15, (type)16}
+
+#define  TEST_TYPE(type, n) \
+__kernel void test_##type ##n(__global type *pin, \
+                            __global type *pout)  \
+{\
+  int x = get_global_id(0); \
+  type ##n value; \
+  value = vload ##n(x, pin); \
+  value += OFFSET ##n(type); \
+  vstore ##n(value, x, pout); \
+}
+
+#define TEST_ALL_TYPE(n) \
+  TEST_TYPE(char,n)  \
+  TEST_TYPE(uchar,n) \
+  TEST_TYPE(short,n) \
+  TEST_TYPE(ushort,n)\
+  TEST_TYPE(int,n)   \
+  TEST_TYPE(uint,n)  \
+  TEST_TYPE(float,n) \
+  TEST_TYPE(long,n)  \
+  TEST_TYPE(ulong,n)
+//  TEST_TYPE(double,n)
+
+#if 0
+  TEST_TYPE(half,n)
+#endif
+
+TEST_ALL_TYPE(2)
+TEST_ALL_TYPE(3)
+TEST_ALL_TYPE(4)
+TEST_ALL_TYPE(8)
+TEST_ALL_TYPE(16)
diff --git a/kernels/compiler_volatile.cl b/kernels/compiler_volatile.cl
new file mode 100644
index 0000000..84f7228
--- /dev/null
+++ b/kernels/compiler_volatile.cl
@@ -0,0 +1,4 @@
+__kernel void compiler_volatile(__global int *dst, __local volatile int *hop) {
+  hop[get_global_id(0)] = get_local_id(1);
+  dst[get_global_id(0)] = hop[get_local_id(0)];
+}
diff --git a/kernels/compiler_vote_all.cl b/kernels/compiler_vote_all.cl
new file mode 100644
index 0000000..1918c1c
--- /dev/null
+++ b/kernels/compiler_vote_all.cl
@@ -0,0 +1,10 @@
+__kernel void
+compiler_vote_all(__global uint *src, __global uint *dst)
+{
+  int id = (int)get_global_id(0);
+  if (__gen_ocl_all(id > 8))
+    dst[id] = src[id];
+  else
+    dst[id] = 0;
+}
+
diff --git a/kernels/compiler_vote_any.cl b/kernels/compiler_vote_any.cl
new file mode 100644
index 0000000..0a81e89
--- /dev/null
+++ b/kernels/compiler_vote_any.cl
@@ -0,0 +1,10 @@
+__kernel void
+compiler_vote_any(__global uint *src, __global uint *dst)
+{
+  int id = (int)get_global_id(0);
+  if (__gen_ocl_any(id > 6))
+    dst[id] = src[id];
+  else
+    dst[id] = 0;
+}
+
diff --git a/kernels/compiler_workitem_builtin.cl b/kernels/compiler_workitem_builtin.cl
new file mode 100644
index 0000000..b01dd7d
--- /dev/null
+++ b/kernels/compiler_workitem_builtin.cl
@@ -0,0 +1,12 @@
+/* test case for OpenCL 1.1 work-item built-in functions */
+__kernel void compiler_workitem_builtin()
+{
+  uint x = get_work_dim();
+  size_t y = get_global_size(0);
+  y = get_global_id(0);
+  y = get_local_size(0);
+  y = get_local_id(0);
+  y = get_num_groups(0);
+  y = get_group_id(0);
+  y = get_global_offset(0);
+}
diff --git a/kernels/compiler_write_only_bytes.cl b/kernels/compiler_write_only_bytes.cl
new file mode 100644
index 0000000..555a9dc
--- /dev/null
+++ b/kernels/compiler_write_only_bytes.cl
@@ -0,0 +1,6 @@
+__kernel void
+compiler_write_only_bytes(__global char *dst)
+{
+    int id = (int)get_global_id(0);
+    dst[id] = 2;
+}
diff --git a/kernels/compiler_write_only_shorts.cl b/kernels/compiler_write_only_shorts.cl
new file mode 100644
index 0000000..205634d
--- /dev/null
+++ b/kernels/compiler_write_only_shorts.cl
@@ -0,0 +1,6 @@
+__kernel void
+compiler_write_only_shorts(__global short *dst)
+{
+    int id = (int)get_global_id(0);
+    dst[id] = 2;
+}
diff --git a/kernels/double_precision_check.cl b/kernels/double_precision_check.cl
new file mode 100644
index 0000000..e55cafa
--- /dev/null
+++ b/kernels/double_precision_check.cl
@@ -0,0 +1,11 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+__kernel void
+double_precision_check(__global float* src, __global float* dst)
+{
+  int id = (int)get_global_id(0);
+  double d0 = 0.12345678912345678 + src[1];
+  double d1 = 0.12355678922345678 + src[0];
+  float rem = d1 - d0;
+  dst[id] = rem;
+}
diff --git a/kernels/empty.cl b/kernels/empty.cl
new file mode 100644
index 0000000..fd6f298
--- /dev/null
+++ b/kernels/empty.cl
@@ -0,0 +1 @@
+__kernel void empty() {}
diff --git a/kernels/image_1D_buffer.cl b/kernels/image_1D_buffer.cl
new file mode 100644
index 0000000..e8e0a86
--- /dev/null
+++ b/kernels/image_1D_buffer.cl
@@ -0,0 +1,13 @@
+__kernel void image_1D_buffer(image1d_buffer_t image1, image1d_t image2, sampler_t sampler, __global int *results)
+{
+   int x = get_global_id(0);
+   int offset = x;
+
+   int4 col = read_imagei(image1, x);
+   int4 test = (col != read_imagei(image2, sampler, x));
+
+   if (test.x || test.y || test.z || test.w)
+      results[offset] = 0;
+   else
+      results[offset] = 1;
+}
diff --git a/kernels/include/runtime_compile_link_inc.h b/kernels/include/runtime_compile_link_inc.h
new file mode 100644
index 0000000..9b66850
--- /dev/null
+++ b/kernels/include/runtime_compile_link_inc.h
@@ -0,0 +1,4 @@
+int greater(long x, long y)
+{
+  return x > y ;
+}
diff --git a/kernels/my_test.cl b/kernels/my_test.cl
new file mode 100644
index 0000000..91f1821
--- /dev/null
+++ b/kernels/my_test.cl
@@ -0,0 +1,26 @@
+__kernel void
+my_test(__global int2 *src, __global int *offsets, __global uint2 *dst, int w)
+{
+	int i, index, j;
+	uint2 out;
+	unsigned int a, b, c, d;
+	int2 rle;
+	int gid = get_global_id(0);
+	index = offsets[gid];
+	int i0 = 0;
+	rle = src[index];
+	for (i = 0; i < w; i++, i0 += 8) {
+			if (i0+0 >= rle.x) { index++; rle = src[index]; } a = rle.y;
+			if (i0+1 >= rle.x) { index++; rle = src[index]; } b = rle.y;
+			if (i0+2 >= rle.x) { index++; rle = src[index]; } c = rle.y;
+			if (i0+3 >= rle.x) { index++; rle = src[index]; } d = rle.y;
+			out.x = (d<<24)|(c<<16)|(b<<8)|(a);
+			if (i0+4 >= rle.x) { index++; rle = src[index]; } a = rle.y;
+			if (i0+5 >= rle.x) { index++; rle = src[index]; } b = rle.y;
+			if (i0+6 >= rle.x) { index++; rle = src[index]; } c = rle.y;
+			if (i0+7 >= rle.x) { index++; rle = src[index]; } d = rle.y;
+			out.y = (d<<24)|(c<<16)|(b<<8)|(a);
+
+		dst[gid*w + i] = out;
+	}
+}
diff --git a/kernels/null_kernel_arg.cl b/kernels/null_kernel_arg.cl
new file mode 100644
index 0000000..68a4280
--- /dev/null
+++ b/kernels/null_kernel_arg.cl
@@ -0,0 +1,9 @@
+__kernel void
+null_kernel_arg(__global unsigned int *dst, __global unsigned int * mask_global, __constant unsigned int* mask_const)
+{
+  if(dst && mask_global==0 && mask_const == NULL)
+  {
+    uint idx = (uint)get_global_id(0);
+    dst[idx] = idx;
+  }
+}
diff --git a/kernels/runtime_compile_link.h b/kernels/runtime_compile_link.h
new file mode 100644
index 0000000..ae2c56e
--- /dev/null
+++ b/kernels/runtime_compile_link.h
@@ -0,0 +1 @@
+int comp_long(long x, long y);
diff --git a/kernels/runtime_compile_link_a.cl b/kernels/runtime_compile_link_a.cl
new file mode 100644
index 0000000..b17861f
--- /dev/null
+++ b/kernels/runtime_compile_link_a.cl
@@ -0,0 +1,13 @@
+#include "runtime_compile_link.h"
+#include "include/runtime_compile_link_inc.h"
+
+int comp_long(long x, long y)
+{
+  return x < y ;
+}
+
+kernel void runtime_compile_link_a(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  int j = comp_long(src1[i], src2[i]);
+  dst[i] = j ? 3 : 4;
+}
diff --git a/kernels/runtime_compile_link_b.cl b/kernels/runtime_compile_link_b.cl
new file mode 100644
index 0000000..89b5a2d
--- /dev/null
+++ b/kernels/runtime_compile_link_b.cl
@@ -0,0 +1,9 @@
+#include "runtime_compile_link.h"
+#include "include/runtime_compile_link_inc.h"
+
+kernel void runtime_compile_link_b(global long *src1, global long *src2, global long *dst) {
+  int i = get_global_id(0);
+  int j = comp_long(src1[i], src2[i]);
+  dst[i] = j ? 3 : 4;
+  int k = greater(src1[i], src2[i]);
+}
diff --git a/kernels/test_cl_finish.cl b/kernels/test_cl_finish.cl
new file mode 100644
index 0000000..723949c
--- /dev/null
+++ b/kernels/test_cl_finish.cl
@@ -0,0 +1,12 @@
+
+
+__kernel void
+test_cl_finish(__global int *src, __global int *dst, int n, int num_threads)
+{
+	int tid, pos;
+
+	tid = get_global_id(0);
+	for (pos=tid; pos < n; pos+=num_threads) {
+		dst[pos] = src[pos];
+	}
+}
diff --git a/kernels/test_copy_buffer.cl b/kernels/test_copy_buffer.cl
new file mode 100644
index 0000000..6f2fd22
--- /dev/null
+++ b/kernels/test_copy_buffer.cl
@@ -0,0 +1,6 @@
+__kernel void
+test_copy_buffer(__global float* src, __global float* dst)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = src[id];
+}
diff --git a/kernels/test_copy_buffer_row.cl b/kernels/test_copy_buffer_row.cl
new file mode 100644
index 0000000..e33380f
--- /dev/null
+++ b/kernels/test_copy_buffer_row.cl
@@ -0,0 +1,8 @@
+__kernel void
+test_copy_buffer_row(__global int *src, __global int *dst, __global int *data)
+{
+  int row = data[0];
+  int size = data[1];
+  int id = (int) get_global_id(0);
+  for (; id < size; id += row) dst[id] = src[id];
+}
diff --git a/kernels/test_copy_image.cl b/kernels/test_copy_image.cl
new file mode 100644
index 0000000..a5ee5e8
--- /dev/null
+++ b/kernels/test_copy_image.cl
@@ -0,0 +1,10 @@
+__kernel void
+test_copy_image(__read_only image2d_t src, __write_only image2d_t dst, sampler_t sampler)
+{
+  int2 coord;
+  int4 color;
+  coord.x = (int)get_global_id(0);
+  coord.y = (int)get_global_id(1);
+  color = read_imagei(src, sampler, coord);
+  write_imagei(dst, coord, color);
+}
diff --git a/kernels/test_copy_image1.cl b/kernels/test_copy_image1.cl
new file mode 100644
index 0000000..28e7a7d
--- /dev/null
+++ b/kernels/test_copy_image1.cl
@@ -0,0 +1,33 @@
+#define S(A,B,C) CLK_NORMALIZED_COORDS_##A | CLK_ADDRESS_##B | CLK_FILTER_##C
+
+#define COPY_IMAGE(_dst, _sampler, scoord, dcoord) \
+  color = read_imagei(src, _sampler, scoord);\
+  write_imagei(_dst, dcoord, color)
+
+__kernel void
+test_copy_image1(__read_only image2d_t src,
+                 __write_only image2d_t dst0,
+                 sampler_t sampler0,
+                 __write_only image2d_t dst1,
+                 __write_only image2d_t dst2,
+                 __write_only image2d_t dst3,
+                 __write_only image2d_t dst4,
+                 float w_inv, float h_inv)
+{
+  const sampler_t sampler1 = S(FALSE, REPEAT, NEAREST);
+  const sampler_t sampler2 = S(FALSE, CLAMP, NEAREST);
+  const sampler_t sampler3 = S(FALSE, MIRRORED_REPEAT, NEAREST);
+  const sampler_t sampler4 = S(TRUE, REPEAT, NEAREST);
+  int2 coord;
+  float2 fcoord;
+  int4 color;
+  coord.x = (int)get_global_id(0);
+  coord.y = (int)get_global_id(1);
+  fcoord.x = coord.x * w_inv;
+  fcoord.y = coord.y * h_inv;
+  COPY_IMAGE(dst0, sampler0, coord, coord);
+  COPY_IMAGE(dst1, sampler1, coord, coord);
+  COPY_IMAGE(dst2, sampler2, coord, coord);
+  COPY_IMAGE(dst3, sampler3, coord, coord);
+  COPY_IMAGE(dst4, sampler4, fcoord, coord);
+}
diff --git a/kernels/test_copy_image_1d.cl b/kernels/test_copy_image_1d.cl
new file mode 100644
index 0000000..88428bb
--- /dev/null
+++ b/kernels/test_copy_image_1d.cl
@@ -0,0 +1,9 @@
+__kernel void
+test_copy_image_1d(__read_only image1d_t src, __write_only image1d_t dst, sampler_t sampler)
+{
+  int coord;
+  int4 color;
+  coord = (int)get_global_id(0);
+  color = read_imagei(src, sampler, coord);
+  write_imagei(dst, coord, color);
+}
diff --git a/kernels/test_copy_image_3d.cl b/kernels/test_copy_image_3d.cl
new file mode 100644
index 0000000..103fb69
--- /dev/null
+++ b/kernels/test_copy_image_3d.cl
@@ -0,0 +1,28 @@
+__kernel void
+test_copy_image_3d(__read_only image3d_t src,
+                   __write_only image3d_t dst,
+                   sampler_t sampler,
+                   __write_only image2d_t buf0,
+                   __write_only image2d_t buf1,
+                   __write_only image2d_t buf2,
+                   __write_only image2d_t buf3)
+{
+  int4 coord;
+  int2 coord2;
+  float4 color;
+  coord.x = (int)get_global_id(0);
+  coord.y = (int)get_global_id(1);
+  coord.z = (int)get_global_id(2);
+  coord2.x = coord.x;
+  coord2.y = coord.y;
+  color = read_imagef(src, sampler, coord);
+  write_imagef(dst, coord, color);
+  if (coord.z == 0)
+    write_imagef(buf0, coord2, color);
+  else if (coord.z == 1)
+    write_imagef(buf1, coord2, color);
+  else if (coord.z == 2)
+    write_imagef(buf2, coord2, color);
+  else if (coord.z == 3)
+    write_imagef(buf3, coord2, color);
+}
diff --git a/kernels/test_fill_gl_image.cl b/kernels/test_fill_gl_image.cl
new file mode 100644
index 0000000..4250a57
--- /dev/null
+++ b/kernels/test_fill_gl_image.cl
@@ -0,0 +1,11 @@
+__kernel void
+test_fill_gl_image(image2d_t img, int color)
+{
+	int2 coord;
+        float4 color_v4;
+        coord.x = get_global_id(0);
+        coord.y = get_global_id(1);
+        color_v4 = (float4){((color >> 24) & 0xFF), (color >> 16) & 0xFF, (color >> 8) & 0xFF, color & 0xFF};
+        color_v4 = color_v4 / 255.0f;
+        write_imagef(img, coord, color_v4);
+}
diff --git a/kernels/test_fill_image.cl b/kernels/test_fill_image.cl
new file mode 100644
index 0000000..3760568
--- /dev/null
+++ b/kernels/test_fill_image.cl
@@ -0,0 +1,13 @@
+__kernel void
+test_fill_image(__write_only image2d_t dst, uint color)
+{
+  int2 coord;
+  int4 color4;
+  color4.s0  = (color >> 24) & 0xFF;
+  color4.s1  = (color >> 16) & 0xFF;
+  color4.s2  = (color >> 8) & 0xFF;
+  color4.s3  = color & 0xFF;
+  coord.x = (int)get_global_id(0);
+  coord.y = (int)get_global_id(1);
+  write_imagei(dst, coord, color4);
+}
diff --git a/kernels/test_fill_image0.cl b/kernels/test_fill_image0.cl
new file mode 100644
index 0000000..9428092
--- /dev/null
+++ b/kernels/test_fill_image0.cl
@@ -0,0 +1,9 @@
+__kernel void
+test_fill_image0(__write_only image2d_t dst)
+{
+  int2 coord;
+  coord.x = (int)get_global_id(0);
+  coord.y = (int)get_global_id(1);
+  int4 color4 = {coord.y & 0xFF, (coord.y & 0xFF00) >> 8, coord.x & 0xFF, (coord.x & 0xFF00) >> 8};
+  write_imagei(dst, coord, color4);
+}
diff --git a/kernels/test_fill_image_1d.cl b/kernels/test_fill_image_1d.cl
new file mode 100644
index 0000000..db922af
--- /dev/null
+++ b/kernels/test_fill_image_1d.cl
@@ -0,0 +1,8 @@
+__kernel void
+test_fill_image_1d(__write_only image1d_t dst)
+{
+  int coord;
+  coord = (int)get_global_id(0);
+  uint4 color4 = {0, 1, 2 ,3};
+  write_imageui(dst, coord, color4);
+}
diff --git a/kernels/test_fill_image_3d.cl b/kernels/test_fill_image_3d.cl
new file mode 100644
index 0000000..4988f69
--- /dev/null
+++ b/kernels/test_fill_image_3d.cl
@@ -0,0 +1,14 @@
+__kernel void
+test_fill_image_3d(__write_only image3d_t dst, uint color)
+{
+  int4 coord;
+  int4 color4;
+  color4.s0  = (color >> 24) & 0xFF;
+  color4.s1  = (color >> 16) & 0xFF;
+  color4.s2  = (color >> 8) & 0xFF;
+  color4.s3  = color & 0xFF;
+  coord.x = (int)get_global_id(0);
+  coord.y = (int)get_global_id(1);
+  coord.z = (int)get_global_id(2);
+  write_imagei(dst, coord, color4);
+}
diff --git a/kernels/test_fill_image_3d_2.cl b/kernels/test_fill_image_3d_2.cl
new file mode 100644
index 0000000..1f9eaa1
--- /dev/null
+++ b/kernels/test_fill_image_3d_2.cl
@@ -0,0 +1,10 @@
+__kernel void
+test_fill_image_3d_2(__write_only image3d_t dst)
+{
+  int4 coord;
+  int4 color4 = {0x12, 0x34, 0x56, 0x78};
+  coord.x = (int)get_global_id(0);
+  coord.y = (int)get_global_id(1);
+  coord.z = (int)get_global_id(2);
+  write_imagei(dst, coord, color4);
+}
diff --git a/kernels/test_get_arg_info.cl b/kernels/test_get_arg_info.cl
new file mode 100644
index 0000000..43a804b
--- /dev/null
+++ b/kernels/test_get_arg_info.cl
@@ -0,0 +1,8 @@
+typedef struct _test_arg_struct {
+    int a;
+    int b;
+}test_arg_struct;
+
+kernel void test_get_arg_info(read_only global float const volatile *src, read_write local int read_only *dst, test_arg_struct extra) {
+
+}
diff --git a/kernels/test_get_image_info.cl b/kernels/test_get_image_info.cl
new file mode 100644
index 0000000..8f69b75
--- /dev/null
+++ b/kernels/test_get_image_info.cl
@@ -0,0 +1,13 @@
+__kernel void
+test_get_image_info(__write_only image3d_t src, __global int *size, __global int *fmt)
+{
+  int id = (int)get_global_id(0);
+  int w, h, depth;
+  w = get_image_width(src);
+  h = get_image_height(src);
+  depth = get_image_depth(src);
+  int channel_data_type = get_image_channel_data_type(src);
+  int channel_order = get_image_channel_order(src);
+  size[id] = (w << 20 | h << 8  | depth);
+  fmt[id] = (channel_data_type << 16 | channel_order);
+}
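The two words written by test_get_image_info are bit-packed; a small host-side decode sketch is shown below (illustrative, the field widths simply follow from the shifts used in the kernel).

    #include <stdint.h>

    /* Unpack size[id] = (w << 20 | h << 8 | depth) and
     * fmt[id] = (channel_data_type << 16 | channel_order). */
    static void decode_image_info(int32_t size_word, int32_t fmt_word,
                                  int *w, int *h, int *depth,
                                  int *channel_data_type, int *channel_order)
    {
      *depth = size_word & 0xFF;          /* bits  0..7  */
      *h     = (size_word >> 8) & 0xFFF;  /* bits  8..19 */
      *w     = size_word >> 20;           /* bits 20..31 */
      *channel_order     = fmt_word & 0xFFFF;
      *channel_data_type = fmt_word >> 16;
    }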
diff --git a/kernels/test_get_image_info_array.cl b/kernels/test_get_image_info_array.cl
new file mode 100644
index 0000000..333da77
--- /dev/null
+++ b/kernels/test_get_image_info_array.cl
@@ -0,0 +1,25 @@
+__kernel void
+test_get_image_info_array(__write_only image1d_array_t a1, __write_only image2d_array_t a2, __global int *result)
+{
+  int w, h, array_sz;
+
+  w = get_image_width(a1);
+  array_sz = (int)get_image_array_size(a1);
+  int channel_data_type = get_image_channel_data_type(a1);
+  int channel_order = get_image_channel_order(a1);
+  result[0] = w;
+  result[1] = array_sz;
+  result[2] = channel_data_type;
+  result[3] = channel_order;
+
+  w = get_image_width(a2);
+  h = get_image_height(a2);
+  array_sz = (int)get_image_array_size(a2);
+  channel_data_type = get_image_channel_data_type(a2);
+  channel_order = get_image_channel_order(a2);
+  result[4] = w;
+  result[5] = h;
+  result[6] = array_sz;
+  result[7] = channel_data_type;
+  result[8] = channel_order;
+}
diff --git a/kernels/test_movforphi_undef.cl b/kernels/test_movforphi_undef.cl
new file mode 100644
index 0000000..035c02a
--- /dev/null
+++ b/kernels/test_movforphi_undef.cl
@@ -0,0 +1,18 @@
+__kernel void
+test_movforphi_undef(__read_only image2d_t src, __write_only image2d_t dst, sampler_t sampler)
+{
+  int2 coord, dstCoord;
+  int4 color;
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+  dstCoord.x = x;
+  dstCoord.y = y;
+  coord.y = y;
+  for(int j = -8; j < 2; j++)
+  {
+    coord.x = j + x;
+    color = read_imagei(src, sampler, coord);
+    if (j == 1 + x)
+      write_imagei(dst, dstCoord, color);
+  }
+}
diff --git a/kernels/test_printf.cl b/kernels/test_printf.cl
new file mode 100644
index 0000000..84bb478
--- /dev/null
+++ b/kernels/test_printf.cl
@@ -0,0 +1,38 @@
+__kernel void
+test_printf(void)
+{
+  int x = (int)get_global_id(0);
+  int y = (int)get_global_id(1);
+  int z = (int)get_global_id(2);
+  uint a = 'x';
+  float f = 5.0f;
+  int3 vec;
+  vec.x = x;
+  vec.y = y;
+  vec.z = z;
+
+  if (x == 0 && y == 0 && z == 0) {
+    printf("--- Welcome to the printf test of %s ---\n", "Intel Beignet");
+
+    printf("### output a char is %c\n", a);
+  }
+
+  if (x % 15 == 0)
+    if (y % 3 == 0)
+      if (z % 7 == 0)
+        printf("######## global_id(x, y, z) = %v3d, global_size(d0, d1, d3) = (%d, %d, %d)\n",
+                vec, get_global_size(0), get_global_size(1), get_global_size(2));
+
+  if (x == 1)
+    if (y == 0) {
+      if (z % 2 == 0)
+          printf("#### output a float is %f\n", f);
+      else
+          printf("#### output a float to int is %d\n", f);
+    }
+
+  if (x == 0 && y == 0 && z == 0) {
+    printf("--- End to the printf test ---\n");
+  }
+
+}
diff --git a/kernels/test_write_only.cl b/kernels/test_write_only.cl
new file mode 100644
index 0000000..27c7acb
--- /dev/null
+++ b/kernels/test_write_only.cl
@@ -0,0 +1,6 @@
+__kernel void
+test_write_only(__global int *dst)
+{
+  int id = (int)get_global_id(0);
+  dst[id] = id;
+}
diff --git a/setup_fulsim_hsw.sh b/setup_fulsim_hsw.sh
new file mode 100644
index 0000000..140be66
--- /dev/null
+++ b/setup_fulsim_hsw.sh
@@ -0,0 +1,5 @@
+export INTEL_DEVID_OVERRIDE=0x0094
+export DEVICE=hsw_m0
+export OCL_FULSIM_RUN=1
+export OCL_FULSIM_DEBUG_MODE=$1
+
diff --git a/setup_fulsim_ivb.sh b/setup_fulsim_ivb.sh
new file mode 100644
index 0000000..9df9082
--- /dev/null
+++ b/setup_fulsim_ivb.sh
@@ -0,0 +1,5 @@
+export INTEL_DEVID_OVERRIDE=0x0166     # or, 0x0112
+export DEVICE=ivb_m_gt2                #     snb_gt2 for SNB GT2 desktop
+export OCL_SIMULATOR=1                 # 0 -> HW, 1 -> fulsim, 2 -> perfsim
+export OCL_FULSIM_DEBUG_MODE=$1
+
diff --git a/setup_perfsim_ivb.sh b/setup_perfsim_ivb.sh
new file mode 100644
index 0000000..4cfdd1a
--- /dev/null
+++ b/setup_perfsim_ivb.sh
@@ -0,0 +1,4 @@
+export INTEL_DEVID_OVERRIDE=0x0166     # or, 0x0112
+export DEVICE=ivb_m_gt2                #     snb_gt2 for SNB GT2 desktop
+export OCL_SIMULATOR=2                 # 0 -> HW, 1 -> fulsim, 2 -> perfsim
+
diff --git a/src/.gitignore b/src/.gitignore
new file mode 100644
index 0000000..fc1479e
--- /dev/null
+++ b/src/.gitignore
@@ -0,0 +1,2 @@
+OCLConfig.h
+libcl.so
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..ce16a8c
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,126 @@
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}
+                    ${DRM_INCLUDE_DIRS}
+                    ${DRM_INCLUDE_DIRS}/../
+                    ${CMAKE_CURRENT_SOURCE_DIR}/../backend/src/backend/
+                    ${CMAKE_CURRENT_SOURCE_DIR}/../include
+                    ${MESA_SOURCE_INCLUDES})
+
+macro (MakeKernelBinStr KERNEL_PATH KERNEL_FILES)
+foreach (KF ${KERNEL_FILES})
+  set (input_file ${KERNEL_PATH}/${KF}.cl)
+  set (output_file ${KERNEL_PATH}/${KF}_str.c)
+  list (APPEND KERNEL_STR_FILES ${output_file})
+  if(GEN_PCI_ID)
+    add_custom_command(
+      OUTPUT ${output_file}
+      COMMAND rm -rf ${output_file}
+      COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file} -t${GEN_PCI_ID}
+      DEPENDS ${input_file} ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater)
+  else(GEN_PCI_ID)
+    add_custom_command(
+      OUTPUT ${output_file}
+      COMMAND rm -rf ${output_file}
+      COMMAND ${GBE_BIN_GENERATER} -s ${input_file} -o${output_file}
+      DEPENDS ${input_file} ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater)
+  endif(GEN_PCI_ID)
+endforeach (KF)
+endmacro (MakeKernelBinStr)
+
+macro (MakeBuiltInKernelStr KERNEL_PATH KERNEL_FILES)
+  set (output_file ${KERNEL_PATH}/${BUILT_IN_NAME}.cl)
+  set (file_content)
+  file (REMOVE ${output_file})
+  foreach (KF ${KERNEL_NAMES})
+    set (input_file ${KERNEL_PATH}/${KF}.cl)
+    file(READ ${input_file} file_content )
+    STRING(REGEX REPLACE ";" "\\\\;" file_content "${file_content}")
+    file(APPEND ${output_file} ${file_content})
+  endforeach (KF)
+endmacro (MakeBuiltInKernelStr)
+
+set (KERNEL_STR_FILES)
+set (KERNEL_NAMES cl_internal_copy_buf_align4
+cl_internal_copy_buf_align16 cl_internal_copy_buf_unalign_same_offset
+cl_internal_copy_buf_unalign_dst_offset cl_internal_copy_buf_unalign_src_offset
+cl_internal_copy_buf_rect cl_internal_copy_image_1d_to_1d cl_internal_copy_image_2d_to_2d
+cl_internal_copy_image_3d_to_2d cl_internal_copy_image_2d_to_3d cl_internal_copy_image_3d_to_3d
+cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_3d_to_buffer
+cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d
+cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
+cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign
+cl_internal_fill_buf_align128 cl_internal_fill_image_1d
+cl_internal_fill_image_1d_array cl_internal_fill_image_2d
+cl_internal_fill_image_2d_array cl_internal_fill_image_3d)
+set (BUILT_IN_NAME  cl_internal_built_in_kernel)
+MakeBuiltInKernelStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
+MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
+MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${BUILT_IN_NAME}")
+
+set(OPENCL_SRC
+    ${KERNEL_STR_FILES}
+    cl_api.c
+    cl_alloc.c
+    cl_kernel.c
+    cl_program.c
+    cl_gbe_loader.cpp
+    cl_sampler.c
+    cl_event.c
+    cl_enqueue.c
+    cl_image.c
+    cl_mem.c
+    cl_platform_id.c
+    cl_extensions.c
+    cl_device_id.c
+    cl_context.c
+    cl_command_queue.c
+    cl_command_queue.h
+    cl_command_queue_gen7.c
+    cl_thread.c
+    cl_driver.h
+    cl_driver.cpp
+    cl_driver_defs.c
+    intel/intel_gpgpu.c
+    intel/intel_batchbuffer.c
+    intel/intel_driver.c
+    performance.c)
+
+if (X11_FOUND)
+  set(CMAKE_CXX_FLAGS "-DHAS_X11 ${CMAKE_CXX_FLAGS}")
+  set(CMAKE_C_FLAGS "-DHAS_X11 ${CMAKE_C_FLAGS}")
+  set(OPENCL_SRC
+      ${OPENCL_SRC}
+      x11/dricommon.c
+      x11/va_dri2.c)
+endif (X11_FOUND)
+
+if (EGL_FOUND AND MESA_SOURCE_FOUND)
+set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c x11/mesa_egl_extension.c x11/mesa_egl_res_share.c intel/intel_dri_resource_sharing.c)
+SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS}")
+SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS}")
+SET(OPTIONAL_EGL_LIBRARY "${EGL_LIBRARY}")
+else(EGL_FOUND AND MESA_SOURCE_FOUND)
+SET(OPTIONAL_EGL_LIBRARY "")
+endif (EGL_FOUND AND MESA_SOURCE_FOUND)
+
+if (OCLIcd_FOUND)
+set (OPENCL_SRC ${OPENCL_SRC} cl_khr_icd.c)
+SET(CMAKE_CXX_FLAGS "-DHAS_OCLIcd ${CMAKE_CXX_FLAGS}")
+SET(CMAKE_C_FLAGS "-DHAS_OCLIcd ${CMAKE_C_FLAGS}")
+endif (OCLIcd_FOUND)
+
+SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-Bsymbolic,--allow-shlib-undefined")
+
+link_directories (${LLVM_LIBRARY_DIR} ${DRM_LIBDIR})
+add_library(cl SHARED ${OPENCL_SRC})
+target_link_libraries(
+                      cl
+                      ${X11_LIBRARIES}
+                      ${XEXT_LIBRARIES}
+                      ${XFIXES_LIBRARIES}
+                      ${DRM_INTEL_LIBRARIES}
+                      ${DRM_LIBRARIES}
+                      ${CMAKE_THREAD_LIBS_INIT}
+                      ${CMAKE_DL_LIBS}
+                      ${OPENGL_LIBRARIES}
+                      ${OPTIONAL_EGL_LIBRARY})
+install (TARGETS cl LIBRARY DESTINATION ${BEIGNET_INSTALL_DIR})
diff --git a/src/OCLConfig.h.in b/src/OCLConfig.h.in
new file mode 100644
index 0000000..71de4b3
--- /dev/null
+++ b/src/OCLConfig.h.in
@@ -0,0 +1,6 @@
+// the configured options and settings for LIBCL
+#define LIBCL_DRIVER_VERSION_MAJOR @LIBCL_DRIVER_VERSION_MAJOR@
+#define LIBCL_DRIVER_VERSION_MINOR @LIBCL_DRIVER_VERSION_MINOR@
+#define LIBCL_DRIVER_VERSION_PATCH @LIBCL_DRIVER_VERSION_PATCH@
+#define LIBCL_C_VERSION_MAJOR @LIBCL_C_VERSION_MAJOR@
+#define LIBCL_C_VERSION_MINOR @LIBCL_C_VERSION_MINOR@
diff --git a/src/cl_alloc.c b/src/cl_alloc.c
new file mode 100644
index 0000000..93d2e6a
--- /dev/null
+++ b/src/cl_alloc.c
@@ -0,0 +1,88 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_alloc.h"
+#include "cl_utils.h"
+
+#include <stdlib.h>
+#include <assert.h>
+#include <malloc.h>
+
+static volatile int32_t cl_alloc_n = 0;
+
+LOCAL void*
+cl_malloc(size_t sz)
+{
+  void * p = NULL;
+  atomic_inc(&cl_alloc_n);
+  p = malloc(sz);
+  assert(p);
+  return p;
+}
+
+LOCAL void*
+cl_aligned_malloc(size_t sz, size_t align)
+{
+  void * p = NULL;
+  atomic_inc(&cl_alloc_n);
+  p = memalign(align, sz);
+  assert(p);
+  return p;
+}
+
+LOCAL void*
+cl_calloc(size_t n, size_t elem_size)
+{
+  void *p = NULL;
+  atomic_inc(&cl_alloc_n);
+  p = calloc(n, elem_size);
+  assert(p);
+  return p;
+}
+
+LOCAL void*
+cl_realloc(void *ptr, size_t sz)
+{
+  if (ptr == NULL)
+    atomic_inc(&cl_alloc_n);
+  return realloc(ptr, sz);
+}
+
+LOCAL void
+cl_free(void *ptr)
+{
+  if (ptr == NULL)
+    return;
+  atomic_dec(&cl_alloc_n);
+  free(ptr);
+  ptr = NULL;
+}
+
+LOCAL size_t
+cl_report_unfreed(void)
+{
+  return cl_alloc_n;
+}
+
+LOCAL void
+cl_report_set_all_freed(void)
+{
+  cl_alloc_n = 0;
+}
+
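cl_alloc.c pairs every allocation with an atomic counter so unfreed blocks can be reported at teardown; the same pattern in a few lines of standalone C is sketched below (illustrative only, using C11 atomics instead of the driver's own atomic_inc/atomic_dec helpers).

    #include <stdatomic.h>
    #include <stdlib.h>
    #include <stdio.h>

    static atomic_int alloc_n = 0;

    /* Count every allocation and every matching free. */
    static void *counted_malloc(size_t sz) { atomic_fetch_add(&alloc_n, 1); return malloc(sz); }
    static void counted_free(void *p) { if (p) { atomic_fetch_sub(&alloc_n, 1); free(p); } }

    int main(void)
    {
      void *a = counted_malloc(16), *b = counted_malloc(32);
      counted_free(a);
      /* b intentionally leaked here: the counter reports one unfreed block. */
      printf("unfreed allocations: %d\n", atomic_load(&alloc_n));
      counted_free(b);
      return 0;
    }

The driver's cl_report_unfreed() above exposes the same counter so callers can check for leaks.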
diff --git a/src/cl_alloc.h b/src/cl_alloc.h
new file mode 100644
index 0000000..9b463ed
--- /dev/null
+++ b/src/cl_alloc.h
@@ -0,0 +1,47 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_ALLOC_H__
+#define __CL_ALLOC_H__
+
+#include "cl_internals.h"
+#include <stdlib.h>
+
+/* Return a valid pointer for the requested memory block size */
+extern void *cl_malloc(size_t sz);
+
+/* Aligned malloc */
+extern void* cl_aligned_malloc(size_t sz, size_t align);
+
+/* malloc + memzero */
+extern void *cl_calloc(size_t n, size_t elem_size);
+
+/* Regular realloc */
+extern void *cl_realloc(void *ptr, size_t sz);
+
+/* Free a pointer allocated with cl_*alloc */
+extern void  cl_free(void *ptr);
+
+/* We count the number of allocations. This function reports the number of
+ * allocations still unfreed
+ */
+extern size_t cl_report_unfreed(void);
+
+#endif /* __CL_ALLOC_H__ */
+
diff --git a/src/cl_api.c b/src/cl_api.c
new file mode 100644
index 0000000..630511f
--- /dev/null
+++ b/src/cl_api.c
@@ -0,0 +1,3341 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_platform_id.h"
+#include "cl_device_id.h"
+#include "cl_context.h"
+#include "cl_command_queue.h"
+#include "cl_enqueue.h"
+#include "cl_event.h"
+#include "cl_program.h"
+#include "cl_kernel.h"
+#include "cl_mem.h"
+#include "cl_image.h"
+#include "cl_sampler.h"
+#include "cl_alloc.h"
+#include "cl_utils.h"
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+#include "CL/cl_intel.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+
+#include "performance.h"
+
+#ifndef CL_VERSION_1_2
+#define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)
+#define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)
+#define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)
+#define CL_MEM_HOST_READ_ONLY                       (1 << 8)
+#define CL_MEM_HOST_NO_ACCESS                       (1 << 9)
+typedef intptr_t cl_device_partition_property;
+#endif
+
+#define FILL_GETINFO_RET(TYPE, ELT, VAL, RET) \
+	do { \
+	  if (param_value && param_value_size < sizeof(TYPE)*ELT) \
+	      return CL_INVALID_VALUE;  \
+	  if (param_value) { \
+	      memcpy(param_value, (VAL), sizeof(TYPE)*ELT); \
+	  } \
+          \
+	  if (param_value_size_ret) \
+	      *param_value_size_ret = sizeof(TYPE)*ELT; \
+	  return RET; \
+	} while(0)
+
+inline cl_int
+handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
+              cl_event* event, enqueue_data* data, cl_command_type type)
+{
+  cl_int status = cl_event_wait_events(num, wait_list, queue);
+  cl_event e = NULL;
+  if(event != NULL || status == CL_ENQUEUE_EXECUTE_DEFER) {
+    e = cl_event_new(queue->ctx, queue, type, event!=NULL);
+
+    /* If profiling is needed, record the submit timestamp here. */
+    if (e->type != CL_COMMAND_USER &&
+	    e->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+	cl_event_get_timestamp(e, CL_PROFILING_COMMAND_QUEUED);
+    }
+
+    if(event != NULL)
+      *event = e;
+    if(status == CL_ENQUEUE_EXECUTE_DEFER) {
+      cl_event_new_enqueue_callback(e, data, num, wait_list);
+    }
+  }
+  queue->current_event = e;
+  return status;
+}
+
+/* The following overlap check is taken from the appendix of the OpenCL 1.1 spec */
+inline cl_bool check_copy_overlap(const size_t src_offset[3],
+                                  const size_t dst_offset[3],
+                                  const size_t region[3],
+                                  size_t row_pitch, size_t slice_pitch)
+{
+  const size_t src_min[] = {src_offset[0], src_offset[1], src_offset[2]};
+  const size_t src_max[] = {src_offset[0] + region[0],
+                            src_offset[1] + region[1],
+                            src_offset[2] + region[2]};
+  const size_t dst_min[] = {dst_offset[0], dst_offset[1], dst_offset[2]};
+  const size_t dst_max[] = {dst_offset[0] + region[0],
+                            dst_offset[1] + region[1],
+                            dst_offset[2] + region[2]};
+  // Check for overlap
+  cl_bool overlap = CL_TRUE;
+  unsigned i;
+  size_t dst_start = dst_offset[2] * slice_pitch +
+                     dst_offset[1] * row_pitch + dst_offset[0];
+  size_t dst_end = dst_start + (region[2] * slice_pitch +
+                   region[1] * row_pitch + region[0]);
+  size_t src_start = src_offset[2] * slice_pitch +
+                     src_offset[1] * row_pitch + src_offset[0];
+  size_t src_end = src_start + (region[2] * slice_pitch +
+                   region[1] * row_pitch + region[0]);
+
+  for (i=0; i != 3; ++i) {
+    overlap = overlap && (src_min[i] < dst_max[i])
+                      && (src_max[i] > dst_min[i]);
+  }
+
+  if (!overlap) {
+    size_t delta_src_x = (src_offset[0] + region[0] > row_pitch) ?
+                          src_offset[0] + region[0] - row_pitch : 0;
+    size_t delta_dst_x = (dst_offset[0] + region[0] > row_pitch) ?
+                          dst_offset[0] + region[0] - row_pitch : 0;
+    if ( (delta_src_x > 0 && delta_src_x > dst_offset[0]) ||
+         (delta_dst_x > 0 && delta_dst_x > src_offset[0]) ) {
+      if ( (src_start <= dst_start && dst_start < src_end) ||
+           (dst_start <= src_start && src_start < dst_end) )
+        overlap = CL_TRUE;
+    }
+    if (region[2] > 1) {
+      size_t src_height = slice_pitch / row_pitch;
+      size_t dst_height = slice_pitch / row_pitch;
+      size_t delta_src_y = (src_offset[1] + region[1] > src_height) ?
+                            src_offset[1] + region[1] - src_height : 0;
+      size_t delta_dst_y = (dst_offset[1] + region[1] > dst_height) ?
+                            dst_offset[1] + region[1] - dst_height : 0;
+      if ( (delta_src_y > 0 && delta_src_y > dst_offset[1]) ||
+           (delta_dst_y > 0 && delta_dst_y > src_offset[1]) ) {
+        if ( (src_start <= dst_start && dst_start < src_end) ||
+             (dst_start <= src_start && src_start < dst_end) )
+          overlap = CL_TRUE;
+      }
+    }
+  }
+  return overlap;
+}
+
+static cl_int
+cl_check_device_type(cl_device_type device_type)
+{
+  const cl_device_type valid =  CL_DEVICE_TYPE_GPU
+                              | CL_DEVICE_TYPE_CPU
+                              | CL_DEVICE_TYPE_ACCELERATOR
+                              | CL_DEVICE_TYPE_DEFAULT
+                              | CL_DEVICE_TYPE_CUSTOM;
+
+  if( (device_type & valid) == 0) {
+    return CL_INVALID_DEVICE_TYPE;
+  }
+  if(UNLIKELY(!(device_type & CL_DEVICE_TYPE_DEFAULT) && !(device_type & CL_DEVICE_TYPE_GPU)))
+    return CL_DEVICE_NOT_FOUND;
+
+  return CL_SUCCESS;
+}
+
+static cl_int
+cl_device_id_is_ok(const cl_device_id device)
+{
+  if(UNLIKELY(device == NULL)) return CL_FALSE;
+  return device != cl_get_gt_device() ? CL_FALSE : CL_TRUE;
+}
+
+cl_int
+clGetPlatformIDs(cl_uint          num_entries,
+                 cl_platform_id * platforms,
+                 cl_uint *        num_platforms)
+{
+  if(UNLIKELY(platforms == NULL && num_platforms == NULL))
+    return CL_INVALID_VALUE;
+  if(UNLIKELY(num_entries == 0 && platforms != NULL))
+    return CL_INVALID_VALUE;
+
+  return cl_get_platform_ids(num_entries, platforms, num_platforms);
+}
+
+cl_int
+clGetPlatformInfo(cl_platform_id    platform,
+                  cl_platform_info  param_name,
+                  size_t            param_value_size,
+                  void *            param_value,
+                  size_t *          param_value_size_ret)
+{
+  /* Only one platform. This is easy */
+  if (UNLIKELY(platform != NULL && platform != intel_platform))
+    return CL_INVALID_PLATFORM;
+
+  return cl_get_platform_info(platform,
+                              param_name,
+                              param_value_size,
+                              param_value,
+                              param_value_size_ret);
+}
+
+cl_int
+clGetDeviceIDs(cl_platform_id platform,
+               cl_device_type device_type,
+               cl_uint        num_entries,
+               cl_device_id * devices,
+               cl_uint *      num_devices)
+{
+  cl_int err = CL_SUCCESS;
+
+  /* Check parameter consistency */
+  if (UNLIKELY(devices == NULL && num_devices == NULL))
+    return CL_INVALID_VALUE;
+  if (UNLIKELY(platform && platform != intel_platform))
+    return CL_INVALID_PLATFORM;
+  if (UNLIKELY(devices && num_entries == 0))
+    return CL_INVALID_VALUE;
+
+  err = cl_check_device_type(device_type);
+  if(err != CL_SUCCESS)
+    return err;
+
+  return cl_get_device_ids(platform,
+                           device_type,
+                           num_entries,
+                           devices,
+                           num_devices);
+}
+
+cl_int
+clGetDeviceInfo(cl_device_id   device,
+                cl_device_info param_name,
+                size_t         param_value_size,
+                void *         param_value,
+                size_t *       param_value_size_ret)
+{
+  return cl_get_device_info(device,
+                            param_name,
+                            param_value_size,
+                            param_value,
+                            param_value_size_ret);
+}
+
+cl_int
+clCreateSubDevices(cl_device_id                         in_device,
+                   const cl_device_partition_property * properties,
+                   cl_uint                              num_devices,
+                   cl_device_id *                       out_devices,
+                   cl_uint *                            num_devices_ret)
+{
+  /* Check parameter consistency */
+  if (UNLIKELY(out_devices == NULL && num_devices_ret == NULL))
+    return CL_INVALID_VALUE;
+  if (UNLIKELY(in_device == NULL && properties == NULL))
+    return CL_INVALID_VALUE;
+
+  *num_devices_ret = 0;
+  return CL_INVALID_DEVICE_PARTITION_COUNT;
+}
+
+cl_int
+clRetainDevice(cl_device_id device)
+{
+  // XXX stub for C++ Bindings
+  return CL_SUCCESS;
+}
+
+cl_int
+clReleaseDevice(cl_device_id device)
+{
+  // XXX stub for C++ Bindings
+  return CL_SUCCESS;
+}
+
+cl_context
+clCreateContext(const cl_context_properties *  properties,
+                cl_uint                        num_devices,
+                const cl_device_id *           devices,
+                void (* pfn_notify) (const char*, const void*, size_t, void*),
+                void *                         user_data,
+                cl_int *                       errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+  cl_context context = NULL;
+
+  /* Assert parameters correctness */
+  INVALID_VALUE_IF (devices == NULL);
+  INVALID_VALUE_IF (num_devices == 0);
+  INVALID_VALUE_IF (pfn_notify == NULL && user_data != NULL);
+
+  /* Now check if the user is asking for the right device */
+  INVALID_DEVICE_IF (cl_device_id_is_ok(*devices) == CL_FALSE);
+
+  context = cl_create_context(properties,
+                           num_devices,
+                           devices,
+                           pfn_notify,
+                           user_data,
+                           &err);
+  initialize_env_var();
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return context;
+}
+
+cl_context
+clCreateContextFromType(const cl_context_properties *  properties,
+                        cl_device_type                 device_type,
+                        void (CL_CALLBACK *pfn_notify) (const char *, const void *, size_t, void *),
+                        void *                         user_data,
+                        cl_int *                       errcode_ret)
+{
+  cl_context context = NULL;
+  cl_int err = CL_SUCCESS;
+  cl_device_id devices[1];
+  cl_uint num_devices = 1;
+
+  INVALID_VALUE_IF (pfn_notify == NULL && user_data != NULL);
+
+  err = cl_check_device_type(device_type);
+  if(err != CL_SUCCESS) {
+    goto error;
+  }
+
+  err = cl_get_device_ids(NULL,
+                          device_type,
+                          1,
+                          &devices[0],
+                          &num_devices);
+  if (err != CL_SUCCESS) {
+    goto error;
+  }
+
+  context = cl_create_context(properties,
+                              num_devices,
+                              devices,
+                              pfn_notify,
+                              user_data,
+                              &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return context;
+}
+
+cl_int
+clRetainContext(cl_context context)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+  cl_context_add_ref(context);
+error:
+  return err;
+}
+
+cl_int
+clReleaseContext(cl_context context)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+  cl_context_delete(context);
+error:
+  return err;
+}
+
+cl_int
+clGetContextInfo(cl_context      context,
+                 cl_context_info param_name,
+                 size_t          param_value_size,
+                 void *          param_value,
+                 size_t *        param_value_size_ret)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+
+  if (param_name == CL_CONTEXT_DEVICES) {
+    FILL_GETINFO_RET (cl_device_id, 1, &context->device, CL_SUCCESS);
+  } else if (param_name == CL_CONTEXT_NUM_DEVICES) {
+    cl_uint n = 1;
+    FILL_GETINFO_RET (cl_uint, 1, &n, CL_SUCCESS);
+  } else if (param_name == CL_CONTEXT_REFERENCE_COUNT) {
+    cl_uint ref = context->ref_n;
+    FILL_GETINFO_RET (cl_uint, 1, &ref, CL_SUCCESS);
+  } else if (param_name == CL_CONTEXT_PROPERTIES) {
+    if(context->prop_len > 0) {
+      FILL_GETINFO_RET (cl_context_properties, context->prop_len, context->prop_user, CL_SUCCESS);
+    } else {
+      cl_context_properties n = 0;
+      FILL_GETINFO_RET (cl_context_properties, 1, &n, CL_SUCCESS);
+    }
+  } else {
+    return CL_INVALID_VALUE;
+  }
+
+error:
+  return err;
+}
+
+cl_command_queue
+clCreateCommandQueue(cl_context                   context,
+                     cl_device_id                 device,
+                     cl_command_queue_properties  properties,
+                     cl_int *                     errcode_ret)
+{
+  cl_command_queue queue = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+
+  INVALID_DEVICE_IF (device != context->device);
+  INVALID_VALUE_IF (properties & ~(CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE));
+
+  if(properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {/*not supported now.*/
+    err = CL_INVALID_QUEUE_PROPERTIES;
+    goto error;
+  }
+
+  queue = cl_context_create_queue(context, device, properties, &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return queue;
+}
+
+cl_int
+clRetainCommandQueue(cl_command_queue command_queue)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_QUEUE (command_queue);
+  cl_command_queue_add_ref(command_queue);
+error:
+  return err;
+}
+
+cl_int
+clReleaseCommandQueue(cl_command_queue command_queue)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_QUEUE (command_queue);
+  cl_command_queue_delete(command_queue);
+error:
+  return err;
+}
+
+cl_int
+clGetCommandQueueInfo(cl_command_queue       command_queue,
+                      cl_command_queue_info  param_name,
+                      size_t                 param_value_size,
+                      void *                 param_value,
+                      size_t *               param_value_size_ret)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_QUEUE (command_queue);
+
+  if (param_name == CL_QUEUE_CONTEXT) {
+    FILL_GETINFO_RET (cl_context, 1, &command_queue->ctx, CL_SUCCESS);
+  } else if (param_name == CL_QUEUE_DEVICE) {
+    FILL_GETINFO_RET (cl_device_id, 1, &command_queue->ctx->device, CL_SUCCESS);
+  } else if (param_name == CL_QUEUE_REFERENCE_COUNT) {
+    cl_uint ref = command_queue->ref_n;
+    FILL_GETINFO_RET (cl_uint, 1, &ref, CL_SUCCESS);
+  } else if (param_name == CL_QUEUE_PROPERTIES) {
+    FILL_GETINFO_RET (cl_command_queue_properties, 1, &command_queue->props, CL_SUCCESS);
+  } else {
+    return CL_INVALID_VALUE;
+  }
+
+error:
+  return err;
+}
+
+cl_mem
+clCreateBuffer(cl_context    context,
+               cl_mem_flags  flags,
+               size_t        size,
+               void *        host_ptr,
+               cl_int *      errcode_ret)
+{
+  cl_mem mem = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+
+  mem = cl_mem_new_buffer(context, flags, size, host_ptr, &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem;
+}
+
+cl_mem
+clCreateSubBuffer(cl_mem                buffer,
+                  cl_mem_flags          flags,
+                  cl_buffer_create_type buffer_create_type,
+                  const void *          buffer_create_info,
+                  cl_int *              errcode_ret)
+{
+  cl_mem mem = NULL;
+  cl_int err = CL_SUCCESS;
+
+  CHECK_MEM(buffer);
+
+  mem = cl_mem_new_sub_buffer(buffer, flags, buffer_create_type,
+                       buffer_create_info, &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem;
+}
+
+cl_mem
+clCreateImage(cl_context context,
+              cl_mem_flags flags,
+              const cl_image_format *image_format,
+              const cl_image_desc *image_desc,
+              void *host_ptr,
+              cl_int * errcode_ret)
+{
+  cl_mem mem = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+  if (image_format == NULL) {
+    err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+    goto error;
+  }
+  if (image_format->image_channel_order < CL_R ||
+          image_format->image_channel_order > CL_RGBx) {
+    err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+    goto error;
+  }
+  if (image_format->image_channel_data_type < CL_SNORM_INT8 ||
+          image_format->image_channel_data_type > CL_FLOAT) {
+    err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+    goto error;
+  }
+
+  if (image_desc == NULL) {
+    err = CL_INVALID_IMAGE_DESCRIPTOR;
+    goto error;
+  }
+  if (image_desc->image_type <= CL_MEM_OBJECT_BUFFER ||
+          image_desc->image_type > CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+    err = CL_INVALID_IMAGE_DESCRIPTOR;
+    goto error;
+  }
+  /* buffer refers to a valid buffer memory object if image_type is
+     CL_MEM_OBJECT_IMAGE1D_BUFFER. Otherwise it must be NULL. */
+  if (image_desc->image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER &&
+         image_desc->buffer) {
+    err = CL_INVALID_IMAGE_DESCRIPTOR;
+    goto error;
+  }
+  if (image_desc->num_mip_levels || image_desc->num_samples) {
+    err = CL_INVALID_IMAGE_DESCRIPTOR;
+    goto error;
+  }
+
+  /* The remaining image_desc checks are left to the image creation routine. */
+  mem = cl_mem_new_image(context,
+                         flags,
+                         image_format,
+                         image_desc,
+                         host_ptr,
+                         &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem;
+}
+
+cl_mem
+clCreateImage2D(cl_context              context,
+                cl_mem_flags            flags,
+                const cl_image_format * image_format,
+                size_t                  image_width,
+                size_t                  image_height,
+                size_t                  image_row_pitch,
+                void *                  host_ptr,
+                cl_int *                errcode_ret)
+{
+  cl_mem mem = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+  cl_image_desc image_desc;
+  memset(&image_desc, 0, sizeof(image_desc));
+
+  image_desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  image_desc.image_width = image_width;
+  image_desc.image_height = image_height;
+  image_desc.image_row_pitch = image_row_pitch;
+
+  mem = cl_mem_new_image(context,
+                         flags,
+                         image_format,
+                         &image_desc,
+                         host_ptr,
+                         &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem;
+}
+
+cl_mem
+clCreateImage3D(cl_context              context,
+                cl_mem_flags            flags,
+                const cl_image_format * image_format,
+                size_t                  image_width,
+                size_t                  image_height,
+                size_t                  image_depth,
+                size_t                  image_row_pitch,
+                size_t                  image_slice_pitch,
+                void *                  host_ptr,
+                cl_int *                errcode_ret)
+{
+  cl_mem mem = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+  cl_image_desc image_desc;
+
+  image_desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+  image_desc.image_width = image_width;
+  image_desc.image_height = image_height;
+  image_desc.image_depth = image_depth;
+  image_desc.image_row_pitch = image_row_pitch;
+  image_desc.image_slice_pitch = image_slice_pitch;
+
+  mem = cl_mem_new_image(context,
+                         flags,
+                         image_format,
+                         &image_desc,
+                         host_ptr,
+                         &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem;
+}
+
+cl_int
+clRetainMemObject(cl_mem memobj)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_MEM (memobj);
+  cl_mem_add_ref(memobj);
+error:
+  return err;
+}
+
+cl_int
+clReleaseMemObject(cl_mem memobj)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_MEM (memobj);
+  cl_mem_delete(memobj);
+error:
+  return err;
+}
+
+cl_int
+clGetSupportedImageFormats(cl_context         ctx,
+                           cl_mem_flags       flags,
+                           cl_mem_object_type image_type,
+                           cl_uint            num_entries,
+                           cl_image_format *  image_formats,
+                           cl_uint *          num_image_formats)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (ctx);
+  if (UNLIKELY(num_entries == 0 && image_formats != NULL)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  if (UNLIKELY(image_type != CL_MEM_OBJECT_IMAGE1D &&
+               image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY &&
+               image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY &&
+               image_type != CL_MEM_OBJECT_IMAGE2D &&
+               image_type != CL_MEM_OBJECT_IMAGE3D)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  err = cl_image_get_supported_fmt(ctx,
+                                   image_type,
+                                   num_entries,
+                                   image_formats,
+                                   num_image_formats);
+
+error:
+  return err;
+}
+
+cl_int
+clGetMemObjectInfo(cl_mem      memobj,
+                   cl_mem_info param_name,
+                   size_t      param_value_size,
+                   void *      param_value,
+                   size_t *    param_value_size_ret)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_MEM(memobj);
+
+  err = cl_get_mem_object_info(memobj,
+                               param_name,
+                               param_value_size,
+                               param_value,
+                               param_value_size_ret);
+error:
+  return err;
+}
+
+cl_int
+clGetImageInfo(cl_mem         mem,
+               cl_image_info  param_name,
+               size_t         param_value_size,
+               void *         param_value,
+               size_t *       param_value_size_ret)
+{
+  return cl_get_image_info(mem,
+                           param_name,
+                           param_value_size,
+                           param_value,
+                           param_value_size_ret);
+}
+
+cl_int
+clSetMemObjectDestructorCallback(cl_mem  memobj,
+                                 void (CL_CALLBACK *pfn_notify) (cl_mem, void*),
+                                 void * user_data)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_MEM(memobj);
+  INVALID_VALUE_IF (pfn_notify == 0);
+
+  cl_mem_dstr_cb *cb = (cl_mem_dstr_cb*)malloc(sizeof(cl_mem_dstr_cb));
+  if (!cb) {
+    err = CL_OUT_OF_HOST_MEMORY;
+    goto error;
+  }
+
+  memset(cb, 0, sizeof(cl_mem_dstr_cb));
+  cb->pfn_notify = pfn_notify;
+  cb->user_data = user_data;
+  cb->next = memobj->dstr_cb;
+  memobj->dstr_cb = cb;
+
+error:
+  return err;
+}
+
+cl_sampler
+clCreateSampler(cl_context         context,
+                cl_bool            normalized,
+                cl_addressing_mode addressing,
+                cl_filter_mode     filter,
+                cl_int *           errcode_ret)
+{
+  cl_sampler sampler = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+  sampler = cl_sampler_new(context, normalized, addressing, filter, &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return sampler;
+}
+
+cl_int
+clRetainSampler(cl_sampler sampler)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_SAMPLER (sampler);
+  cl_sampler_add_ref(sampler);
+error:
+  return err;
+}
+
+cl_int
+clReleaseSampler(cl_sampler sampler)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_SAMPLER (sampler);
+  cl_sampler_delete(sampler);
+error:
+  return err;
+}
+
+cl_int
+clGetSamplerInfo(cl_sampler       sampler,
+                 cl_sampler_info  param_name,
+                 size_t           param_value_size,
+                 void *           param_value,
+                 size_t *         param_value_size_ret)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_SAMPLER (sampler);
+
+  if (param_name == CL_SAMPLER_REFERENCE_COUNT) {
+    FILL_GETINFO_RET (cl_uint, 1, (cl_uint*)&sampler->ref_n, CL_SUCCESS);
+  } else if (param_name == CL_SAMPLER_CONTEXT) {
+    FILL_GETINFO_RET (cl_context, 1, &sampler->ctx, CL_SUCCESS);
+  } else if (param_name == CL_SAMPLER_NORMALIZED_COORDS) {
+    FILL_GETINFO_RET (cl_bool, 1, &sampler->normalized_coords, CL_SUCCESS);
+  } else if (param_name == CL_SAMPLER_ADDRESSING_MODE) {
+    FILL_GETINFO_RET (cl_addressing_mode, 1, &sampler->address, CL_SUCCESS);
+  } else if (param_name == CL_SAMPLER_FILTER_MODE ) {
+    FILL_GETINFO_RET (cl_filter_mode, 1, &sampler->filter, CL_SUCCESS);
+  } else{
+    return CL_INVALID_VALUE;
+  }
+
+error:
+  return err;
+}
+
+cl_program
+clCreateProgramWithSource(cl_context     context,
+                          cl_uint        count,
+                          const char **  strings,
+                          const size_t * lengths,
+                          cl_int *       errcode_ret)
+{
+  cl_program program = NULL;
+  cl_int err = CL_SUCCESS;
+  cl_uint i;
+
+  CHECK_CONTEXT (context);
+  INVALID_VALUE_IF (count == 0);
+  INVALID_VALUE_IF (strings == NULL);
+  for(i = 0; i < count; i++) {
+    if(UNLIKELY(strings[i] == NULL)) {
+      err = CL_INVALID_VALUE;
+      goto error;
+    }
+  }
+  program = cl_program_create_from_source(context,
+                                          count,
+                                          strings,
+                                          lengths,
+                                          &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return program;
+}
+
+cl_program
+clCreateProgramWithBinary(cl_context             context,
+                          cl_uint                num_devices,
+                          const cl_device_id *   devices,
+                          const size_t *         lengths,
+                          const unsigned char ** binaries,
+                          cl_int *               binary_status,
+                          cl_int *               errcode_ret)
+{
+  cl_program program = NULL;
+  cl_int err = CL_SUCCESS;
+
+  CHECK_CONTEXT (context);
+  program = cl_program_create_from_binary(context,
+                                          num_devices,
+                                          devices,
+                                          lengths,
+                                          binaries,
+                                          binary_status,
+                                          &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return program;
+}
+
+cl_program
+clCreateProgramWithBuiltInKernels(cl_context           context,
+                                  cl_uint              num_devices,
+                                  const cl_device_id * device_list,
+                                  const char *         kernel_names,
+                                  cl_int *             errcode_ret)
+{
+  cl_program program = NULL;
+  cl_int err = CL_SUCCESS;
+
+  CHECK_CONTEXT (context);
+  INVALID_VALUE_IF (kernel_names == NULL);
+  program = cl_program_create_with_built_in_kernles(context,
+                                                    num_devices,
+                                                    device_list,
+                                                    kernel_names,
+                                                    &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return program;
+}
+
+cl_int
+clRetainProgram(cl_program program)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_PROGRAM (program);
+  cl_program_add_ref(program);
+error:
+  return err;
+}
+
+cl_int
+clReleaseProgram(cl_program program)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_PROGRAM (program);
+  cl_program_delete(program);
+error:
+  return err;
+}
+
+cl_int
+clBuildProgram(cl_program            program,
+               cl_uint               num_devices,
+               const cl_device_id *  device_list,
+               const char *          options,
+               void (CL_CALLBACK *pfn_notify) (cl_program, void*),
+               void *                user_data)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_PROGRAM(program);
+  INVALID_VALUE_IF (num_devices > 1);
+  INVALID_VALUE_IF (num_devices == 0 && device_list != NULL);
+  INVALID_VALUE_IF (num_devices != 0 && device_list == NULL);
+  INVALID_VALUE_IF (pfn_notify  == 0 && user_data   != NULL);
+
+  /* Everything is easy. We only support one device anyway */
+  if (num_devices != 0) {
+    assert(program->ctx);
+    INVALID_DEVICE_IF (device_list[0] != program->ctx->device);
+  }
+
+  /* TODO: support creating a program from binary */
+  assert(program->source_type == FROM_LLVM ||
+         program->source_type == FROM_SOURCE ||
+         program->source_type == FROM_BINARY);
+  if((err = cl_program_build(program, options)) != CL_SUCCESS) {
+    goto error;
+  }
+  program->is_built = CL_TRUE;
+
+  if (pfn_notify) pfn_notify(program, user_data);
+
+error:
+  return err;
+}
+
+cl_int
+clCompileProgram(cl_program            program ,
+                 cl_uint               num_devices ,
+                 const cl_device_id *  device_list ,
+                 const char *          options ,
+                 cl_uint               num_input_headers ,
+                 const cl_program *    input_headers ,
+                 const char **         header_include_names ,
+                 void (CL_CALLBACK *   pfn_notify )(cl_program, void *),
+                 void *                user_data )
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_PROGRAM(program);
+  INVALID_VALUE_IF (num_devices > 1);
+  INVALID_VALUE_IF (num_devices == 0 && device_list != NULL);
+  INVALID_VALUE_IF (num_devices != 0 && device_list == NULL);
+  INVALID_VALUE_IF (pfn_notify  == 0 && user_data   != NULL);
+  INVALID_VALUE_IF (num_input_headers == 0 && input_headers != NULL);
+  INVALID_VALUE_IF (num_input_headers != 0 && input_headers == NULL);
+
+  /* Everything is easy. We only support one device anyway */
+  if (num_devices != 0) {
+    assert(program->ctx);
+    INVALID_DEVICE_IF (device_list[0] != program->ctx->device);
+  }
+
+  /* TODO: support creating a program from binary */
+  assert(program->source_type == FROM_LLVM ||
+      program->source_type == FROM_SOURCE ||
+      program->source_type == FROM_BINARY);
+  if((err = cl_program_compile(program, num_input_headers, input_headers, header_include_names, options)) != CL_SUCCESS) {
+    goto error;
+  }
+  program->is_built = CL_TRUE;
+
+  if (pfn_notify) pfn_notify(program, user_data);
+
+error:
+  return err;
+}
+
+cl_program
+clLinkProgram(cl_context            context,
+              cl_uint               num_devices,
+              const cl_device_id *  device_list,
+              const char *          options,
+              cl_uint               num_input_programs,
+              const cl_program *    input_programs,
+              void (CL_CALLBACK *   pfn_notify)(cl_program  program, void * user_data),
+              void *                user_data,
+              cl_int *              errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+  cl_program program = NULL;
+  CHECK_CONTEXT (context);
+  INVALID_VALUE_IF (num_devices > 1);
+  INVALID_VALUE_IF (num_devices == 0 && device_list != NULL);
+  INVALID_VALUE_IF (num_devices != 0 && device_list == NULL);
+  INVALID_VALUE_IF (pfn_notify  == 0 && user_data   != NULL);
+  INVALID_VALUE_IF (num_input_programs == 0 && input_programs != NULL);
+  INVALID_VALUE_IF (num_input_programs != 0 && input_programs == NULL);
+
+  program = cl_program_link(context, num_input_programs, input_programs, options, &err);
+
+  program->is_built = CL_TRUE;
+
+  if (pfn_notify) pfn_notify(program, user_data);
+
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return program;
+}
+
+cl_int
+clUnloadCompiler(void)
+{
+  return CL_SUCCESS;
+}
+
+cl_int
+clUnloadPlatformCompiler(cl_platform_id platform)
+{
+  return CL_SUCCESS;
+}
+
+cl_int
+clGetProgramInfo(cl_program       program,
+                 cl_program_info  param_name,
+                 size_t           param_value_size,
+                 void *           param_value,
+                 size_t *         param_value_size_ret)
+{
+  cl_int err = CL_SUCCESS;
+  char * ret_str = "";
+
+  CHECK_PROGRAM (program);
+
+  if (param_name == CL_PROGRAM_REFERENCE_COUNT) {
+    cl_uint ref = program->ref_n;
+    FILL_GETINFO_RET (cl_uint, 1, (&ref), CL_SUCCESS);
+  } else if (param_name == CL_PROGRAM_CONTEXT) {
+    cl_context context = program->ctx;
+    FILL_GETINFO_RET (cl_context, 1, &context, CL_SUCCESS);
+  } else if (param_name == CL_PROGRAM_NUM_DEVICES) {
+    cl_uint num_dev = 1; // Just 1 dev now.
+    FILL_GETINFO_RET (cl_uint, 1, &num_dev, CL_SUCCESS);
+  } else if (param_name == CL_PROGRAM_DEVICES) {
+    cl_device_id dev_id = program->ctx->device;
+    FILL_GETINFO_RET (cl_device_id, 1, &dev_id, CL_SUCCESS);
+  } else if (param_name == CL_PROGRAM_NUM_KERNELS) {
+    cl_uint kernels_num = program->ker_n;
+    FILL_GETINFO_RET (cl_uint, 1, &kernels_num, CL_SUCCESS);
+  } else if (param_name == CL_PROGRAM_SOURCE) {
+
+    if (!program->source)
+      FILL_GETINFO_RET (char, 1, &ret_str, CL_SUCCESS);
+    FILL_GETINFO_RET (char, (strlen(program->source) + 1),
+                   program->source, CL_SUCCESS);
+  } else if(param_name == CL_PROGRAM_KERNEL_NAMES) {
+    cl_program_get_kernel_names(program, param_value_size, (char *)param_value, param_value_size_ret);
+  } else if (param_name == CL_PROGRAM_BINARY_SIZES) {
+    if (program->binary == NULL){
+      if( program->binary_type == CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
+        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 0);
+      }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT) {
+        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 1);
+      }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY) {
+        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 2);
+      }else{
+        return CL_INVALID_BINARY;
+      }
+    }
+
+    if (program->binary == NULL || program->binary_sz == 0) {
+      return CL_OUT_OF_RESOURCES;
+    }
+    FILL_GETINFO_RET (size_t, 1, (&program->binary_sz), CL_SUCCESS);
+  } else if (param_name == CL_PROGRAM_BINARIES) {
+    if (param_value_size_ret)
+      *param_value_size_ret = sizeof(void*);
+    if (!param_value)
+      return CL_SUCCESS;
+
+    /* param_value points to an array of n
+       pointers allocated by the caller */
+    if (program->binary == NULL) {
+      if( program->binary_type == CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
+        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 0);
+      }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT) {
+        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 1);
+      }else if( program->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY) {
+        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 2);
+      }else{
+        return CL_INVALID_BINARY;
+      }
+    }
+
+    if (program->binary == NULL || program->binary_sz == 0) {
+      return CL_OUT_OF_RESOURCES;
+    }
+
+    memcpy(*((void **)param_value), program->binary, program->binary_sz);
+    return CL_SUCCESS;
+  } else {
+    return CL_INVALID_VALUE;
+  }
+
+error:
+    return err;
+}
+
+cl_int
+clGetProgramBuildInfo(cl_program             program,
+                      cl_device_id           device,
+                      cl_program_build_info  param_name,
+                      size_t                 param_value_size,
+                      void *                 param_value,
+                      size_t *               param_value_size_ret)
+{
+  cl_int err = CL_SUCCESS;
+  char * ret_str = "";
+
+  CHECK_PROGRAM (program);
+  INVALID_DEVICE_IF (device != program->ctx->device);
+
+  if (param_name == CL_PROGRAM_BUILD_STATUS) {
+    FILL_GETINFO_RET (cl_build_status, 1, &program->build_status, CL_SUCCESS);
+  } else if (param_name == CL_PROGRAM_BUILD_OPTIONS) {
+    if (program->is_built && program->build_opts)
+      ret_str = program->build_opts;
+
+    FILL_GETINFO_RET (char, (strlen(ret_str)+1), ret_str, CL_SUCCESS);
+  } else if (param_name == CL_PROGRAM_BUILD_LOG) {
+    FILL_GETINFO_RET (char, program->build_log_sz + 1, program->build_log, CL_SUCCESS);
+    if (param_value_size_ret)
+      *param_value_size_ret = program->build_log_sz + 1;
+  } else if (param_name == CL_PROGRAM_BINARY_TYPE) {
+    FILL_GETINFO_RET (cl_uint, 1, &program->binary_type, CL_SUCCESS);
+  } else {
+    return CL_INVALID_VALUE;
+  }
+
+error:
+    return err;
+}
+
+cl_kernel
+clCreateKernel(cl_program   program,
+               const char * kernel_name,
+               cl_int *     errcode_ret)
+{
+  cl_kernel kernel = NULL;
+  cl_int err = CL_SUCCESS;
+
+  CHECK_PROGRAM (program);
+  if (program->ker_n <= 0) {
+    err = CL_INVALID_PROGRAM_EXECUTABLE;
+    goto error;
+  }
+  INVALID_VALUE_IF (kernel_name == NULL);
+  kernel = cl_program_create_kernel(program, kernel_name, &err);
+
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return kernel;
+}
+
+cl_int
+clCreateKernelsInProgram(cl_program      program,
+                         cl_uint         num_kernels,
+                         cl_kernel *     kernels,
+                         cl_uint *       num_kernels_ret)
+{
+  cl_int err = CL_SUCCESS;
+
+  CHECK_PROGRAM (program);
+  if (program->ker_n <= 0) {
+    err = CL_INVALID_PROGRAM_EXECUTABLE;
+    goto error;
+  }
+  if (kernels && num_kernels < program->ker_n) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
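+  /* A NULL kernels array means the caller only queries the number of kernels. */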
+  if(num_kernels_ret)
+    *num_kernels_ret = program->ker_n;
+
+  if(kernels)
+    err = cl_program_create_kernels_in_program(program, kernels);
+
+error:
+  return err;
+}
+
+cl_int
+clRetainKernel(cl_kernel kernel)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_KERNEL(kernel);
+  cl_kernel_add_ref(kernel);
+error:
+  return err;
+}
+
+cl_int
+clReleaseKernel(cl_kernel kernel)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_KERNEL(kernel);
+  cl_kernel_delete(kernel);
+error:
+  return err;
+}
+
+cl_int
+clSetKernelArg(cl_kernel     kernel,
+               cl_uint       arg_index,
+               size_t        arg_size,
+               const void *  arg_value)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_KERNEL(kernel);
+  err = cl_kernel_set_arg(kernel, arg_index, arg_size, arg_value);
+error:
+  return err;
+}
+
+cl_int clGetKernelArgInfo(cl_kernel kernel, cl_uint arg_index, cl_kernel_arg_info param_name,
+        size_t param_value_size, void *param_value, size_t *param_value_size_ret)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_KERNEL(kernel);
+
+  if (param_name != CL_KERNEL_ARG_ADDRESS_QUALIFIER
+          && param_name != CL_KERNEL_ARG_ACCESS_QUALIFIER
+          && param_name != CL_KERNEL_ARG_TYPE_NAME
+          && param_name != CL_KERNEL_ARG_TYPE_QUALIFIER
+          && param_name != CL_KERNEL_ARG_NAME) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (arg_index >= kernel->arg_n) {
+    err = CL_INVALID_ARG_INDEX;
+    goto error;
+  }
+
+  err = cl_get_kernel_arg_info(kernel, arg_index, param_name, param_value_size,
+          param_value, param_value_size_ret);
+
+error:
+  return err;
+}
+
+cl_int
+clGetKernelInfo(cl_kernel        kernel,
+                cl_kernel_info   param_name,
+                size_t           param_value_size,
+                void *           param_value,
+                size_t *         param_value_size_ret)
+{
+  cl_int err = CL_SUCCESS;
+
+  CHECK_KERNEL(kernel);
+
+  if (param_name == CL_KERNEL_CONTEXT) {
+    FILL_GETINFO_RET (cl_context, 1, &kernel->program->ctx, CL_SUCCESS);
+  } else if (param_name == CL_KERNEL_PROGRAM) {
+    FILL_GETINFO_RET (cl_program, 1, &kernel->program, CL_SUCCESS);
+  } else if (param_name == CL_KERNEL_NUM_ARGS) {
+    cl_uint n = kernel->arg_n;
+    FILL_GETINFO_RET (cl_uint, 1, &n, CL_SUCCESS);
+  } else if (param_name == CL_KERNEL_REFERENCE_COUNT) {
+    cl_int ref = kernel->ref_n;
+    FILL_GETINFO_RET (cl_int, 1, &ref, CL_SUCCESS);
+  } else if (param_name == CL_KERNEL_FUNCTION_NAME) {
+    const char * n = cl_kernel_get_name(kernel);
+    FILL_GETINFO_RET (cl_char, strlen(n)+1, n, CL_SUCCESS);
+  } else if (param_name == CL_KERNEL_ATTRIBUTES) {
+    const char * n = cl_kernel_get_attributes(kernel);
+    FILL_GETINFO_RET (cl_char, strlen(n)+1, n, CL_SUCCESS);
+  } else {
+    return CL_INVALID_VALUE;
+  }
+
+error:
+  return err;
+}
+
+cl_int
+clGetKernelWorkGroupInfo(cl_kernel                   kernel,
+                         cl_device_id                device,
+                         cl_kernel_work_group_info   param_name,
+                         size_t                      param_value_size,
+                         void *                      param_value,
+                         size_t *                    param_value_size_ret)
+{
+  return cl_get_kernel_workgroup_info(kernel,
+                                      device,
+                                      param_name,
+                                      param_value_size,
+                                      param_value,
+                                      param_value_size_ret);
+}
+
+cl_int
+clWaitForEvents(cl_uint          num_events,
+                const cl_event * event_list)
+{
+  cl_int err = CL_SUCCESS;
+  cl_context ctx = NULL;
+
+  if(num_events > 0 && event_list)
+    ctx = event_list[0]->ctx;
+
+  TRY(cl_event_check_waitlist, num_events, event_list, NULL, ctx);
+
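+  /* Commands on the wait list may still be deferred: poll until they have all executed. */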
+  while(cl_event_wait_events(num_events, event_list, NULL) == CL_ENQUEUE_EXECUTE_DEFER) {
+    usleep(8000);       //sleep 8 ms to wait for the other threads
+  }
+
+error:
+  return err;
+}
+
+cl_int
+clGetEventInfo(cl_event      event,
+               cl_event_info param_name,
+               size_t        param_value_size,
+               void *        param_value,
+               size_t *      param_value_size_ret)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_EVENT(event);
+
+  if (param_name == CL_EVENT_COMMAND_QUEUE) {
+    FILL_GETINFO_RET (cl_command_queue, 1, &event->queue, CL_SUCCESS);
+  } else if (param_name == CL_EVENT_CONTEXT) {
+    FILL_GETINFO_RET (cl_context, 1, &event->ctx, CL_SUCCESS);
+  } else if (param_name == CL_EVENT_COMMAND_TYPE) {
+    FILL_GETINFO_RET (cl_command_type, 1, &event->type, CL_SUCCESS);
+  } else if (param_name == CL_EVENT_COMMAND_EXECUTION_STATUS) {
+    cl_event_update_status(event, 0);
+    FILL_GETINFO_RET (cl_int, 1, &event->status, CL_SUCCESS);
+  } else if (param_name == CL_EVENT_REFERENCE_COUNT) {
+    cl_uint ref = event->ref_n;
+    FILL_GETINFO_RET (cl_int, 1, &ref, CL_SUCCESS);
+  } else {
+    return CL_INVALID_VALUE;
+  }
+
+error:
+  return err;
+
+}
+
+cl_event
+clCreateUserEvent(cl_context context,
+                  cl_int *   errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+  cl_event event = NULL;
+  CHECK_CONTEXT(context);
+
+  TRY_ALLOC(event, cl_event_new(context, NULL, CL_COMMAND_USER, CL_TRUE));
+
+error:
+  if(errcode_ret)
+    *errcode_ret = err;
+  return event;
+}
+
+cl_int
+clRetainEvent(cl_event  event)
+{
+  cl_int err = CL_SUCCESS;
+
+  CHECK_EVENT(event);
+  cl_event_add_ref(event);
+
+error:
+  return err;
+}
+
+cl_int
+clReleaseEvent(cl_event  event)
+{
+  cl_int err = CL_SUCCESS;
+
+  CHECK_EVENT(event);
+  cl_event_delete(event);
+
+error:
+  return err;
+}
+
+cl_int
+clSetUserEventStatus(cl_event    event,
+                     cl_int      execution_status)
+{
+  cl_int err = CL_SUCCESS;
+
+  CHECK_EVENT(event);
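+  /* A user event can only be set once, to CL_COMPLETE or to a negative error code. */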
+  if(execution_status > CL_COMPLETE) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  if(event->status != CL_SUBMITTED) {
+    err = CL_INVALID_OPERATION;
+    goto error;
+  }
+
+  cl_event_set_status(event, execution_status);
+error:
+  return err;
+}
+
+cl_int
+clSetEventCallback(cl_event     event,
+                   cl_int       command_exec_callback_type,
+                   void (CL_CALLBACK * pfn_notify) (cl_event, cl_int, void *),
+                   void *       user_data)
+{
+  cl_int err = CL_SUCCESS;
+
+  CHECK_EVENT(event);
+  if((pfn_notify == NULL) ||
+    (command_exec_callback_type > CL_SUBMITTED) ||
+    (command_exec_callback_type < CL_COMPLETE)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  err = cl_event_set_callback(event, command_exec_callback_type, pfn_notify, user_data);
+
+error:
+  return err;
+
+}
+
+cl_int
+clGetEventProfilingInfo(cl_event             event,
+                        cl_profiling_info    param_name,
+                        size_t               param_value_size,
+                        void *               param_value,
+                        size_t *             param_value_size_ret)
+{
+  cl_int err = CL_SUCCESS;
+  cl_ulong ret_val;
+
+  CHECK_EVENT(event);
+
+  if (event->type == CL_COMMAND_USER ||
+      !(event->queue->props & CL_QUEUE_PROFILING_ENABLE) ||
+          event->status != CL_COMPLETE) {
+    err = CL_PROFILING_INFO_NOT_AVAILABLE;
+    goto error;
+  }
+
+  if (param_value && param_value_size < sizeof(cl_ulong)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
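+  /* timestamp[0..3] hold the QUEUED, SUBMIT, START and END times, respectively. */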
+  if (param_name == CL_PROFILING_COMMAND_QUEUED) {
+    ret_val = event->timestamp[0];
+  } else if (param_name == CL_PROFILING_COMMAND_SUBMIT) {
+    ret_val = event->timestamp[1];
+  } else if (param_name == CL_PROFILING_COMMAND_START) {
+    err = cl_event_get_timestamp(event, CL_PROFILING_COMMAND_START);
+    ret_val = event->timestamp[2];
+  } else if (param_name == CL_PROFILING_COMMAND_END) {
+    err = cl_event_get_timestamp(event, CL_PROFILING_COMMAND_END);
+    ret_val = event->timestamp[3];
+  } else {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (err == CL_SUCCESS) {
+    if (param_value)
+      *(cl_ulong*)param_value = ret_val;
+    if (param_value_size_ret)
+      *param_value_size_ret = sizeof(cl_ulong);
+  }
+error:
+  return err;
+}
+
+cl_int
+clFlush(cl_command_queue command_queue)
+{
+  /* Nothing to do for now: clEnqueueNDRangeKernel
+   * currently flushes at the end of each call.
+   * We may need to optimize this later. */
+  return CL_SUCCESS;
+}
+
+cl_int
+clFinish(cl_command_queue command_queue)
+{
+  cl_int err = CL_SUCCESS;
+
+  CHECK_QUEUE (command_queue);
+  err = cl_command_queue_finish(command_queue);
+
+error:
+  return err;
+}
+
+cl_int
+clEnqueueReadBuffer(cl_command_queue command_queue,
+                    cl_mem           buffer,
+                    cl_bool          blocking_read,
+                    size_t           offset,
+                    size_t           size,
+                    void *           ptr,
+                    cl_uint          num_events_in_wait_list,
+                    const cl_event * event_wait_list,
+                    cl_event *       event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, defer_enqueue_data = { 0 };
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(buffer);
+  if (command_queue->ctx != buffer->ctx) {
+     err = CL_INVALID_CONTEXT;
+     goto error;
+  }
+
+  if (!ptr || !size || offset + size > buffer->size) {
+     err = CL_INVALID_VALUE;
+     goto error;
+  }
+
+  if (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+     err = CL_INVALID_OPERATION;
+     goto error;
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+  data = &defer_enqueue_data;
+  data->type    = EnqueueReadBuffer;
+  data->mem_obj = buffer;
+  data->ptr     = ptr;
+  data->offset  = offset;
+  data->size    = size;
+
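+  /* handle_events() either defers the command until its wait list completes or
+     returns CL_ENQUEUE_EXECUTE_IMM, in which case we execute it right away. */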
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(event ? *event : NULL, data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
+  }
+
+error:
+  return err;
+}
+
+cl_int
+clEnqueueReadBufferRect(cl_command_queue command_queue,
+                        cl_mem           buffer,
+                        cl_bool          blocking_read,
+                        const size_t *   buffer_origin,
+                        const size_t *   host_origin,
+                        const size_t *   region,
+                        size_t           buffer_row_pitch,
+                        size_t           buffer_slice_pitch,
+                        size_t           host_row_pitch,
+                        size_t           host_slice_pitch,
+                        void *           ptr,
+                        cl_uint          num_events_in_wait_list,
+                        const cl_event * event_wait_list,
+                        cl_event *       event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(buffer);
+
+  if (command_queue->ctx != buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+     err = CL_INVALID_OPERATION;
+     goto error;
+  }
+
+  if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
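+  /* Zero pitches mean tightly packed: derive them from the region. */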
+  if(buffer_row_pitch == 0)
+    buffer_row_pitch = region[0];
+  if(buffer_slice_pitch == 0)
+    buffer_slice_pitch = region[1] * buffer_row_pitch;
+
+  if(host_row_pitch == 0)
+    host_row_pitch = region[0];
+  if(host_slice_pitch == 0)
+    host_slice_pitch = region[1] * host_row_pitch;
+
+  if (buffer_row_pitch < region[0] ||
+      host_row_pitch < region[0]) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0 ) ||
+      (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0 )) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if ((buffer_origin[2] + region[2] - 1) * buffer_slice_pitch
+         + (buffer_origin[1] + region[1] - 1) * buffer_row_pitch
+         + buffer_origin[0] + region[0] > buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+  data = &no_wait_data;
+  data->type        = EnqueueReadBufferRect;
+  data->mem_obj     = buffer;
+  data->ptr         = ptr;
+  data->origin[0]   = buffer_origin[0]; data->origin[1] = buffer_origin[1]; data->origin[2] = buffer_origin[2];
+  data->host_origin[0]  = host_origin[0]; data->host_origin[1] = host_origin[1]; data->host_origin[2] = host_origin[2];
+  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
+  data->row_pitch   = buffer_row_pitch;
+  data->slice_pitch = buffer_slice_pitch;
+  data->host_row_pitch   = host_row_pitch;
+  data->host_slice_pitch = host_slice_pitch;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_READ_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(event ? *event : NULL, data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
+  }
+
+ error:
+  return err;
+}
+
+cl_int
+clEnqueueWriteBuffer(cl_command_queue    command_queue,
+                     cl_mem              buffer,
+                     cl_bool             blocking_write,
+                     size_t              offset,
+                     size_t              size,
+                     const void *        ptr,
+                     cl_uint             num_events_in_wait_list,
+                     const cl_event *    event_wait_list,
+                     cl_event *          event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(buffer);
+  if (command_queue->ctx != buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (!ptr || !size || offset + size > buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+    err = CL_INVALID_OPERATION;
+    goto error;
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+  data = &no_wait_data;
+  data->type      = EnqueueWriteBuffer;
+  data->mem_obj   = buffer;
+  data->const_ptr = ptr;
+  data->offset    = offset;
+  data->size      = size;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_WRITE_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(event ? *event : NULL, data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
+  }
+
+ error:
+  return err;
+}
+
+cl_int
+clEnqueueWriteBufferRect(cl_command_queue     command_queue,
+                         cl_mem               buffer,
+                         cl_bool              blocking_write,
+                         const size_t *       buffer_origin,
+                         const size_t *       host_origin,
+                         const size_t *       region,
+                         size_t               buffer_row_pitch,
+                         size_t               buffer_slice_pitch,
+                         size_t               host_row_pitch,
+                         size_t               host_slice_pitch,
+                         const void *         ptr,
+                         cl_uint              num_events_in_wait_list,
+                         const cl_event *     event_wait_list,
+                         cl_event *           event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(buffer);
+
+  if (command_queue->ctx != buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+    err = CL_INVALID_OPERATION;
+    goto error;
+  }
+
+  if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if(buffer_row_pitch == 0)
+    buffer_row_pitch = region[0];
+  if(buffer_slice_pitch == 0)
+    buffer_slice_pitch = region[1] * buffer_row_pitch;
+
+  if(host_row_pitch == 0)
+    host_row_pitch = region[0];
+  if(host_slice_pitch == 0)
+    host_slice_pitch = region[1] * host_row_pitch;
+
+  if (buffer_row_pitch < region[0] ||
+      host_row_pitch < region[0]) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0 ) ||
+      (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0 )) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if ((buffer_origin[2] + region[2] - 1) * buffer_slice_pitch
+         + (buffer_origin[1] + region[1] - 1) * buffer_row_pitch
+         + buffer_origin[0] + region[0] > buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+  data = &no_wait_data;
+  data->type        = EnqueueWriteBufferRect;
+  data->mem_obj     = buffer;
+  data->const_ptr   = ptr;
+  data->origin[0]   = buffer_origin[0]; data->origin[1] = buffer_origin[1]; data->origin[2] = buffer_origin[2];
+  data->host_origin[0]  = host_origin[0]; data->host_origin[1] = host_origin[1]; data->host_origin[2] = host_origin[2];
+  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
+  data->row_pitch   = buffer_row_pitch;
+  data->slice_pitch = buffer_slice_pitch;
+  data->host_row_pitch   = host_row_pitch;
+  data->host_slice_pitch = host_slice_pitch;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_WRITE_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(event ? *event : NULL, data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
+  }
+
+error:
+  return err;
+}
+
+cl_int
+clEnqueueFillImage(cl_command_queue   command_queue,
+                   cl_mem             image,
+                   const void *       fill_color,
+                   const size_t *     porigin,
+                   const size_t *     pregion,
+                   cl_uint            num_events_in_wait_list,
+                   const cl_event *   event_wait_list,
+                   cl_event *         event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_IMAGE(image, src_image);
+  FIXUP_IMAGE_REGION(src_image, pregion, region);
+  FIXUP_IMAGE_ORIGIN(src_image, porigin, origin);
+
+  if (command_queue->ctx != image->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (fill_color == NULL) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (!origin || !region || origin[0] + region[0] > src_image->w || origin[1] + region[1] > src_image->h || origin[2] + region[2] > src_image->depth) {
+     err = CL_INVALID_VALUE;
+     goto error;
+  }
+
+  if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (origin[2] != 0 || region[2] != 1)){
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D && (origin[2] != 0 ||origin[1] != 0 || region[2] != 1 || region[1] != 1)){
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  err = cl_image_fill(command_queue, fill_color, src_image, origin, region);
+  if (err) {
+    goto error;
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx);
+
+  data = &no_wait_data;
+  data->type = EnqueueFillImage;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+        && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
+    err = cl_command_queue_flush(command_queue);
+  }
+
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_fill_image", "", command_queue);
+
+  return err;
+
+ error:
+  return err;
+}
+
+cl_int
+clEnqueueFillBuffer(cl_command_queue   command_queue,
+                    cl_mem             buffer,
+                    const void *       pattern,
+                    size_t             pattern_size,
+                    size_t             offset,
+                    size_t             size,
+                    cl_uint            num_events_in_wait_list,
+                    const cl_event *   event_wait_list,
+                    cl_event *         event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+  static size_t valid_sz[] = {1, 2, 4, 8, 16, 32, 64, 128};
+  int i = 0;
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(buffer);
+
+  if (command_queue->ctx != buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (offset < 0 || offset + size > buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (pattern == NULL) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
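+  /* pattern_size must be one of the sizes in valid_sz, and both offset and
+     size must be multiples of it. */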
+  for (i = 0; i < sizeof(valid_sz) / sizeof(size_t); i++) {
+    if (valid_sz[i] == pattern_size)
+      break;
+  }
+  if (i == sizeof(valid_sz) / sizeof(size_t)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (offset % pattern_size || size % pattern_size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  err = cl_mem_fill(command_queue, pattern, pattern_size, buffer, offset, size);
+  if (err) {
+    goto error;
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+  data = &no_wait_data;
+  data->type = EnqueueFillBuffer;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+        && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
+    err = cl_command_queue_flush(command_queue);
+  }
+
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_fill_buffer", "", command_queue);
+
+  return err;
+
+ error:
+  return err;
+}
+
+cl_int
+clEnqueueCopyBuffer(cl_command_queue     command_queue,
+                    cl_mem               src_buffer,
+                    cl_mem               dst_buffer,
+                    size_t               src_offset,
+                    size_t               dst_offset,
+                    size_t               cb,
+                    cl_uint              num_events_in_wait_list,
+                    const cl_event *     event_wait_list,
+                    cl_event *           event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(src_buffer);
+  CHECK_MEM(dst_buffer);
+
+  if (command_queue->ctx != src_buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (command_queue->ctx != dst_buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (src_offset < 0 || src_offset + cb > src_buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  if (dst_offset < 0 || dst_offset + cb > dst_buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  /* Check overlap */
+  if (src_buffer == dst_buffer
+         && ((src_offset <= dst_offset && dst_offset <= src_offset + cb - 1)
+          || (dst_offset <= src_offset && src_offset <= dst_offset + cb - 1))) {
+    err = CL_MEM_COPY_OVERLAP;
+    goto error;
+  }
+
+  /* Check sub overlap */
+  if (src_buffer->type == CL_MEM_SUBBUFFER_TYPE && dst_buffer->type == CL_MEM_SUBBUFFER_TYPE ) {
+    struct _cl_mem_buffer* src_b = (struct _cl_mem_buffer*)src_buffer;
+    struct _cl_mem_buffer* dst_b = (struct _cl_mem_buffer*)dst_buffer;
+    size_t src_sub_offset = src_b->sub_offset;
+    size_t dst_sub_offset = dst_b->sub_offset;
+
+    if ((src_offset + src_sub_offset <= dst_offset + dst_sub_offset
+          && dst_offset + dst_sub_offset <= src_offset + src_sub_offset + cb - 1)
+     || (dst_offset + dst_sub_offset <= src_offset + src_sub_offset
+          && src_offset + src_sub_offset <= dst_offset + dst_sub_offset + cb - 1)) {
+      err = CL_MEM_COPY_OVERLAP;
+      goto error;
+    }
+  }
+
+  err = cl_mem_copy(command_queue, src_buffer, dst_buffer, src_offset, dst_offset, cb);
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx);
+
+  data = &no_wait_data;
+  data->type = EnqueueCopyBuffer;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_COPY_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
+    err = cl_command_queue_flush(command_queue);
+  }
+
+  if(b_output_kernel_perf)
+	  time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", "", command_queue);
+
+  return err;
+
+error:
+  return err;
+}
+
+cl_int
+clEnqueueCopyBufferRect(cl_command_queue     command_queue,
+                        cl_mem               src_buffer,
+                        cl_mem               dst_buffer,
+                        const size_t *       src_origin,
+                        const size_t *       dst_origin,
+                        const size_t *       region,
+                        size_t               src_row_pitch,
+                        size_t               src_slice_pitch,
+                        size_t               dst_row_pitch,
+                        size_t               dst_slice_pitch,
+                        cl_uint              num_events_in_wait_list,
+                        const cl_event *     event_wait_list,
+                        cl_event *           event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(src_buffer);
+  CHECK_MEM(dst_buffer);
+
+  if ((command_queue->ctx != src_buffer->ctx) ||
+      (command_queue->ctx != dst_buffer->ctx)) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (!region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if(src_row_pitch == 0)
+    src_row_pitch = region[0];
+  if(src_slice_pitch == 0)
+    src_slice_pitch = region[1] * src_row_pitch;
+
+  if(dst_row_pitch == 0)
+    dst_row_pitch = region[0];
+  if(dst_slice_pitch == 0)
+    dst_slice_pitch = region[1] * dst_row_pitch;
+
+  if (src_row_pitch < region[0] ||
+      dst_row_pitch < region[0]) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if ((src_slice_pitch < region[1] * src_row_pitch || src_slice_pitch % src_row_pitch != 0 ) ||
+      (dst_slice_pitch < region[1] * dst_row_pitch || dst_slice_pitch % dst_row_pitch != 0 )) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if ((src_origin[2] + region[2] - 1) * src_slice_pitch
+        + (src_origin[1] + region[1] - 1) * src_row_pitch
+        + src_origin[0] + region[0] > src_buffer->size
+      ||(dst_origin[2] + region[2] - 1) * dst_slice_pitch
+          + (dst_origin[1] + region[1] - 1) * dst_row_pitch
+          + dst_origin[0] + region[0] > dst_buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (src_buffer == dst_buffer && (src_row_pitch != dst_row_pitch || src_slice_pitch != dst_slice_pitch)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (src_buffer == dst_buffer &&
+      check_copy_overlap(src_origin, dst_origin, region, src_row_pitch, src_slice_pitch)) {
+    err = CL_MEM_COPY_OVERLAP;
+    goto error;
+  }
+
+  cl_mem_copy_buffer_rect(command_queue, src_buffer, dst_buffer, src_origin, dst_origin, region,
+                          src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch);
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx);
+
+  data = &no_wait_data;
+  data->type = EnqueueCopyBufferRect;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_COPY_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
+    err = cl_command_queue_flush(command_queue);
+  }
+
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", "", command_queue);
+
+error:
+  return err;
+}
+
+cl_int
+clEnqueueReadImage(cl_command_queue      command_queue,
+                   cl_mem                mem,
+                   cl_bool               blocking_read,
+                   const size_t *        porigin,
+                   const size_t *        pregion,
+                   size_t                row_pitch,
+                   size_t                slice_pitch,
+                   void *                ptr,
+                   cl_uint               num_events_in_wait_list,
+                   const cl_event *      event_wait_list,
+                   cl_event *            event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_IMAGE(mem, image);
+  FIXUP_IMAGE_REGION(image, pregion, region);
+  FIXUP_IMAGE_ORIGIN(image, porigin, origin);
+  if (command_queue->ctx != mem->ctx) {
+     err = CL_INVALID_CONTEXT;
+     goto error;
+  }
+
+  if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
+     err = CL_INVALID_VALUE;
+     goto error;
+  }
+
+  if (!row_pitch)
+    row_pitch = image->bpp*region[0];
+  else if (row_pitch < image->bpp*region[0]) {
+     err = CL_INVALID_VALUE;
+     goto error;
+  }
+
+  if (image->slice_pitch) {
+    if (!slice_pitch)
+      slice_pitch = row_pitch*region[1];
+    else if (slice_pitch < row_pitch*region[1]) {
+      err = CL_INVALID_VALUE;
+      goto error;
+    }
+  }
+  else if (slice_pitch) {
+     err = CL_INVALID_VALUE;
+     goto error;
+  }
+
+  if (!ptr) {
+     err = CL_INVALID_VALUE;
+     goto error;
+  }
+
+  if (mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+     err = CL_INVALID_OPERATION;
+     goto error;
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
+
+  data = &no_wait_data;
+  data->type        = EnqueueReadImage;
+  data->mem_obj     = mem;
+  data->ptr         = ptr;
+  data->origin[0]   = origin[0];  data->origin[1] = origin[1];  data->origin[2] = origin[2];
+  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
+  data->row_pitch   = row_pitch;
+  data->slice_pitch = slice_pitch;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_READ_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(event ? *event : NULL, data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
+  }
+
+error:
+  return err;
+}
+
+cl_int
+clEnqueueWriteImage(cl_command_queue     command_queue,
+                    cl_mem               mem,
+                    cl_bool              blocking_write,
+                    const size_t *       porigin,
+                    const size_t *       pregion,
+                    size_t               row_pitch,
+                    size_t               slice_pitch,
+                    const void *         ptr,
+                    cl_uint              num_events_in_wait_list,
+                    const cl_event *     event_wait_list,
+                    cl_event *           event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_IMAGE(mem, image);
+  FIXUP_IMAGE_REGION(image, pregion, region);
+  FIXUP_IMAGE_ORIGIN(image, porigin, origin);
+  if (command_queue->ctx != mem->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (!row_pitch)
+    row_pitch = image->bpp*region[0];
+  else if (row_pitch < image->bpp*region[0]) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (image->slice_pitch) {
+    if (!slice_pitch)
+      slice_pitch = row_pitch*region[1];
+    else if (slice_pitch < row_pitch*region[1]) {
+      err = CL_INVALID_VALUE;
+      goto error;
+    }
+  }
+  else if (slice_pitch) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (!ptr) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
+    err = CL_INVALID_OPERATION;
+    goto error;
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
+
+  data = &no_wait_data;
+  data->type        = EnqueueWriteImage;
+  data->mem_obj     = mem;
+  data->const_ptr   = ptr;
+  data->origin[0]   = origin[0];  data->origin[1] = origin[1];  data->origin[2] = origin[2];
+  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
+  data->row_pitch   = row_pitch;
+  data->slice_pitch = slice_pitch;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_WRITE_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(event ? *event : NULL, data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
+  }
+
+error:
+  return err;
+}
+
+cl_int
+clEnqueueCopyImage(cl_command_queue      command_queue,
+                   cl_mem                src_mem,
+                   cl_mem                dst_mem,
+                   const size_t *        psrc_origin,
+                   const size_t *        pdst_origin,
+                   const size_t *        pregion,
+                   cl_uint               num_events_in_wait_list,
+                   const cl_event *      event_wait_list,
+                   cl_event *            event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+  cl_bool overlap = CL_TRUE;
+  cl_int i = 0;
+
+  CHECK_QUEUE(command_queue);
+  CHECK_IMAGE(src_mem, src_image);
+  CHECK_IMAGE(dst_mem, dst_image);
+  FIXUP_IMAGE_REGION(src_image, pregion, region);
+  FIXUP_IMAGE_ORIGIN(src_image, psrc_origin, src_origin);
+  FIXUP_IMAGE_ORIGIN(dst_image, pdst_origin, dst_origin);
+  if (command_queue->ctx != src_mem->ctx ||
+      command_queue->ctx != dst_mem->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (src_image->fmt.image_channel_order != dst_image->fmt.image_channel_order ||
+      src_image->fmt.image_channel_data_type != dst_image->fmt.image_channel_data_type) {
+    err = CL_IMAGE_FORMAT_MISMATCH;
+    goto error;
+  }
+
+  if (!src_origin || !region || src_origin[0] + region[0] > src_image->w ||
+      src_origin[1] + region[1] > src_image->h || src_origin[2] + region[2] > src_image->depth) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (!dst_origin || !region || dst_origin[0] + region[0] > dst_image->w ||
+      dst_origin[1] + region[1] > dst_image->h || dst_origin[2] + region[2] > dst_image->depth) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if ((src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) ||
+      (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1))) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (src_image == dst_image) {
+    for(i = 0; i < 3; i++)
+      overlap = overlap && (src_origin[i] < dst_origin[i] + region[i])
+                        && (dst_origin[i] < src_origin[i] + region[i]);
+    if(overlap == CL_TRUE) {
+      err = CL_MEM_COPY_OVERLAP;
+      goto error;
+    }
+  }
+
+  cl_mem_kernel_copy_image(command_queue, src_image, dst_image, src_origin, dst_origin, region);
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_mem->ctx);
+
+  data = &no_wait_data;
+  data->type = EnqueueCopyImage;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_COPY_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
+    err = cl_command_queue_flush(command_queue);
+  }
+
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", "", command_queue);
+
+error:
+  return err;
+}
+
+cl_int
+clEnqueueCopyImageToBuffer(cl_command_queue  command_queue,
+                           cl_mem            src_mem,
+                           cl_mem            dst_buffer,
+                           const size_t *    psrc_origin,
+                           const size_t *    pregion,
+                           size_t            dst_offset,
+                           cl_uint           num_events_in_wait_list,
+                           const cl_event *  event_wait_list,
+                           cl_event *        event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_IMAGE(src_mem, src_image);
+  CHECK_MEM(dst_buffer);
+  FIXUP_IMAGE_REGION(src_image, pregion, region);
+  FIXUP_IMAGE_ORIGIN(src_image, psrc_origin, src_origin);
+  if (command_queue->ctx != src_mem->ctx ||
+      command_queue->ctx != dst_buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (dst_offset + region[0]*region[1]*region[2]*src_image->bpp > dst_buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (!src_origin || !region || src_origin[0] + region[0] > src_image->w ||
+      src_origin[1] + region[1] > src_image->h || src_origin[2] + region[2] > src_image->depth) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  cl_mem_copy_image_to_buffer(command_queue, src_image, dst_buffer, src_origin, dst_offset, region);
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_mem->ctx);
+
+  data = &no_wait_data;
+  data->type = EnqueueCopyImageToBuffer;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_COPY_IMAGE_TO_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
+    err = cl_command_queue_flush(command_queue);
+  }
+
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", "", command_queue);
+
+error:
+  return err;
+}
+
+cl_int
+clEnqueueCopyBufferToImage(cl_command_queue  command_queue,
+                           cl_mem            src_buffer,
+                           cl_mem            dst_mem,
+                           size_t            src_offset,
+                           const size_t *    pdst_origin,
+                           const size_t *    pregion,
+                           cl_uint           num_events_in_wait_list,
+                           const cl_event *  event_wait_list,
+                           cl_event *        event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(src_buffer);
+  CHECK_IMAGE(dst_mem, dst_image);
+  FIXUP_IMAGE_REGION(dst_image, pregion, region);
+  FIXUP_IMAGE_ORIGIN(dst_image, pdst_origin, dst_origin);
+  if (command_queue->ctx != src_buffer->ctx ||
+      command_queue->ctx != dst_mem->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (src_offset + region[0]*region[1]*region[2]*dst_image->bpp > src_buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (!dst_origin || !region || dst_origin[0] + region[0] > dst_image->w ||
+      dst_origin[1] + region[1] > dst_image->h || dst_origin[2] + region[2] > dst_image->depth) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  cl_mem_copy_buffer_to_image(command_queue, src_buffer, dst_image, src_offset, dst_origin, region);
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, dst_mem->ctx);
+
+  data = &no_wait_data;
+  data->type = EnqueueCopyBufferToImage;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_COPY_BUFFER_TO_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
+    err = cl_command_queue_flush(command_queue);
+  }
+
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", "", command_queue);
+
+error:
+  return err;
+}
+
+static cl_int _cl_map_mem(cl_mem mem, void *ptr, void **mem_ptr,
+                          size_t offset, size_t size,
+                          const size_t *origin, const size_t *region)
+{
+  cl_int slot = -1;
+  int err = CL_SUCCESS;
+  size_t sub_offset = 0;
+
+  if(mem->type == CL_MEM_SUBBUFFER_TYPE) {
+    struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+    sub_offset = buffer->sub_offset;
+  }
+
+  ptr = (char*)ptr + offset + sub_offset;
+  if(mem->flags & CL_MEM_USE_HOST_PTR) {
+    assert(mem->host_ptr);
+    //only compute the pointer here; the memcpy is done at enqueue time
+    *mem_ptr = (char *)mem->host_ptr + offset + sub_offset;
+  } else {
+    *mem_ptr = ptr;
+  }
+  /* Record the mapped address. */
+  if (!mem->mapped_ptr_sz) {
+    mem->mapped_ptr_sz = 16;
+    mem->mapped_ptr = (cl_mapped_ptr *)malloc(
+          sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz);
+    if (!mem->mapped_ptr) {
+      cl_mem_unmap_auto(mem);
+      err = CL_OUT_OF_HOST_MEMORY;
+      goto error;
+    }
+    memset(mem->mapped_ptr, 0, mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+    slot = 0;
+  } else {
+    int i = 0;
+    for (; i < mem->mapped_ptr_sz; i++) {
+      if (mem->mapped_ptr[i].ptr == NULL) {
+        slot = i;
+        break;
+      }
+    }
+    if (i == mem->mapped_ptr_sz) {
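+      /* Every slot is in use: double the tracking array and take the first new slot. */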
+      cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
+          sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz * 2);
+      if (!new_ptr) {
+        cl_mem_unmap_auto(mem);
+        err = CL_OUT_OF_HOST_MEMORY;
+        goto error;
+      }
+      memset(new_ptr, 0, 2 * mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+      memcpy(new_ptr, mem->mapped_ptr,
+             mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
+      slot = mem->mapped_ptr_sz;
+      mem->mapped_ptr_sz *= 2;
+      free(mem->mapped_ptr);
+      mem->mapped_ptr = new_ptr;
+    }
+  }
+  assert(slot != -1);
+  mem->mapped_ptr[slot].ptr = *mem_ptr;
+  mem->mapped_ptr[slot].v_ptr = ptr;
+  mem->mapped_ptr[slot].size = size;
+  if(origin) {
+    assert(region);
+    mem->mapped_ptr[slot].origin[0] = origin[0];
+    mem->mapped_ptr[slot].origin[1] = origin[1];
+    mem->mapped_ptr[slot].origin[2] = origin[2];
+    mem->mapped_ptr[slot].region[0] = region[0];
+    mem->mapped_ptr[slot].region[1] = region[1];
+    mem->mapped_ptr[slot].region[2] = region[2];
+  }
+  mem->map_ref++;
+error:
+  if (err != CL_SUCCESS)
+    *mem_ptr = NULL;
+  return err;
+}
+
+void *
+clEnqueueMapBuffer(cl_command_queue  command_queue,
+                   cl_mem            buffer,
+                   cl_bool           blocking_map,
+                   cl_map_flags      map_flags,
+                   size_t            offset,
+                   size_t            size,
+                   cl_uint           num_events_in_wait_list,
+                   const cl_event *  event_wait_list,
+                   cl_event *        event,
+                   cl_int *          errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+  void *ptr = NULL;
+  void *mem_ptr = NULL;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(buffer);
+  if (command_queue->ctx != buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (!size || offset + size > buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if ((map_flags & CL_MAP_READ &&
+       buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
+      (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) &&
+       buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)))
+  {
+    err = CL_INVALID_OPERATION;
+    goto error;
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+  data = &no_wait_data;
+  data->type        = EnqueueMapBuffer;
+  data->mem_obj     = buffer;
+  data->offset      = offset;
+  data->size        = size;
+  data->ptr         = ptr;
+  data->unsync_map  = 1;
+
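+  /* When the command can run immediately we map synchronously; otherwise the
+     buffer is mapped through the GTT without synchronization and the map is
+     finished when the deferred command runs. */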
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_MAP_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    data->unsync_map = 0;
+    err = cl_enqueue_handle(event ? *event : NULL, data);
+    if (err != CL_SUCCESS)
+      goto error;
+    ptr = data->ptr;
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
+  } else {
+    if ((ptr = cl_mem_map_gtt_unsync(buffer)) == NULL) {
+      err = CL_MAP_FAILURE;
+      goto error;
+    }
+  }
+  err = _cl_map_mem(buffer, ptr, &mem_ptr, offset, size, NULL, NULL);
+  if (err != CL_SUCCESS)
+    goto error;
+
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem_ptr;
+}
+
+void *
+clEnqueueMapImage(cl_command_queue   command_queue,
+                  cl_mem             mem,
+                  cl_bool            blocking_map,
+                  cl_map_flags       map_flags,
+                  const size_t *     porigin,
+                  const size_t *     pregion,
+                  size_t *           image_row_pitch,
+                  size_t *           image_slice_pitch,
+                  cl_uint            num_events_in_wait_list,
+                  const cl_event *   event_wait_list,
+                  cl_event *         event,
+                  cl_int *           errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+  void *ptr  = NULL;
+  void *mem_ptr = NULL;
+  size_t offset = 0;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_IMAGE(mem, image);
+  FIXUP_IMAGE_REGION(image, pregion, region);
+  FIXUP_IMAGE_ORIGIN(image, porigin, origin);
+  if (command_queue->ctx != mem->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (!image_row_pitch || (image->slice_pitch && !image_slice_pitch)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if ((map_flags & CL_MAP_READ &&
+       mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
+      (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) &&
+       mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)))
+  {
+    err = CL_INVALID_OPERATION;
+    goto error;
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
+
+  data = &no_wait_data;
+  data->type        = EnqueueMapImage;
+  data->mem_obj     = mem;
+  data->origin[0]   = origin[0];  data->origin[1] = origin[1];  data->origin[2] = origin[2];
+  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
+  data->ptr         = ptr;
+  data->unsync_map  = 1;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_MAP_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
+    data->unsync_map = 0;
+    err = cl_enqueue_handle(event ? *event : NULL, data);
+    if (err != CL_SUCCESS)
+      goto error;
+    ptr = data->ptr;
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
+  } else {
+    if ((ptr = cl_mem_map_gtt_unsync(mem)) == NULL) {
+      err = CL_MAP_FAILURE;
+      goto error;
+    }
+  }
+
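+  /* Report the row/slice pitch of the backing store (host pointer or device
+     image) and turn the mapped origin into a byte offset into it. */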
+  if(mem->flags & CL_MEM_USE_HOST_PTR) {
+    if (image_slice_pitch)
+      *image_slice_pitch = image->host_slice_pitch;
+    *image_row_pitch = image->host_row_pitch;
+
+    offset = image->bpp*origin[0] + image->host_row_pitch*origin[1] + image->host_slice_pitch*origin[2];
+  } else {
+    if (image_slice_pitch)
+      *image_slice_pitch = image->slice_pitch;
+    if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+      *image_row_pitch = image->slice_pitch;
+    else
+      *image_row_pitch = image->row_pitch;
+
+    offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+  }
+  err = _cl_map_mem(mem, ptr, &mem_ptr, offset, 0, origin, region);
+
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem_ptr; //TODO: map and unmap first
+}
+
+cl_int
+clEnqueueUnmapMemObject(cl_command_queue  command_queue,
+                        cl_mem            memobj,
+                        void *            mapped_ptr,
+                        cl_uint           num_events_in_wait_list,
+                        const cl_event *  event_wait_list,
+                        cl_event *        event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(memobj);
+  if (command_queue->ctx != memobj->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, memobj->ctx);
+
+  data = &no_wait_data;
+  data->type        = EnqueueUnmapMemObject;
+  data->mem_obj     = memobj;
+  data->ptr         = mapped_ptr;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_UNMAP_MEM_OBJECT) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(event ? *event : NULL, data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
+  }
+
+error:
+  return err;
+}
+
+cl_int
+clEnqueueMigrateMemObjects(cl_command_queue        command_queue,
+                           cl_uint                 num_mem_objects,
+                           const cl_mem *          mem_objects,
+                           cl_mem_migration_flags  flags,
+                           cl_uint                 num_events_in_wait_list,
+                           const cl_event *        event_wait_list,
+                           cl_event *              event)
+{
+  /* So far we support only one device and no sub-devices, so all command queues
+     belong to the same context. There is no need to actually migrate the mem objects yet. */
+  cl_int err = CL_SUCCESS;
+  cl_uint i = 0;
+  enqueue_data *data, defer_enqueue_data = { 0 };
+
+  if (!(flags & CL_MIGRATE_MEM_OBJECT_HOST))
+    CHECK_QUEUE(command_queue);
+
+  if (num_mem_objects == 0 || mem_objects == NULL) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (flags && flags & ~(CL_MIGRATE_MEM_OBJECT_HOST |
+                         CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  for (i = 0; i < num_mem_objects; i++) {
+    CHECK_MEM(mem_objects[i]);
+    if (mem_objects[i]->ctx != command_queue->ctx) {
+      err = CL_INVALID_CONTEXT;
+      goto error;
+    }
+  }
+
+  /* really nothing to do, fill the event. */
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
+  data = &defer_enqueue_data;
+  data->type = EnqueueMigrateMemObj;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(event ? *event : NULL, data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
+  }
+
+error:
+  return err;
+}
+
+cl_int
+clEnqueueNDRangeKernel(cl_command_queue  command_queue,
+                       cl_kernel         kernel,
+                       cl_uint           work_dim,
+                       const size_t *    global_work_offset,
+                       const size_t *    global_work_size,
+                       const size_t *    local_work_size,
+                       cl_uint           num_events_in_wait_list,
+                       const cl_event *  event_wait_list,
+                       cl_event *        event)
+{
+  size_t fixed_global_off[] = {0,0,0};
+  size_t fixed_global_sz[] = {1,1,1};
+  size_t fixed_local_sz[] = {1,1,1};
+  cl_int err = CL_SUCCESS;
+  cl_uint i;
+  enqueue_data *data, no_wait_data = { 0 };
+
+  CHECK_QUEUE(command_queue);
+  CHECK_KERNEL(kernel);
+
+  /* Check number of dimensions we have */
+  if (UNLIKELY(work_dim == 0 || work_dim > 3)) {
+    err = CL_INVALID_WORK_DIMENSION;
+    goto error;
+  }
+
+  /* We need a work size per dimension */
+  if (UNLIKELY(global_work_size == NULL)) {
+    err = CL_INVALID_GLOBAL_WORK_SIZE;
+    goto error;
+  }
+
+  if (global_work_offset != NULL)
+    for (i = 0; i < work_dim; ++i) {
+      /* Check for size_t overflow; a naive offset + size comparison would wrap around */
+      if (UNLIKELY(global_work_offset[i] > (size_t)-1 - global_work_size[i])) {
+        err = CL_INVALID_GLOBAL_OFFSET;
+        goto error;
+      }
+    }
+
+  /* Local sizes must be non-null and divide global sizes */
+  if (local_work_size != NULL)
+    for (i = 0; i < work_dim; ++i)
+      if (UNLIKELY(local_work_size[i] == 0 || global_work_size[i] % local_work_size[i])) {
+        err = CL_INVALID_WORK_GROUP_SIZE;
+        goto error;
+      }
+
+  /* Queue and kernel must share the same context */
+  assert(kernel->program);
+  if (command_queue->ctx != kernel->program->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+
+  /* XXX No event right now */
+  //FATAL_IF(num_events_in_wait_list > 0, "Events are not supported");
+  //FATAL_IF(event_wait_list != NULL, "Events are not supported");
+  //FATAL_IF(event != NULL, "Events are not supported");
+
+  if (local_work_size != NULL) {
+    for (i = 0; i < work_dim; ++i)
+      fixed_local_sz[i] = local_work_size[i];
+  } else {
+    /* No local size given: pick one ourselves. Start each dimension at 64 and cap the
+       whole work group at 256 items (MAX_WORK_GROUP_SIZE may be too large). */
+    uint j, maxDimSize = 64, maxGroupSize = 256;
+    for (i = 0; i < work_dim; i++) {
+      for (j = maxDimSize; j > 1; j--) {
+        if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
+          fixed_local_sz[i] = j;
+          maxGroupSize = maxGroupSize / j;
+          maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
+          break;  /* move on to the next work dimension */
+        }
+      }
+    }
+  }
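+  /* Example of the heuristic above: for global_work_size = {1920, 1080}, dimension 0
+   * picks 64 (1920 % 64 == 0), leaving maxGroupSize = 256 / 64 = 4, so dimension 1
+   * picks 4 (1080 % 4 == 0); the chosen local size is then 64x4 = 256 work items. */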
+
+  if (global_work_size != NULL)
+    for (i = 0; i < work_dim; ++i)
+      fixed_global_sz[i] = global_work_size[i];
+  if (global_work_offset != NULL)
+    for (i = 0; i < work_dim; ++i)
+      fixed_global_off[i] = global_work_offset[i];
+
+  if (kernel->compile_wg_sz[0] || kernel->compile_wg_sz[1] || kernel->compile_wg_sz[2]) {
+    if (fixed_local_sz[0] != kernel->compile_wg_sz[0]
+        || fixed_local_sz[1] != kernel->compile_wg_sz[1]
+        || fixed_local_sz[2] != kernel->compile_wg_sz[2])
+    {
+        err = CL_INVALID_WORK_GROUP_SIZE;
+        goto error;
+    }
+  }
+
+  /* Do device specific checks and enqueue the kernel */
+  err = cl_command_queue_ND_range(command_queue,
+                                  kernel,
+                                  work_dim,
+                                  fixed_global_off,
+                                  fixed_global_sz,
+                                  fixed_local_sz);
+  if(err != CL_SUCCESS)
+    goto error;
+
+  data = &no_wait_data;
+  data->type = EnqueueNDRangeKernel;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_NDRANGE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
+    err = cl_command_queue_flush(command_queue);
+  }
+
+  if(b_output_kernel_perf)
+  {
+    if(kernel->program->build_opts != NULL)
+      time_end(command_queue->ctx, cl_kernel_get_name(kernel), kernel->program->build_opts, command_queue);
+    else
+      time_end(command_queue->ctx, cl_kernel_get_name(kernel), "", command_queue);
+  }
+error:
+  return err;
+}
+
+cl_int
+clEnqueueTask(cl_command_queue   command_queue,
+              cl_kernel          kernel,
+              cl_uint            num_events_in_wait_list,
+              const cl_event *   event_wait_list,
+              cl_event *         event)
+{
+  const size_t global_size[3] = {1, 0, 0};
+  const size_t local_size[3]  = {1, 0, 0};
+
+  return clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_size, local_size,
+                                num_events_in_wait_list, event_wait_list, event);
+}
+
+cl_int
+clEnqueueNativeKernel(cl_command_queue   command_queue,
+                      void (*user_func)(void *),
+                      void *             args,
+                      size_t             cb_args,
+                      cl_uint            num_mem_objects,
+                      const cl_mem *     mem_list,
+                      const void **      args_mem_loc,
+                      cl_uint            num_events_in_wait_list,
+                      const cl_event *   event_wait_list,
+                      cl_event *         event)
+{
+  cl_int err = CL_SUCCESS;
+  void *new_args = NULL;
+  enqueue_data *data, no_wait_data = { 0 };
+  cl_int i;
+
+  if(user_func == NULL ||
+    (args == NULL && cb_args > 0) ||
+    (args == NULL && num_mem_objects ==0) ||
+    (args != NULL && cb_args == 0) ||
+    (num_mem_objects > 0 && (mem_list == NULL || args_mem_loc == NULL)) ||
+    (num_mem_objects == 0 && (mem_list != NULL || args_mem_loc != NULL))) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  /* Per spec, we must copy the argument block */
+  if (cb_args)
+  {
+    new_args = malloc(cb_args);
+    if (!new_args)
+    {
+      err = CL_OUT_OF_HOST_MEMORY;
+      goto error;
+    }
+    memcpy(new_args, args, cb_args);
+
+    for (i=0; i<num_mem_objects; ++i)
+    {
+      CHECK_MEM(mem_list[i]);
+      args_mem_loc[i] = new_args + (args_mem_loc[i] - args);  /* rebase the pointer into the copied args block */
+    }
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
+
+  data = &no_wait_data;
+  data->type        = EnqueueNativeKernel;
+  data->mem_list    = mem_list;
+  data->ptr         = new_args;
+  data->size        = cb_args;
+  data->offset      = (size_t)num_mem_objects;
+  data->const_ptr   = args_mem_loc;
+  data->user_func   = user_func;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_NATIVE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
+    err = cl_enqueue_handle(event ? *event : NULL, data);
+    if(event) cl_event_set_status(*event, CL_COMPLETE);
+  }
+
+error:
+  return err;
+}
+
+cl_int
+clEnqueueMarker(cl_command_queue command_queue,
+    cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_QUEUE(command_queue);
+  if(event == NULL) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  cl_event_marker_with_wait_list(command_queue, 0, NULL, event);
+error:
+  return err;
+}
+
+cl_int
+clEnqueueMarkerWithWaitList(cl_command_queue command_queue,
+    cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_QUEUE(command_queue);
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
+
+  cl_event_marker_with_wait_list(command_queue, num_events_in_wait_list, event_wait_list, event);
+error:
+  return err;
+}
+
+cl_int
+clEnqueueWaitForEvents(cl_command_queue  command_queue,
+                       cl_uint           num_events,
+                       const cl_event *  event_list)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_QUEUE(command_queue);
+  err = clWaitForEvents(num_events, event_list);
+
+error:
+  return err;
+}
+
+cl_int
+clEnqueueBarrier(cl_command_queue  command_queue)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_QUEUE(command_queue);
+
+  cl_event_barrier_with_wait_list(command_queue, 0, NULL, NULL);
+
+error:
+  return err;
+}
+
+cl_int
+clEnqueueBarrierWithWaitList(cl_command_queue command_queue,
+    cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_QUEUE(command_queue);
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
+
+  cl_event_barrier_with_wait_list(command_queue, num_events_in_wait_list, event_wait_list, event);
+error:
+  return err;
+}
+
+#define EXTFUNC(x)                      \
+  if (strcmp(#x, func_name) == 0)       \
+    return (void *)x;
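+/* For instance, EXTFUNC(clMapBufferIntel) expands to
+ *   if (strcmp("clMapBufferIntel", func_name) == 0) return (void *)clMapBufferIntel;
+ * so the lookup below is just a chain of name comparisons. */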
+
+static void*
+internal_clGetExtensionFunctionAddress(const char *func_name)
+{
+  if (func_name == NULL)
+    return NULL;
+#ifdef HAS_OCLIcd
+  /* cl_khr_icd */
+  EXTFUNC(clIcdGetPlatformIDsKHR)
+#endif
+  EXTFUNC(clCreateProgramWithLLVMIntel)
+  EXTFUNC(clGetGenVersionIntel)
+  EXTFUNC(clMapBufferIntel)
+  EXTFUNC(clUnmapBufferIntel)
+  EXTFUNC(clMapBufferGTTIntel)
+  EXTFUNC(clUnmapBufferGTTIntel)
+  EXTFUNC(clPinBufferIntel)
+  EXTFUNC(clUnpinBufferIntel)
+  EXTFUNC(clReportUnfreedIntel)
+  EXTFUNC(clCreateBufferFromLibvaIntel)
+  EXTFUNC(clCreateImageFromLibvaIntel)
+  EXTFUNC(clGetMemObjectFdIntel)
+  return NULL;
+}
+
+void*
+clGetExtensionFunctionAddress(const char *func_name)
+{
+  return internal_clGetExtensionFunctionAddress(func_name);
+}
+
+void*
+clGetExtensionFunctionAddressForPlatform(cl_platform_id platform,
+                              const char *func_name)
+{
+  if (UNLIKELY(platform != NULL && platform != intel_platform))
+    return NULL;
+  return internal_clGetExtensionFunctionAddress(func_name);
+}
+
+#undef EXTFUNC
+
+cl_int
+clReportUnfreedIntel(void)
+{
+  return cl_report_unfreed();
+}
+
+void*
+clMapBufferIntel(cl_mem mem, cl_int *errcode_ret)
+{
+  void *ptr = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_MEM (mem);
+  ptr = cl_mem_map(mem);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return ptr;
+}
+
+cl_int
+clUnmapBufferIntel(cl_mem mem)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_MEM (mem);
+  err = cl_mem_unmap(mem);
+error:
+  return err;
+}
+
+void*
+clMapBufferGTTIntel(cl_mem mem, cl_int *errcode_ret)
+{
+  void *ptr = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_MEM (mem);
+  ptr = cl_mem_map_gtt(mem);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return ptr;
+}
+
+cl_int
+clUnmapBufferGTTIntel(cl_mem mem)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_MEM (mem);
+  err = cl_mem_unmap_gtt(mem);
+error:
+  return err;
+}
+
+cl_int
+clPinBufferIntel(cl_mem mem)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_MEM (mem);
+  cl_mem_pin(mem);
+error:
+  return err;
+}
+
+cl_int
+clUnpinBufferIntel(cl_mem mem)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_MEM (mem);
+  cl_mem_unpin(mem);
+error:
+  return err;
+}
+
+cl_int
+clGetGenVersionIntel(cl_device_id device, cl_int *ver)
+{
+  return cl_device_get_version(device, ver);
+}
+
+cl_program
+clCreateProgramWithLLVMIntel(cl_context              context,
+                             cl_uint                 num_devices,
+                             const cl_device_id *    devices,
+                             const char *            filename,
+                             cl_int *                errcode_ret)
+{
+  return cl_program_create_from_llvm(context,
+                                     num_devices,
+                                     devices,
+                                     filename,
+                                     errcode_ret);
+}
+
+cl_mem
+clCreateBufferFromLibvaIntel(cl_context  context,
+                             unsigned int bo_name,
+                             cl_int *errorcode_ret)
+{
+  cl_mem mem = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+
+  mem = cl_mem_new_libva_buffer(context, bo_name, &err);
+
+error:
+  if (errorcode_ret)
+    *errorcode_ret = err;
+  return mem;
+}
+
+cl_mem
+clCreateImageFromLibvaIntel(cl_context context,
+                            const cl_libva_image *info,
+                            cl_int *errorcode_ret)
+{
+  cl_mem mem = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+
+  if (!info) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  mem = cl_mem_new_libva_image(context,
+                               info->bo_name, info->offset, info->width, info->height,
+                               info->fmt, info->row_pitch,
+                               &err);
+
+error:
+  if (errorcode_ret)
+    *errorcode_ret = err;
+  return mem;
+}
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetMemObjectFdIntel(cl_context context,
+                      cl_mem memobj,
+                      int* fd)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+  CHECK_MEM (memobj);
+
+  err = cl_mem_get_fd(memobj, fd);
+
+error:
+  return err;
+}
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
new file mode 100644
index 0000000..0be37a7
--- /dev/null
+++ b/src/cl_command_queue.c
@@ -0,0 +1,622 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "program.h" // for BTI_MAX_IMAGE_NUM
+#include "cl_command_queue.h"
+#include "cl_context.h"
+#include "cl_program.h"
+#include "cl_kernel.h"
+#include "cl_device_id.h"
+#include "cl_mem.h"
+#include "cl_utils.h"
+#include "cl_thread.h"
+#include "cl_alloc.h"
+#include "cl_driver.h"
+#include "cl_khr_icd.h"
+#include "cl_event.h"
+#include "performance.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+LOCAL cl_command_queue
+cl_command_queue_new(cl_context ctx)
+{
+  cl_command_queue queue = NULL;
+
+  assert(ctx);
+  TRY_ALLOC_NO_ERR (queue, CALLOC(struct _cl_command_queue));
+  SET_ICD(queue->dispatch)
+  queue->magic = CL_MAGIC_QUEUE_HEADER;
+  queue->ref_n = 1;
+  queue->ctx = ctx;
+  if ((queue->thread_data = cl_thread_data_create()) == NULL) {
+    goto error;
+  }
+
+  /* Append the command queue in the list */
+  pthread_mutex_lock(&ctx->queue_lock);
+    queue->next = ctx->queues;
+    if (ctx->queues != NULL)
+      ctx->queues->prev = queue;
+    ctx->queues = queue;
+  pthread_mutex_unlock(&ctx->queue_lock);
+
+  /* The queue also belongs to its context */
+  cl_context_add_ref(ctx);
+
+exit:
+  return queue;
+error:
+  cl_command_queue_delete(queue);
+  queue = NULL;
+  goto exit;
+}
+
+LOCAL void
+cl_command_queue_delete(cl_command_queue queue)
+{
+  assert(queue);
+  if (atomic_dec(&queue->ref_n) != 1) return;
+
+  // If there is a valid last event, we need to give it a chance to
+  // call the call-back function.
+  if (queue->last_event && queue->last_event->user_cb)
+    cl_event_update_status(queue->last_event, 1);
+  /* Remove it from the list */
+  assert(queue->ctx);
+  pthread_mutex_lock(&queue->ctx->queue_lock);
+    if (queue->prev)
+      queue->prev->next = queue->next;
+    if (queue->next)
+      queue->next->prev = queue->prev;
+    if (queue->ctx->queues == queue)
+      queue->ctx->queues = queue->next;
+  pthread_mutex_unlock(&queue->ctx->queue_lock);
+  if (queue->fulsim_out != NULL) {
+    cl_mem_delete(queue->fulsim_out);
+    queue->fulsim_out = NULL;
+  }
+
+  cl_thread_data_destroy(queue);
+  queue->thread_data = NULL;
+  cl_mem_delete(queue->perf);
+  cl_context_delete(queue->ctx);
+  cl_free(queue->wait_events);
+  queue->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
+  cl_free(queue);
+}
+
+LOCAL void
+cl_command_queue_add_ref(cl_command_queue queue)
+{
+  atomic_inc(&queue->ref_n);
+}
+
+static void
+set_image_info(char *curbe,
+               struct ImageInfo * image_info,
+               struct _cl_mem_image *image)
+{
+  if (image_info->wSlot >= 0)
+    *(uint32_t*)(curbe + image_info->wSlot) = image->w;
+  if (image_info->hSlot >= 0)
+    *(uint32_t*)(curbe + image_info->hSlot) = image->h;
+  if (image_info->depthSlot >= 0)
+    *(uint32_t*)(curbe + image_info->depthSlot) = image->depth;
+  if (image_info->channelOrderSlot >= 0)
+    *(uint32_t*)(curbe + image_info->channelOrderSlot) = image->fmt.image_channel_order;
+  if (image_info->dataTypeSlot >= 0)
+    *(uint32_t*)(curbe + image_info->dataTypeSlot) = image->fmt.image_channel_data_type;
+}
+
+LOCAL cl_int
+cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
+{
+  uint32_t i;
+  GET_QUEUE_THREAD_GPGPU(queue);
+
+  for (i = 0; i < k->image_sz; i++) {
+    int id = k->images[i].arg_idx;
+    struct _cl_mem_image *image;
+    assert(interp_kernel_get_arg_type(k->opaque, id) == GBE_ARG_IMAGE);
+    image = cl_mem_image(k->args[id].mem);
+    set_image_info(k->curbe, &k->images[i], image);
+    cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset,
+                        image->intel_fmt, image->image_type,
+                        image->w, image->h, image->depth,
+                        image->row_pitch, (cl_gpgpu_tiling)image->tiling);
+    // TODO: this workaround is for GEN7/GEN75 only; we may need to do it in the
+    // driver layer on demand.
+    if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+      cl_gpgpu_bind_image(gpgpu, k->images[i].idx + BTI_MAX_IMAGE_NUM, image->base.bo, image->offset,
+                          image->intel_fmt, image->image_type,
+                          image->w, image->h, image->depth,
+                          image->row_pitch, image->tiling);
+  }
+  return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
+{
+  GET_QUEUE_THREAD_GPGPU(queue);
+
+  /* Bind all user buffers (given by clSetKernelArg) */
+  uint32_t i;
+  enum gbe_arg_type arg_type; /* kind of argument */
+  for (i = 0; i < k->arg_n; ++i) {
+    uint32_t offset; // location of the address in the curbe
+    arg_type = interp_kernel_get_arg_type(k->opaque, i);
+    if (arg_type != GBE_ARG_GLOBAL_PTR || !k->args[i].mem)
+      continue;
+    offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
+    if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) {
+      struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem;
+      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, k->args[i].mem->size, interp_kernel_get_arg_bti(k->opaque, i));
+    } else {
+      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, k->args[i].mem->size, interp_kernel_get_arg_bti(k->opaque, i));
+    }
+  }
+
+  return CL_SUCCESS;
+}
+
+
+#if USE_FULSIM
+extern void drm_intel_bufmgr_gem_stop_aubfile(cl_buffer_mgr);
+extern void drm_intel_bufmgr_gem_set_aubfile(cl_buffer_mgr, FILE*);
+extern void aub_exec_dump_raw_file(cl_buffer, size_t offset, size_t sz);
+
+static void
+cl_run_fulsim(void)
+{
+  const char *run_it = getenv("OCL_SIMULATOR");
+  const char *debug_mode = getenv("OCL_FULSIM_DEBUG_MODE");
+  if (run_it == NULL || strcmp(run_it, "1")) return;
+
+#if EMULATE_GEN == 7 /* IVB */
+  if (debug_mode == NULL || strcmp(debug_mode, "1"))
+    system("wine AubLoad.exe dump.aub -device ivbB0");
+  else
+    system("wine AubLoad.exe dump.aub -device ivbB0 -debug");
+#elif EMULATE_GEN == 75 /* HSW */
+  if (debug_mode == NULL || strcmp(debug_mode, "1"))
+    system("wine AubLoad.exe dump.aub -device hsw.h.a0");
+  else
+    system("wine AubLoad.exe dump.aub -device hsw.h.a0 -debug");
+#else
+#error "Unknown device"
+#endif
+}
+
+/* Each buffer is dumped using several chunks of this size */
+static const size_t chunk_sz = 8192u;
+
+static cl_int
+cl_fulsim_dump_all_surfaces(cl_command_queue queue, cl_kernel k)
+{
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = NULL;
+  int i;
+  size_t j;
+
+  /* Bind user defined surface */
+  for (i = 0; i < k->arg_n; ++i) {
+    size_t chunk_n, chunk_remainder;
+    if (interp_kernel_get_arg_type(k->opaque, i) != GBE_ARG_GLOBAL_PTR)
+      continue;
+    mem = (cl_mem) k->args[i].mem;
+    CHECK_MEM(mem);
+    chunk_n = cl_buffer_get_size(mem->bo) / chunk_sz;
+    chunk_remainder = cl_buffer_get_size(mem->bo) % chunk_sz;
+    for (j = 0; j < chunk_n; ++j)
+      aub_exec_dump_raw_file(mem->bo, j * chunk_sz, chunk_sz);
+    if (chunk_remainder)
+      aub_exec_dump_raw_file(mem->bo, chunk_n * chunk_sz, chunk_remainder);
+  }
+error:
+  return err;
+}
+
+struct bmphdr {
+  /* 2 bytes of magic here, "BM", total header size is 54 bytes! */
+  int filesize;      /*  4 total file size incl header */
+  short as0, as1;    /*  8 app specific */
+  int bmpoffset;     /* 12 offset of bmp data  */
+  int headerbytes;   /* 16 bytes in header from this point (40 actually) */
+  int width;         /* 20  */
+  int height;        /* 24  */
+  short nplanes;     /* 26 no of color planes */
+  short bpp;         /* 28 bits/pixel */
+  int compression;   /* 32 BI_RGB = 0 = no compression */
+  int sizeraw;       /* 36 size of raw bmp file, excluding header, incl padding */
+  int hres;          /* 40 horizontal resolution, pixels/meter */
+  int vres;          /* 44 */
+  int npalcolors;    /* 48 No of colors in palette */
+  int nimportant;    /* 52 No of important colors */
+  /* raw b, g, r data here, dword aligned per scan line */
+};
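+/* The two "BM" magic bytes are read separately in cl_read_bmp() below, so this
+ * struct maps directly onto the remaining 52 bytes of the 54-byte BMP header
+ * (assuming the compiler inserts no padding between these fields). */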
+
+static int*
+cl_read_bmp(const char *filename, int *width, int *height)
+{
+  int n;
+  struct bmphdr hdr;
+
+  FILE *fp = fopen(filename, "rb");
+  assert(fp);
+
+  char magic[2];
+  n = fread(&magic[0], 1, 2, fp);
+  assert(n == 2 && magic[0] == 'B' && magic[1] == 'M');
+
+  n = fread(&hdr, 1, sizeof(hdr), fp);
+  assert(n == sizeof(hdr));
+
+  assert(hdr.width > 0 &&
+         hdr.height > 0 &&
+         hdr.nplanes == 1
+         && hdr.compression == 0);
+
+  int *rgb32 = (int *) cl_malloc(hdr.width * hdr.height * sizeof(int));
+  assert(rgb32);
+  int x, y;
+
+  int *dst = rgb32;
+  for (y = 0; y < hdr.height; y++) {
+    for (x = 0; x < hdr.width; x++) {
+      assert(!feof(fp));
+      int b = (getc(fp) & 0x0ff);
+      int g = (getc(fp) & 0x0ff);
+      int r = (getc(fp) & 0x0ff);
+      *dst++ = (r | (g << 8) | (b << 16) | 0xff000000);	/* abgr */
+    }
+    while (x & 3) {
+      getc(fp);
+      x++;
+    }
+  }
+  fclose(fp);
+  *width = hdr.width;
+  *height = hdr.height;
+  return rgb32;
+}
+
+static char*
+cl_read_dump(const char *name, size_t *size)
+{
+  char *raw = NULL, *dump = NULL;
+  size_t i, sz;
+  int w, h;
+  if ((raw = (char*) cl_read_bmp(name, &w, &h)) == NULL)
+    return NULL;
+  sz = w * h;
+  dump = (char*) cl_malloc(sz);
+  assert(dump);
+  for (i = 0; i < sz; ++i)
+    dump[i] = raw[4*i];
+  cl_free(raw);
+  if (size)
+    *size = sz;
+  return dump;
+}
+
+static cl_int
+cl_fulsim_read_all_surfaces(cl_command_queue queue, cl_kernel k)
+{
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = NULL;
+  char *from = NULL, *to = NULL;
+  size_t size, j, chunk_n, chunk_remainder;
+  int i, curr = 0;
+  /* Bind user defined surface */
+  for (i = 0; i < k->arg_n; ++i) {
+    if (interp_kernel_get_arg_type(k->opaque, i) != GBE_ARG_GLOBAL_PTR)
+      continue;
+    mem = (cl_mem) k->args[i].mem;
+    CHECK_MEM(mem);
+    assert(mem->bo);
+    chunk_n = cl_buffer_get_size(mem->bo) / chunk_sz;
+    chunk_remainder = cl_buffer_get_size(mem->bo) % chunk_sz;
+    to = cl_mem_map(mem);
+    for (j = 0; j < chunk_n; ++j) {
+      char name[256];
+      sprintf(name, "dump%03i.bmp", curr);
+#ifdef NDEBUG
+      from = cl_read_dump(name, NULL);
+#else
+      from = cl_read_dump(name, &size);
+      assert(size == chunk_sz);
+#endif /* NDEBUG */
+      memcpy(to + j*chunk_sz, from, chunk_sz);
+      cl_free(from);
+      curr++;
+    }
+    if (chunk_remainder) {
+      char name[256];
+      sprintf(name, "dump%03i.bmp", curr);
+#ifdef NDEBUG
+      from = cl_read_dump(name, NULL);
+#else
+      from = cl_read_dump(name, &size);
+      assert(size == chunk_remainder);
+#endif /* NDEBUG */
+      memcpy(to + chunk_n*chunk_sz, from, chunk_remainder);
+      cl_free(from);
+      curr++;
+    }
+    cl_mem_unmap(mem);
+  }
+error:
+  return err;
+}
+#endif
+
+extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, uint32_t, const size_t *, const size_t *, const size_t *);
+
+static cl_int
+cl_kernel_check_args(cl_kernel k)
+{
+  uint32_t i;
+  for (i = 0; i < k->arg_n; ++i)
+    if (k->args[i].is_set == CL_FALSE)
+      return CL_INVALID_KERNEL_ARGS;
+  return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_command_queue_ND_range(cl_command_queue queue,
+                          cl_kernel k,
+                          const uint32_t work_dim,
+                          const size_t *global_wk_off,
+                          const size_t *global_wk_sz,
+                          const size_t *local_wk_sz)
+{
+  if(b_output_kernel_perf)
+    time_start(queue->ctx, cl_kernel_get_name(k), queue);
+  const int32_t ver = cl_driver_get_ver(queue->ctx->drv);
+  cl_int err = CL_SUCCESS;
+
+  /* Check that the user did not forget any argument */
+  TRY (cl_kernel_check_args, k);
+
+#if USE_FULSIM
+  cl_buffer_mgr bufmgr = NULL;
+  FILE *file = NULL;
+  const char *run_it = getenv("OCL_SIMULATOR");
+  if (run_it != NULL && strcmp(run_it, "1") == 0) {
+    file = fopen("dump.aub", "wb");
+    FATAL_IF (file == NULL, "Unable to open file dump.aub");
+    bufmgr = cl_context_get_bufmgr(queue->ctx);
+    drm_intel_bufmgr_gem_set_aubfile(bufmgr, file);
+  }
+#endif /* USE_FULSIM */
+
+  if (ver == 7 || ver == 75)
+    TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
+  else
+    FATAL ("Unknown Gen Device");
+
+#if USE_FULSIM
+  if (run_it != NULL && strcmp(run_it, "1") == 0) {
+    TRY (cl_fulsim_dump_all_surfaces, queue, k);
+    drm_intel_bufmgr_gem_stop_aubfile(bufmgr);
+    fclose(file);
+    cl_run_fulsim();
+    TRY (cl_fulsim_read_all_surfaces, queue, k);
+  }
+#endif /* USE_FULSIM */
+
+error:
+  return err;
+}
+
+LOCAL void
+cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu)
+{
+  size_t global_wk_sz[3];
+  void* printf_info = cl_gpgpu_get_printf_info(gpgpu, global_wk_sz);
+
+  cl_gpgpu_flush(gpgpu);
+
+  if (printf_info && interp_get_printf_num(printf_info)) {
+    void *index_addr = cl_gpgpu_map_printf_buffer(gpgpu, 0);
+    void *buf_addr = NULL;
+    if (interp_get_printf_sizeof_size(printf_info))
+      buf_addr = cl_gpgpu_map_printf_buffer(gpgpu, 1);
+
+    interp_output_printf(printf_info, index_addr, buf_addr, global_wk_sz[0],
+                      global_wk_sz[1], global_wk_sz[2]);
+
+    cl_gpgpu_unmap_printf_buffer(gpgpu, 0);
+    if (interp_get_printf_sizeof_size(printf_info))
+      cl_gpgpu_unmap_printf_buffer(gpgpu, 1);
+  }
+
+  if (printf_info) {
+    interp_release_printf_info(printf_info);
+    global_wk_sz[0] = global_wk_sz[1] = global_wk_sz[2] = 0;
+    cl_gpgpu_set_printf_info(gpgpu, NULL, global_wk_sz);
+  }
+}
+
+LOCAL cl_int
+cl_command_queue_flush(cl_command_queue queue)
+{
+  GET_QUEUE_THREAD_GPGPU(queue);
+  cl_command_queue_flush_gpgpu(queue, gpgpu);
+  // We have no dedicated timer thread to take care of an event that has a
+  // callback registered and that is released inside that callback; once it is
+  // released, no other function will access the event any more. If we do not
+  // handle it here, we leak that event and all its associated buffers, which
+  // is really bad.
+  if (queue->last_event && queue->last_event->user_cb)
+    cl_event_update_status(queue->last_event, 1);
+  if (queue->current_event)
+    cl_event_flush(queue->current_event);
+  cl_invalid_thread_gpgpu(queue);
+  return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_command_queue_finish(cl_command_queue queue)
+{
+  cl_gpgpu_sync(cl_get_thread_batch_buf(queue));
+  return CL_SUCCESS;
+}
+
+#define DEFAULT_WAIT_EVENTS_SIZE  16
+LOCAL void
+cl_command_queue_insert_event(cl_command_queue queue, cl_event event)
+{
+  cl_int i=0;
+  cl_event *new_list;
+
+  assert(queue != NULL);
+  if(queue->wait_events == NULL) {
+    queue->wait_events_size = DEFAULT_WAIT_EVENTS_SIZE;
+    TRY_ALLOC_NO_ERR (queue->wait_events, CALLOC_ARRAY(cl_event, queue->wait_events_size));
+  }
+
+  for(i=0; i<queue->wait_events_num; i++) {
+    if(queue->wait_events[i] == event)
+      return;   /* already in wait_events, no need to insert it again */
+  }
+
+  if(queue->wait_events_num < queue->wait_events_size) {
+    queue->wait_events[queue->wait_events_num++] = event;
+    return;
+  }
+
+  //wait_events_num == wait_events_size, array is full
+  queue->wait_events_size *= 2;
+  TRY_ALLOC_NO_ERR (new_list, CALLOC_ARRAY(cl_event, queue->wait_events_size));
+  memcpy(new_list, queue->wait_events, sizeof(cl_event)*queue->wait_events_num);
+  cl_free(queue->wait_events);
+  queue->wait_events = new_list;
+  queue->wait_events[queue->wait_events_num++] = event;
+  return;
+
+exit:
+  return;
+error:
+  if(queue->wait_events)
+    cl_free(queue->wait_events);
+  queue->wait_events = NULL;
+  queue->wait_events_size = 0;
+  queue->wait_events_num = 0;
+  goto exit;
+
+}
+
+LOCAL void
+cl_command_queue_remove_event(cl_command_queue queue, cl_event event)
+{
+  cl_int i=0;
+
+  assert(queue->wait_events);
+  for(i=0; i<queue->wait_events_num; i++) {
+    if(queue->wait_events[i] == event)
+      break;
+  }
+
+  if(i == queue->wait_events_num)
+    return;
+
+  if(i == queue->wait_events_num - 1) {
+    queue->wait_events[i] = NULL;
+  } else {
+    for(; i<queue->wait_events_num-1; i++) {
+      queue->wait_events[i] = queue->wait_events[i+1];
+    }
+  }
+  queue->wait_events_num -= 1;
+}
+
+#define DEFAULT_WAIT_EVENTS_SIZE  16
+LOCAL void
+cl_command_queue_insert_barrier_event(cl_command_queue queue, cl_event event)
+{
+  cl_int i=0;
+  cl_event *new_list;
+
+  assert(queue != NULL);
+  if(queue->barrier_events == NULL) {
+    queue->barrier_events_size = DEFAULT_WAIT_EVENTS_SIZE;
+    TRY_ALLOC_NO_ERR (queue->barrier_events, CALLOC_ARRAY(cl_event, queue->barrier_events_size));
+  }
+
+  for(i=0; i<queue->barrier_events_num; i++) {
+    if(queue->barrier_events[i] == event)
+      return;   /* already in barrier_events, no need to insert it again */
+  }
+
+  if(queue->barrier_events_num < queue->barrier_events_size) {
+    queue->barrier_events[queue->barrier_events_num++] = event;
+    return;
+  }
+
+  //barrier_events_num == barrier_events_size, array is full
+  queue->barrier_events_size *= 2;
+  TRY_ALLOC_NO_ERR (new_list, CALLOC_ARRAY(cl_event, queue->barrier_events_size));
+  memcpy(new_list, queue->barrier_events, sizeof(cl_event)*queue->barrier_events_num);
+  cl_free(queue->barrier_events);
+  queue->barrier_events = new_list;
+  queue->barrier_events[queue->barrier_events_num++] = event;
+  return;
+
+exit:
+  return;
+error:
+  if(queue->barrier_events)
+    cl_free(queue->barrier_events);
+  queue->barrier_events = NULL;
+  queue->barrier_events_size = 0;
+  queue->barrier_events_num = 0;
+  goto exit;
+
+}
+
+LOCAL void
+cl_command_queue_remove_barrier_event(cl_command_queue queue, cl_event event)
+{
+  cl_int i=0;
+
+  if(queue->barrier_events_num == 0)
+    return;
+
+  for(i=0; i<queue->barrier_events_num; i++) {
+    if(queue->barrier_events[i] == event)
+      break;
+  }
+
+  if(i == queue->barrier_events_num)
+    return;
+
+  if(i == queue->barrier_events_num - 1) {
+    queue->barrier_events[i] = NULL;
+  } else {
+    for(; i<queue->barrier_events_num-1; i++) {
+      queue->barrier_events[i] = queue->barrier_events[i+1];
+    }
+  }
+  queue->barrier_events_num -= 1;
+}
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
new file mode 100644
index 0000000..bd70f25
--- /dev/null
+++ b/src/cl_command_queue.h
@@ -0,0 +1,109 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_COMMAND_QUEUE_H__
+#define __CL_COMMAND_QUEUE_H__
+
+#include "cl_internals.h"
+#include "cl_driver.h"
+#include "cl_thread.h"
+#include "CL/cl.h"
+#include <stdint.h>
+
+struct intel_gpgpu;
+
+/* Basically, this is a (kind-of) batch buffer */
+struct _cl_command_queue {
+  DEFINE_ICD(dispatch)
+  uint64_t magic;                      /* To identify it as a command queue */
+  volatile int ref_n;                  /* We reference count this object */
+  cl_context ctx;                      /* Its parent context */
+  cl_event* barrier_events;               /* Points to an array of non-complete events that block this command queue */
+  cl_int    barrier_events_num;           /* Number of non-complete barrier events */
+  cl_int    barrier_events_size;          /* The size of the array that barrier_events points to */
+  cl_event* wait_events;               /* Points to an array of non-complete user events that block this command queue */
+  cl_int    wait_events_num;           /* Number of non-complete user events */
+  cl_int    wait_events_size;          /* The size of the array that wait_events points to */
+  cl_event  last_event;                /* The last event in the queue, used by enqueue marker */
+  cl_event  current_event;             /* Current event. */
+  cl_command_queue_properties  props;  /* Queue properties */
+  cl_command_queue prev, next;         /* We chain the command queues together */
+  void *thread_data;                   /* Used to store thread context data */
+  cl_mem perf;                         /* Where to put the perf counters */
+  cl_mem fulsim_out;                   /* Fulsim will output this buffer */
+};
+
+/* The macro to get the thread specified gpgpu struct. */
+#define GET_QUEUE_THREAD_GPGPU(queue) \
+	cl_gpgpu gpgpu = queue ? cl_get_thread_gpgpu(queue) : NULL;  \
+	if (queue) \
+	  assert(gpgpu);
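+/* Note: the macro above declares a local variable named "gpgpu" in the calling
+ * function; the queue functions that invoke it then use that variable directly. */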
+
+/* Allocate and initialize a new command queue. Also insert it in the list of
+ * command queue in the associated context
+ */
+extern cl_command_queue cl_command_queue_new(cl_context);
+
+/* Destroy and deallocate the command queue */
+extern void cl_command_queue_delete(cl_command_queue);
+
+/* Keep one more reference on the queue */
+extern void cl_command_queue_add_ref(cl_command_queue);
+
+/* Map ND range kernel from OCL API */
+extern cl_int cl_command_queue_ND_range(cl_command_queue queue,
+                                        cl_kernel ker,
+                                        const uint32_t work_dim,
+                                        const size_t *global_work_offset,
+                                        const size_t *global_work_size,
+                                        const size_t *local_work_size);
+
+/* The memory object where to report the performance */
+extern cl_int cl_command_queue_set_report_buffer(cl_command_queue, cl_mem);
+
+/* Fulsim will dump this buffer (mostly to check its consistency) */
+cl_int cl_command_queue_set_fulsim_buffer(cl_command_queue, cl_mem);
+
+/* Flush for the command queue */
+extern cl_int cl_command_queue_flush(cl_command_queue);
+
+/* Flush for the specified gpgpu */
+extern void cl_command_queue_flush_gpgpu(cl_command_queue, cl_gpgpu);
+
+/* Wait for the completion of the command queue */
+extern cl_int cl_command_queue_finish(cl_command_queue);
+
+/* Bind all the surfaces in the GPGPU state */
+extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel);
+
+/* Bind all the image surfaces in the GPGPU state */
+extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel);
+
+/* Insert a user event to command's wait_events */
+extern void cl_command_queue_insert_event(cl_command_queue, cl_event);
+
+/* Remove a user event from command's wait_events */
+extern void cl_command_queue_remove_event(cl_command_queue, cl_event);
+
+extern void cl_command_queue_insert_barrier_event(cl_command_queue queue, cl_event event);
+
+extern void cl_command_queue_remove_barrier_event(cl_command_queue queue, cl_event event);
+
+#endif /* __CL_COMMAND_QUEUE_H__ */
+
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
new file mode 100644
index 0000000..330f0f9
--- /dev/null
+++ b/src/cl_command_queue_gen7.c
@@ -0,0 +1,394 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_command_queue.h"
+#include "cl_context.h"
+#include "cl_program.h"
+#include "cl_kernel.h"
+#include "cl_device_id.h"
+#include "cl_mem.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#define MAX_GROUP_SIZE_IN_HALFSLICE   512
+static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+128; }
+
+/* The "varying" payload is the part of the curbe that changes across threads in the
+ *  same work group. Right now it consists of the local IDs and block IPs.
+ */
+static cl_int
+cl_set_varying_payload(const cl_kernel ker,
+                       char *data,
+                       const size_t *local_wk_sz,
+                       size_t simd_sz,
+                       size_t cst_sz,
+                       size_t thread_n)
+{
+  uint32_t *ids[3] = {NULL,NULL,NULL};
+  uint16_t *block_ips = NULL;
+  size_t i, j, k, curr = 0;
+  int32_t id_offset[3], ip_offset;
+  cl_int err = CL_SUCCESS;
+
+  id_offset[0] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_X, 0);
+  id_offset[1] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Y, 0);
+  id_offset[2] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Z, 0);
+  ip_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_BLOCK_IP, 0);
+  assert(id_offset[0] >= 0 &&
+         id_offset[1] >= 0 &&
+         id_offset[2] >= 0 &&
+         ip_offset >= 0);
+
+  TRY_ALLOC(ids[0], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
+  TRY_ALLOC(ids[1], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
+  TRY_ALLOC(ids[2], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
+  TRY_ALLOC(block_ips, (uint16_t*) alloca(sizeof(uint16_t)*thread_n*simd_sz));
+
+  /* 0xffff means that the lane is inactive */
+  memset(block_ips, 0xff, sizeof(uint16_t)*thread_n*simd_sz);
+
+  /* Compute the IDs and the block IPs */
+  for (k = 0; k < local_wk_sz[2]; ++k)
+  for (j = 0; j < local_wk_sz[1]; ++j)
+  for (i = 0; i < local_wk_sz[0]; ++i, ++curr) {
+    ids[0][curr] = i;
+    ids[1][curr] = j;
+    ids[2][curr] = k;
+    block_ips[curr] = 0;
+  }
+
+  /* Copy them to the curbe buffer */
+  curr = 0;
+  for (i = 0; i < thread_n; ++i, data += cst_sz) {
+    uint32_t *ids0 = (uint32_t *) (data + id_offset[0]);
+    uint32_t *ids1 = (uint32_t *) (data + id_offset[1]);
+    uint32_t *ids2 = (uint32_t *) (data + id_offset[2]);
+    uint16_t *ips  = (uint16_t *) (data + ip_offset);
+    for (j = 0; j < simd_sz; ++j, ++curr) {
+      ids0[j] = ids[0][curr];
+      ids1[j] = ids[1][curr];
+      ids2[j] = ids[2][curr];
+      ips[j] = block_ips[curr];
+    }
+  }
+
+error:
+  return err;
+}
+
+static int
+cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
+{
+  /* calculate constant buffer size
+   * we need raw_size & aligned_size
+   */
+  GET_QUEUE_THREAD_GPGPU(queue);
+  int32_t arg;
+  size_t offset = 0;
+  uint32_t raw_size = 0, aligned_size =0;
+  gbe_program prog = ker->program->opaque;
+  const int32_t arg_n = interp_kernel_get_arg_num(ker->opaque);
+  size_t global_const_size = interp_program_get_global_constant_size(prog);
+  aligned_size = raw_size = global_const_size;
+  /* Reserve 8 bytes to get rid of 0 address */
+  if(global_const_size == 0) aligned_size = 8;
+
+  for (arg = 0; arg < arg_n; ++arg) {
+    const enum gbe_arg_type type = interp_kernel_get_arg_type(ker->opaque, arg);
+    if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
+      uint32_t alignment = interp_kernel_get_arg_align(ker->opaque, arg);
+      assert(alignment != 0);
+      cl_mem mem = ker->args[arg].mem;
+      raw_size += mem->size;
+      aligned_size = ALIGN(aligned_size, alignment);
+      aligned_size += mem->size;
+    }
+  }
+  if(raw_size == 0)
+     return 0;
+
+  cl_buffer bo = cl_gpgpu_alloc_constant_buffer(gpgpu, aligned_size, BTI_CONSTANT);
+  if (bo == NULL)
+    return -1;
+  cl_buffer_map(bo, 1);
+  char * cst_addr = cl_buffer_get_virtual(bo);
+  if (cst_addr == NULL)
+    return -1;
+
+  /* upload the global constant data */
+  if (global_const_size > 0) {
+    interp_program_get_global_constant_data(prog, (char*)(cst_addr+offset));
+    offset += global_const_size;
+  }
+
+  /* reserve 8 bytes to get rid of 0 address */
+  if(global_const_size == 0) {
+    offset = 8;
+  }
+
+  /* upload constant buffer argument */
+  int32_t curbe_offset = 0;
+  for (arg = 0; arg < arg_n; ++arg) {
+    const enum gbe_arg_type type = interp_kernel_get_arg_type(ker->opaque, arg);
+    if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
+      cl_mem mem = ker->args[arg].mem;
+      uint32_t alignment = interp_kernel_get_arg_align(ker->opaque, arg);
+      offset = ALIGN(offset, alignment);
+      curbe_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
+      assert(curbe_offset >= 0);
+      *(uint32_t *) (ker->curbe + curbe_offset) = offset;
+
+      cl_buffer_map(mem->bo, 1);
+      void * addr = cl_buffer_get_virtual(mem->bo);
+      memcpy(cst_addr + offset, addr, mem->size);
+      cl_buffer_unmap(mem->bo);
+      offset += mem->size;
+    }
+  }
+  cl_buffer_unmap(bo);
+  return 0;
+}
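+/* Resulting constant buffer layout (a sketch): [ global __constant data | arg A,
+ * aligned | arg B, aligned | ... ]. Each __constant pointer argument has its byte
+ * offset into this buffer written to its curbe slot; when there is no global
+ * constant data, arguments start at offset 8 to avoid handing out a zero address. */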
+
+/* Will return the total amount of slm used */
+static int32_t
+cl_curbe_fill(cl_kernel ker,
+              const uint32_t work_dim,
+              const size_t *global_wk_off,
+              const size_t *global_wk_sz,
+              const size_t *local_wk_sz,
+              size_t thread_n)
+{
+  int32_t offset;
+#define UPLOAD(ENUM, VALUE) \
+  if ((offset = interp_kernel_get_curbe_offset(ker->opaque, ENUM, 0)) >= 0) \
+    *((uint32_t *) (ker->curbe + offset)) = VALUE;
+  UPLOAD(GBE_CURBE_LOCAL_SIZE_X, local_wk_sz[0]);
+  UPLOAD(GBE_CURBE_LOCAL_SIZE_Y, local_wk_sz[1]);
+  UPLOAD(GBE_CURBE_LOCAL_SIZE_Z, local_wk_sz[2]);
+  UPLOAD(GBE_CURBE_GLOBAL_SIZE_X, global_wk_sz[0]);
+  UPLOAD(GBE_CURBE_GLOBAL_SIZE_Y, global_wk_sz[1]);
+  UPLOAD(GBE_CURBE_GLOBAL_SIZE_Z, global_wk_sz[2]);
+  UPLOAD(GBE_CURBE_GLOBAL_OFFSET_X, global_wk_off[0]);
+  UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Y, global_wk_off[1]);
+  UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Z, global_wk_off[2]);
+  UPLOAD(GBE_CURBE_GROUP_NUM_X, global_wk_sz[0]/local_wk_sz[0]);
+  UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1]/local_wk_sz[1]);
+  UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
+  UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
+  UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
+#undef UPLOAD
+
+  /* Write identity for the stack pointer. This is required by the stack pointer
+   * computation in the kernel
+   */
+  if ((offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_STACK_POINTER, 0)) >= 0) {
+    const uint32_t simd_sz = interp_kernel_get_simd_width(ker->opaque);
+    uint32_t *stackptr = (uint32_t *) (ker->curbe + offset);
+    int32_t i;
+    for (i = 0; i < (int32_t) simd_sz; ++i) stackptr[i] = i;
+  }
+  /* Handle the various offsets to SLM */
+  const int32_t arg_n = interp_kernel_get_arg_num(ker->opaque);
+  int32_t arg, slm_offset = interp_kernel_get_slm_size(ker->opaque);
+  ker->local_mem_sz = 0;
+  for (arg = 0; arg < arg_n; ++arg) {
+    const enum gbe_arg_type type = interp_kernel_get_arg_type(ker->opaque, arg);
+    if (type != GBE_ARG_LOCAL_PTR)
+      continue;
+    uint32_t align = interp_kernel_get_arg_align(ker->opaque, arg);
+    assert(align != 0);
+    slm_offset = ALIGN(slm_offset, align);
+    offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
+    assert(offset >= 0);
+    uint32_t *slmptr = (uint32_t *) (ker->curbe + offset);
+    *slmptr = slm_offset;
+    slm_offset += ker->args[arg].local_sz;
+    ker->local_mem_sz += ker->args[arg].local_sz;
+  }
+  return slm_offset;
+}
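+/* Example (assuming the kernel itself reserves no SLM): two __local arguments of
+ * 128 and 64 bytes with 4-byte alignment receive SLM offsets 0 and 128 in their
+ * curbe slots, and the function returns 192, the total SLM footprint. */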
+
+static void
+cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
+{
+  cl_context ctx = ker->program->ctx;
+  cl_device_id device = ctx->device;
+  const int32_t per_lane_stack_sz = ker->stack_size;
+  const int32_t value = GBE_CURBE_EXTRA_ARGUMENT;
+  const int32_t sub_value = GBE_STACK_BUFFER;
+  const int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, sub_value);
+  int32_t stack_sz = per_lane_stack_sz;
+
+  /* No stack required for this kernel */
+  if (per_lane_stack_sz == 0)
+    return;
+
+  /* The stack size is given for *each* SIMD lane. So, we accordingly compute
+   * the size we need for the complete machine
+   */
+  assert(offset >= 0);
+  stack_sz *= interp_kernel_get_simd_width(ker->opaque);
+  stack_sz *= device->max_compute_unit * ctx->device->max_thread_per_unit;
+  /* On HSW the per-thread stack offset is computed relative to the half slice, so
+     when thread scheduling across half slices is unbalanced the offset can go out
+     of bounds. Since GT4 has at most 4 half slices, multiply the stack size by 4
+     to be safe.
+   */
+  if(cl_driver_get_ver(ctx->drv) == 75)
+    stack_sz *= 4;
+  cl_gpgpu_set_stack(gpgpu, offset, stack_sz, BTI_PRIVATE);
+}
+
+static int
+cl_bind_printf(cl_gpgpu gpgpu, cl_kernel ker, void* printf_info, int printf_num, size_t global_sz) {
+  int32_t value = GBE_CURBE_PRINTF_INDEX_POINTER;
+  int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
+  size_t buf_size = global_sz * sizeof(int) * printf_num;
+  if (offset > 0) {
+    if (cl_gpgpu_set_printf_buffer(gpgpu, 0, buf_size, offset, interp_get_printf_indexbuf_bti(printf_info)) != 0)
+      return -1;
+  }
+
+  value = GBE_CURBE_PRINTF_BUF_POINTER;
+  offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
+  buf_size = interp_get_printf_sizeof_size(printf_info) * global_sz;
+  if (offset > 0) {
+    if (cl_gpgpu_set_printf_buffer(gpgpu, 1, buf_size, offset, interp_get_printf_buf_bti(printf_info)) != 0)
+      return -1;
+  }
+  return 0;
+}
+
+LOCAL cl_int
+cl_command_queue_ND_range_gen7(cl_command_queue queue,
+                               cl_kernel ker,
+                               const uint32_t work_dim,
+                               const size_t *global_wk_off,
+                               const size_t *global_wk_sz,
+                               const size_t *local_wk_sz)
+{
+  GET_QUEUE_THREAD_GPGPU(queue);
+  cl_context ctx = queue->ctx;
+  char *final_curbe = NULL;  /* Constant curbe data replicated once per HW thread */
+  cl_gpgpu_kernel kernel;
+  const uint32_t simd_sz = cl_kernel_get_simd_width(ker);
+  size_t i, batch_sz = 0u, local_sz = 0u;
+  size_t cst_sz = ker->curbe_sz = interp_kernel_get_curbe_size(ker->opaque);
+  int32_t scratch_sz = interp_kernel_get_scratch_size(ker->opaque);
+  size_t thread_n = 0u;
+  int printf_num = 0;
+  cl_int err = CL_SUCCESS;
+  size_t global_size = global_wk_sz[0] * global_wk_sz[1] * global_wk_sz[2];
+  void* printf_info = NULL;
+
+  /* Setup kernel */
+  kernel.name = "KERNEL";
+  kernel.grf_blocks = 128;
+  kernel.bo = ker->bo;
+  kernel.barrierID = 0;
+  kernel.slm_sz = 0;
+  kernel.use_slm = interp_kernel_use_slm(ker->opaque);
+
+  /* Compute the number of HW threads we need */
+  TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
+  kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
+  kernel.curbe_sz = cst_sz;
+
+  if (scratch_sz > ker->program->ctx->device->scratch_mem_size) {
+    fprintf(stderr, "Beignet: Out of scratch memory %d.\n", scratch_sz);
+    return CL_OUT_OF_RESOURCES;
+  }
+  /* Curbe step 1: fill the constant urb buffer data shared by all threads */
+  if (ker->curbe) {
+    kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
+    if (kernel.slm_sz > ker->program->ctx->device->local_mem_size) {
+      fprintf(stderr, "Beignet: Out of shared local memory %d.\n", kernel.slm_sz);
+      return CL_OUT_OF_RESOURCES;
+    }
+  }
+
+  printf_info = interp_dup_printfset(ker->opaque);
+  cl_gpgpu_set_printf_info(gpgpu, printf_info, (size_t *)global_wk_sz);
+
+  /* Setup the kernel */
+  if (queue->props & CL_QUEUE_PROFILING_ENABLE)
+    err = cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit * ctx->device->max_thread_per_unit, cst_sz / 32, 1);
+  else
+    err = cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit * ctx->device->max_thread_per_unit, cst_sz / 32, 0);
+  if (err != 0)
+    goto error;
+  printf_num = interp_get_printf_num(printf_info);
+  if (printf_num) {
+    if (cl_bind_printf(gpgpu, ker, printf_info, printf_num, global_size) != 0)
+      goto error;
+  }
+
+  /* Bind user buffers */
+  cl_command_queue_bind_surface(queue, ker);
+  /* Bind user images */
+  cl_command_queue_bind_image(queue, ker);
+  /* Bind all samplers */
+  cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
+
+  if (cl_gpgpu_set_scratch(gpgpu, scratch_sz) != 0)
+    goto error;
+
+  /* Bind a stack if needed */
+  cl_bind_stack(gpgpu, ker);
+
+  if (cl_upload_constant_buffer(queue, ker) != 0)
+    goto error;
+
+  cl_gpgpu_states_setup(gpgpu, &kernel);
+
+  /* Curbe step 2. Give the localID and upload it to video memory */
+  if (ker->curbe) {
+    assert(cst_sz > 0);
+    TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz));
+    for (i = 0; i < thread_n; ++i) {
+        memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
+    }
+    TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
+    if (cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz) != 0)
+      goto error;
+  }
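+  /* final_curbe now holds thread_n back-to-back copies of the constant curbe, each
+     with its own local IDs and block IPs patched in at the curbe offsets above. */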
+
+  /* Start a new batch buffer */
+  batch_sz = cl_kernel_compute_batch_sz(ker);
+  if (cl_gpgpu_batch_reset(gpgpu, batch_sz) != 0)
+    goto error;
+  cl_set_thread_batch_buf(queue, cl_gpgpu_ref_batch_buf(gpgpu));
+  cl_gpgpu_batch_start(gpgpu);
+
+  /* Issue the GPGPU_WALKER command */
+  cl_gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz);
+
+  /* Close the batch buffer and submit it */
+  cl_gpgpu_batch_end(gpgpu, 0);
+  return CL_SUCCESS;
+
+error:
+  fprintf(stderr, "error occurred.\n");
+  exit(-1);
+  return CL_OUT_OF_RESOURCES;
+}
+
diff --git a/src/cl_context.c b/src/cl_context.c
new file mode 100644
index 0000000..152faf3
--- /dev/null
+++ b/src/cl_context.c
@@ -0,0 +1,372 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_platform_id.h"
+#include "cl_device_id.h"
+#include "cl_context.h"
+#include "cl_command_queue.h"
+#include "cl_mem.h"
+#include "cl_alloc.h"
+#include "cl_utils.h"
+#include "cl_driver.h"
+#include "cl_khr_icd.h"
+#include "cl_kernel.h"
+#include "cl_program.h"
+
+#include "CL/cl.h"
+#include "CL/cl_gl.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+#include <string.h>
+
+#define CHECK(var) \
+  if (var) \
+    return CL_INVALID_PROPERTY; \
+  else \
+    var = 1;
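+/* CHECK(var) rejects duplicated properties: the first occurrence of a property sets
+ * its flag to 1, a second occurrence returns CL_INVALID_PROPERTY. */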
+
+static cl_int
+cl_context_properties_process(const cl_context_properties *prop,
+                              struct _cl_context_prop *cl_props, cl_uint * prop_len)
+{
+  int set_cl_context_platform = 0,
+      set_cl_gl_context_khr = 0,
+      set_cl_egl_display_khr = 0,
+      set_cl_glx_display_khr = 0,
+      set_cl_wgl_hdc_khr = 0,
+      set_cl_cgl_sharegroup_khr = 0;
+  cl_int err = CL_SUCCESS;
+
+  cl_props->gl_type = CL_GL_NOSHARE;
+  cl_props->platform_id = 0;
+
+  if (prop == NULL)
+    goto exit;
+
+
+  while(*prop) {
+    switch (*prop) {
+    case CL_CONTEXT_PLATFORM:
+      CHECK (set_cl_context_platform);
+      cl_props->platform_id = *(prop + 1);
+      if (UNLIKELY((cl_platform_id) cl_props->platform_id != intel_platform)) {
+        err = CL_INVALID_PLATFORM;
+        goto error;
+      }
+      break;
+    case CL_GL_CONTEXT_KHR:
+      CHECK (set_cl_gl_context_khr);
+      cl_props->gl_context = *(prop + 1);
+      break;
+    case CL_EGL_DISPLAY_KHR:
+      CHECK (set_cl_egl_display_khr);
+      cl_props->gl_type = CL_GL_EGL_DISPLAY;
+      cl_props->egl_display = *(prop + 1);
+      break;
+    case CL_GLX_DISPLAY_KHR:
+      CHECK (set_cl_glx_display_khr);
+      cl_props->gl_type = CL_GL_GLX_DISPLAY;
+      cl_props->glx_display = *(prop + 1);
+      break;
+    case CL_WGL_HDC_KHR:
+      CHECK (set_cl_wgl_hdc_khr);
+      cl_props->gl_type = CL_GL_WGL_HDC;
+      cl_props->wgl_hdc = *(prop + 1);
+      break;
+    case CL_CGL_SHAREGROUP_KHR:
+      CHECK (set_cl_cgl_sharegroup_khr);
+      cl_props->gl_type = CL_GL_CGL_SHAREGROUP;
+      cl_props->cgl_sharegroup = *(prop + 1);
+      break;
+    default:
+      err = CL_INVALID_PROPERTY;
+      goto error;
+    }
+    prop += 2;
+    *prop_len += 2;
+  }
+  (*prop_len)++;
+exit:
+error:
+  return err;
+}
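+/* For example, a property list of { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }
+ * (where platform is the Intel platform id) yields prop_len == 3: two entries for the
+ * key/value pair plus the terminating zero, which is the count used later to copy the
+ * user-supplied properties into the context. */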
+
+
+
+LOCAL cl_context
+cl_create_context(const cl_context_properties *  properties,
+                  cl_uint                        num_devices,
+                  const cl_device_id *           devices,
+                  void (CL_CALLBACK * pfn_notify) (const char*, const void*, size_t, void*),
+                  void *                         user_data,
+                  cl_int *                       errcode_ret)
+{
+  /* cl_platform_id platform = NULL; */
+  struct _cl_context_prop props;
+  cl_context ctx = NULL;
+  cl_int err = CL_SUCCESS;
+  cl_uint prop_len = 0;
+  /* XXX */
+  FATAL_IF (num_devices != 1, "Only one device is supported");
+
+  /* Check that we are getting the right platform */
+  if (UNLIKELY(((err = cl_context_properties_process(properties, &props, &prop_len)) != CL_SUCCESS)))
+    goto error;
+
+  /* We are good */
+  if (UNLIKELY((ctx = cl_context_new(&props)) == NULL)) {
+    err = CL_OUT_OF_HOST_MEMORY;
+    goto error;
+  }
+
+  if(properties != NULL && prop_len > 0) {
+    TRY_ALLOC (ctx->prop_user, CALLOC_ARRAY(cl_context_properties, prop_len));
+    memcpy(ctx->prop_user, properties, sizeof(cl_context_properties)*prop_len);
+  }
+  ctx->prop_len = prop_len;
+  /* Attach the device to the context */
+  ctx->device = *devices;
+
+  /* Save the user callback and user data*/
+  ctx->pfn_notify = pfn_notify;
+  ctx->user_data = user_data;
+
+exit:
+  if (errcode_ret != NULL)
+    *errcode_ret = err;
+  return ctx;
+error:
+  cl_context_delete(ctx);
+  ctx = NULL;
+  goto exit;
+}
+
+LOCAL cl_context
+cl_context_new(struct _cl_context_prop *props)
+{
+  cl_context ctx = NULL;
+
+  TRY_ALLOC_NO_ERR (ctx, CALLOC(struct _cl_context));
+  TRY_ALLOC_NO_ERR (ctx->drv, cl_driver_new(props));
+  SET_ICD(ctx->dispatch)
+  ctx->props = *props;
+  ctx->magic = CL_MAGIC_CONTEXT_HEADER;
+  ctx->ref_n = 1;
+  ctx->ver = cl_driver_get_ver(ctx->drv);
+  pthread_mutex_init(&ctx->program_lock, NULL);
+  pthread_mutex_init(&ctx->queue_lock, NULL);
+  pthread_mutex_init(&ctx->buffer_lock, NULL);
+  pthread_mutex_init(&ctx->sampler_lock, NULL);
+
+exit:
+  return ctx;
+error:
+  cl_context_delete(ctx);
+  ctx = NULL;
+  goto exit;
+}
+
+LOCAL void
+cl_context_delete(cl_context ctx)
+{
+  int i = 0;
+  if (UNLIKELY(ctx == NULL))
+    return;
+
+  /* We are not done yet */
+  if (atomic_dec(&ctx->ref_n) > 1)
+    return;
+
+  /* delete the internal programs. */
+  for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) {
+    if (ctx->internel_kernels[i]) {
+      cl_kernel_delete(ctx->internel_kernels[i]);
+      ctx->internel_kernels[i] = NULL;
+
+      assert(ctx->internal_prgs[i]);
+      cl_program_delete(ctx->internal_prgs[i]);
+      ctx->internal_prgs[i] = NULL;
+    }
+
+    if (ctx->built_in_kernels[i]) {
+      cl_kernel_delete(ctx->built_in_kernels[i]);
+      ctx->built_in_kernels[i] = NULL;
+    }
+  }
+
+  cl_program_delete(ctx->built_in_prgs);
+  ctx->built_in_prgs = NULL;
+
+  /* All object lists should have been freed. Otherwise, the reference counter
+   * of the context cannot be 0
+   */
+  assert(ctx->queues == NULL);
+  assert(ctx->programs == NULL);
+  assert(ctx->buffers == NULL);
+  assert(ctx->drv);
+  cl_free(ctx->prop_user);
+  cl_driver_delete(ctx->drv);
+  ctx->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
+  cl_free(ctx);
+}
+
+LOCAL void
+cl_context_add_ref(cl_context ctx)
+{
+  assert(ctx);
+  atomic_inc(&ctx->ref_n);
+}
+
+LOCAL cl_command_queue
+cl_context_create_queue(cl_context ctx,
+                        cl_device_id device,
+                        cl_command_queue_properties properties, /* XXX */
+                        cl_int *errcode_ret)
+{
+  cl_command_queue queue = NULL;
+  cl_int err = CL_SUCCESS;
+
+
+
+  /* We create the command queue and store it in the context list of queues */
+  TRY_ALLOC (queue, cl_command_queue_new(ctx));
+  queue->props = properties;
+
+exit:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return queue;
+error:
+  cl_command_queue_delete(queue);
+  queue = NULL;
+  goto exit;
+}
+
+cl_buffer_mgr
+cl_context_get_bufmgr(cl_context ctx)
+{
+  return cl_driver_get_bufmgr(ctx->drv);
+}
+
+cl_kernel
+cl_context_get_static_kernel(cl_context ctx, cl_int index, const char * str_kernel, const char * str_option)
+{
+  cl_int ret;
+  if (!ctx->internal_prgs[index]) {
+    size_t length = strlen(str_kernel) + 1;
+    ctx->internal_prgs[index] = cl_program_create_from_source(ctx, 1, &str_kernel, &length, NULL);
+
+    if (!ctx->internal_prgs[index])
+      return NULL;
+
+    ret = cl_program_build(ctx->internal_prgs[index], str_option);
+    if (ret != CL_SUCCESS)
+      return NULL;
+
+    ctx->internal_prgs[index]->is_built = 1;
+
+    /* All CL_ENQUEUE_FILL_BUFFER_ALIGN8_xxx indices share the same program but use different kernels. */
+    if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+      int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8;
+      for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
+        if (index != i) {
+          assert(ctx->internal_prgs[i] == NULL);
+          assert(ctx->internel_kernels[i] == NULL);
+          cl_program_add_ref(ctx->internal_prgs[index]);
+          ctx->internal_prgs[i] = ctx->internal_prgs[index];
+        }
+
+        if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_2", NULL);
+        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_4", NULL);
+        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_8", NULL);
+        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_16", NULL);
+        } else
+          assert(0);
+      }
+    } else {
+      ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+    }
+  }
+
+  return ctx->internel_kernels[index];
+}
+
+cl_kernel
+cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
+                  const char * str_kernel, size_t size, const char * str_option)
+{
+  cl_int ret;
+  cl_int binary_status = CL_SUCCESS;
+  if (!ctx->internal_prgs[index]) {
+    ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, &ctx->device,
+      &size, (const unsigned char **)&str_kernel, &binary_status, &ret);
+
+    if (!ctx->internal_prgs[index])
+      return NULL;
+
+    ret = cl_program_build(ctx->internal_prgs[index], str_option);
+    if (ret != CL_SUCCESS)
+      return NULL;
+
+    ctx->internal_prgs[index]->is_built = 1;
+
+    /* All CL_ENQUEUE_FILL_BUFFER_ALIGN8_xxx indices share the same program but use different kernels. */
+    if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+      int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8;
+      for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
+        if (index != i) {
+          assert(ctx->internal_prgs[i] == NULL);
+          assert(ctx->internel_kernels[i] == NULL);
+          cl_program_add_ref(ctx->internal_prgs[index]);
+          ctx->internal_prgs[i] = ctx->internal_prgs[index];
+        }
+
+        if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_2", NULL);
+        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_4", NULL);
+        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_8", NULL);
+        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+          ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                              "__cl_fill_region_align8_16", NULL);
+        } else
+          assert(0);
+      }
+    } else {
+      ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+    }
+  }
+
+  return ctx->internel_kernels[index];
+}
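
For reference, the zero-terminated key/value list that cl_context_properties_process() walks above is the same array an application hands to the standard clCreateContext() entry point. A minimal host-side sketch (plain OpenCL 1.x API calls, not part of this patch; error handling trimmed):

    #include <CL/cl.h>
    #include <stdio.h>

    int main(void)
    {
      cl_platform_id platform;
      cl_device_id device;
      cl_int err;

      clGetPlatformIDs(1, &platform, NULL);
      clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

      /* Key/value pairs terminated by a single 0, which is the layout
       * cl_context_properties_process() expects. */
      cl_context_properties props[] = {
        CL_CONTEXT_PLATFORM, (cl_context_properties) platform,
        0
      };

      cl_context ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
      if (err != CL_SUCCESS) {
        fprintf(stderr, "clCreateContext failed: %d\n", err);
        return 1;
      }
      clReleaseContext(ctx);
      return 0;
    }
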
diff --git a/src/cl_context.h b/src/cl_context.h
new file mode 100644
index 0000000..75afbf6
--- /dev/null
+++ b/src/cl_context.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_CONTEXT_H__
+#define __CL_CONTEXT_H__
+
+#include "cl_internals.h"
+#include "cl_driver.h"
+#include "CL/cl.h"
+#include "cl_khr_icd.h"
+
+#include <stdint.h>
+#include <pthread.h>
+
+/* DRI device, created at context creation time */
+struct intel_driver;
+
+enum _cl_gl_context_type {
+  CL_GL_NOSHARE,
+  CL_GL_EGL_DISPLAY,
+  CL_GL_GLX_DISPLAY,
+  CL_GL_WGL_HDC,
+  CL_GL_CGL_SHAREGROUP
+};
+
+enum _cl_internal_ker_type {
+  CL_INTERNAL_KERNEL_MIN = 0,
+  CL_ENQUEUE_COPY_BUFFER_ALIGN4 = 0,
+  CL_ENQUEUE_COPY_BUFFER_ALIGN16,
+  CL_ENQUEUE_COPY_BUFFER_UNALIGN_SAME_OFFSET,
+  CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET,
+  CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET,
+  CL_ENQUEUE_COPY_BUFFER_RECT,
+  CL_ENQUEUE_COPY_IMAGE_1D_TO_1D,             //copy image 1d to image 1d
+  CL_ENQUEUE_COPY_IMAGE_2D_TO_2D,             //copy image 2d to image 2d
+  CL_ENQUEUE_COPY_IMAGE_3D_TO_2D,             //copy image 3d to image 2d
+  CL_ENQUEUE_COPY_IMAGE_2D_TO_3D,             //copy image 2d to image 3d
+  CL_ENQUEUE_COPY_IMAGE_3D_TO_3D,             //copy image 3d to image 3d
+  CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER,   //copy image 2d to buffer
+  CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,   //copy image 3d to buffer
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,   //copy buffer to image 2d
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,   //copy buffer to image 3d
+  CL_ENQUEUE_FILL_BUFFER_UNALIGN,      //fill buffer with an unaligned pattern, pattern size = 1
+  CL_ENQUEUE_FILL_BUFFER_ALIGN2,       //fill buffer with a 2-byte aligned pattern, pattern size = 2
+  CL_ENQUEUE_FILL_BUFFER_ALIGN4,       //fill buffer with a 4-byte aligned pattern, pattern size = 4
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_8,     //fill buffer with an 8-byte aligned pattern, pattern size = 8
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_16,    //fill buffer with an 8-byte aligned pattern, pattern size = 16
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_32,    //fill buffer with an 8-byte aligned pattern, pattern size = 32
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_64,    //fill buffer with an 8-byte aligned pattern, pattern size = 64
+  CL_ENQUEUE_FILL_BUFFER_ALIGN128,     //fill buffer with a 128-byte aligned pattern, pattern size = 128
+  CL_ENQUEUE_FILL_IMAGE_1D,             //fill image 1d
+  CL_ENQUEUE_FILL_IMAGE_1D_ARRAY,       //fill image 1d array
+  CL_ENQUEUE_FILL_IMAGE_2D,             //fill image 2d
+  CL_ENQUEUE_FILL_IMAGE_2D_ARRAY,       //fill image 2d array
+  CL_ENQUEUE_FILL_IMAGE_3D,             //fill image 3d
+  CL_INTERNAL_KERNEL_MAX
+};
+
+struct _cl_context_prop {
+  cl_context_properties platform_id;
+  enum _cl_gl_context_type gl_type;
+  cl_context_properties gl_context;
+  union {
+    cl_context_properties egl_display;
+    cl_context_properties glx_display;
+    cl_context_properties wgl_hdc;
+    cl_context_properties cgl_sharegroup;
+  };
+};
+
+#define IS_EGL_CONTEXT(ctx)  (ctx->props.gl_type == CL_GL_EGL_DISPLAY)
+#define EGL_DISP(ctx)   (EGLDisplay)(ctx->props.egl_display)
+#define EGL_CTX(ctx)    (EGLContext)(ctx->props.gl_context)
+/* Encapsulate the whole device */
+struct _cl_context {
+  DEFINE_ICD(dispatch)
+  uint64_t magic;                   /* To identify it as a context */
+  volatile int ref_n;               /* We reference count this object */
+  cl_driver drv;                    /* Handles HW or simulator */
+  cl_device_id device;              /* All information about the GPU device */
+  cl_command_queue queues;          /* All command queues currently allocated */
+  cl_program programs;              /* All programs currently allocated */
+  cl_mem buffers;                   /* All memory objects currently allocated */
+  cl_sampler samplers;              /* All sampler objects currently allocated */
+  cl_event   events;                /* All event objects currently allocated */
+  pthread_mutex_t queue_lock;       /* To allocate and deallocate queues */
+  pthread_mutex_t program_lock;     /* To allocate and deallocate programs */
+  pthread_mutex_t buffer_lock;      /* To allocate and deallocate buffers */
+  pthread_mutex_t sampler_lock;     /* To allocate and deallocate samplers */
+  pthread_mutex_t event_lock;       /* To allocate and deallocate events */
+  cl_program internal_prgs[CL_INTERNAL_KERNEL_MAX];
+                                    /* Internal programs used by the clEnqueue* implementations */
+  cl_kernel  internel_kernels[CL_INTERNAL_KERNEL_MAX];
+                                    /* Internal kernels used by the clEnqueue* implementations */
+  cl_program built_in_prgs;  /* all built-in kernels belong to this single program */
+  cl_kernel  built_in_kernels[CL_INTERNAL_KERNEL_MAX];
+  uint32_t ver;                     /* Gen version */
+  struct _cl_context_prop props;
+  cl_context_properties * prop_user; /* copy of the user-supplied context properties passed at creation */
+  cl_uint                 prop_len;  /* count of the properties */
+  void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *);
+                                     /* User callback invoked when an error occurs in the context */
+  void *user_data;                   /* A pointer to user-supplied data */
+
+};
+
+/* Implement OpenCL function */
+extern cl_context cl_create_context(const cl_context_properties*,
+                                    cl_uint,
+                                    const cl_device_id*,
+                                    void (CL_CALLBACK * pfn_notify) (const char*, const void*, size_t, void*),
+                                    void *,
+                                    cl_int*);
+
+/* Allocate and initialize a context */
+extern cl_context cl_context_new(struct _cl_context_prop *);
+
+/* Destroy and deallocate a context */
+extern void cl_context_delete(cl_context);
+
+/* Increment the context reference counter */
+extern void cl_context_add_ref(cl_context);
+
+/* Create the command queue from the given context and device */
+extern cl_command_queue cl_context_create_queue(cl_context,
+                                                cl_device_id,
+                                                cl_command_queue_properties,
+                                                cl_int*);
+
+/* Enqueue a ND Range kernel */
+extern cl_int cl_context_ND_kernel(cl_context,
+                                   cl_command_queue,
+                                   cl_kernel,
+                                   cl_uint,
+                                   const size_t*,
+                                   const size_t*,
+                                   const size_t*);
+
+/* Used for allocation */
+extern cl_buffer_mgr cl_context_get_bufmgr(cl_context ctx);
+
+/* Get the internal used kernel */
+extern cl_kernel cl_context_get_static_kernel(cl_context ctx, cl_int index, const char *str_kernel, const char * str_option);
+
+/* Get the internal used kernel from binary*/
+extern cl_kernel cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
+                  const char * str_kernel, size_t size, const char * str_option);
+
+#endif /* __CL_CONTEXT_H__ */
+
diff --git a/src/cl_device_data.h b/src/cl_device_data.h
new file mode 100644
index 0000000..28bd5f0
--- /dev/null
+++ b/src/cl_device_data.h
@@ -0,0 +1,194 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_DEVICE_DATA_H__
+#define __CL_DEVICE_DATA_H__
+
+#define INVALID_CHIP_ID -1 //returned by intel_get_device_id if no device found
+
+#define PCI_CHIP_GM45_GM                0x2A42
+#define PCI_CHIP_IGD_E_G                0x2E02
+#define PCI_CHIP_Q45_G                  0x2E12
+#define PCI_CHIP_G45_G                  0x2E22
+#define PCI_CHIP_G41_G                  0x2E32
+
+#define PCI_CHIP_IGDNG_D_G              0x0042
+#define PCI_CHIP_IGDNG_M_G              0x0046
+
+#define IS_G45(devid)           (devid == PCI_CHIP_IGD_E_G || \
+    devid == PCI_CHIP_Q45_G || \
+    devid == PCI_CHIP_G45_G || \
+    devid == PCI_CHIP_G41_G)
+#define IS_GM45(devid)          (devid == PCI_CHIP_GM45_GM)
+#define IS_G4X(devid)    (IS_G45(devid) || IS_GM45(devid))
+
+#define IS_IGDNG_D(devid)       (devid == PCI_CHIP_IGDNG_D_G)
+#define IS_IGDNG_M(devid)       (devid == PCI_CHIP_IGDNG_M_G)
+#define IS_IGDNG(devid)         (IS_IGDNG_D(devid) || IS_IGDNG_M(devid))
+
+#ifndef PCI_CHIP_SANDYBRIDGE_BRIDGE
+#define PCI_CHIP_SANDYBRIDGE_BRIDGE      0x0100  /* Desktop */
+#define PCI_CHIP_SANDYBRIDGE_GT1         0x0102
+#define PCI_CHIP_SANDYBRIDGE_GT2         0x0112
+#define PCI_CHIP_SANDYBRIDGE_GT2_PLUS    0x0122
+#define PCI_CHIP_SANDYBRIDGE_BRIDGE_M    0x0104  /* Mobile */
+#define PCI_CHIP_SANDYBRIDGE_M_GT1       0x0106
+#define PCI_CHIP_SANDYBRIDGE_M_GT2       0x0116
+#define PCI_CHIP_SANDYBRIDGE_M_GT2_PLUS  0x0126
+#define PCI_CHIP_SANDYBRIDGE_BRIDGE_S  0x0108  /* Server */
+#define PCI_CHIP_SANDYBRIDGE_S_GT  0x010A
+#endif
+
+#define IS_GEN6(devid)                          \
+   (devid == PCI_CHIP_SANDYBRIDGE_GT1 ||        \
+    devid == PCI_CHIP_SANDYBRIDGE_GT2 ||        \
+    devid == PCI_CHIP_SANDYBRIDGE_GT2_PLUS ||   \
+    devid == PCI_CHIP_SANDYBRIDGE_M_GT1 ||      \
+    devid == PCI_CHIP_SANDYBRIDGE_M_GT2 ||      \
+    devid == PCI_CHIP_SANDYBRIDGE_M_GT2_PLUS || \
+    devid == PCI_CHIP_SANDYBRIDGE_S_GT)
+
+#define PCI_CHIP_IVYBRIDGE_GT1          0x0152  /* Desktop */
+#define PCI_CHIP_IVYBRIDGE_GT2          0x0162
+#define PCI_CHIP_IVYBRIDGE_M_GT1        0x0156  /* Mobile */
+#define PCI_CHIP_IVYBRIDGE_M_GT2        0x0166
+#define PCI_CHIP_IVYBRIDGE_S_GT1        0x015a  /* Server */
+#define PCI_CHIP_IVYBRIDGE_S_GT2        0x016a
+
+#define PCI_CHIP_BAYTRAIL_T 0x0F31
+
+#define IS_IVB_GT1(devid)               \
+  (devid == PCI_CHIP_IVYBRIDGE_GT1 ||   \
+   devid == PCI_CHIP_IVYBRIDGE_M_GT1 || \
+   devid == PCI_CHIP_IVYBRIDGE_S_GT1)
+
+#define IS_IVB_GT2(devid)               \
+  (devid == PCI_CHIP_IVYBRIDGE_GT2 ||   \
+   devid == PCI_CHIP_IVYBRIDGE_M_GT2 || \
+   devid == PCI_CHIP_IVYBRIDGE_S_GT2)
+
+#define IS_BAYTRAIL_T(devid)              \
+  (devid == PCI_CHIP_BAYTRAIL_T)
+
+#define IS_IVYBRIDGE(devid) (IS_IVB_GT1(devid) || IS_IVB_GT2(devid) || IS_BAYTRAIL_T(devid))
+#define IS_GEN7(devid)      IS_IVYBRIDGE(devid)
+
+
+#define PCI_CHIP_HASWELL_D1          0x0402 /* GT1 desktop */
+#define PCI_CHIP_HASWELL_D2          0x0412 /* GT2 desktop */
+#define PCI_CHIP_HASWELL_D3          0x0422 /* GT3 desktop */
+#define PCI_CHIP_HASWELL_S1          0x040a /* GT1 server */
+#define PCI_CHIP_HASWELL_S2          0x041a /* GT2 server */
+#define PCI_CHIP_HASWELL_S3          0x042a /* GT3 server */
+#define PCI_CHIP_HASWELL_M1          0x0406 /* GT1 mobile */
+#define PCI_CHIP_HASWELL_M2          0x0416 /* GT2 mobile */
+#define PCI_CHIP_HASWELL_M3          0x0426 /* GT3 mobile */
+#define PCI_CHIP_HASWELL_B1          0x040B /* Haswell GT1 */
+#define PCI_CHIP_HASWELL_B2          0x041B /* Haswell GT2 */
+#define PCI_CHIP_HASWELL_B3          0x042B /* Haswell GT3 */
+#define PCI_CHIP_HASWELL_E1          0x040E /* Haswell GT1 */
+#define PCI_CHIP_HASWELL_E2          0x041E /* Haswell GT2 */
+#define PCI_CHIP_HASWELL_E3          0x042E /* Haswell GT3 */
+
+/* Software Development Vehicle devices. */
+#define PCI_CHIP_HASWELL_SDV_D1      0x0C02 /* SDV GT1 desktop */
+#define PCI_CHIP_HASWELL_SDV_D2      0x0C12 /* SDV GT2 desktop */
+#define PCI_CHIP_HASWELL_SDV_D3      0x0C22 /* SDV GT3 desktop */
+#define PCI_CHIP_HASWELL_SDV_S1      0x0C0A /* SDV GT1 server */
+#define PCI_CHIP_HASWELL_SDV_S2      0x0C1A /* SDV GT2 server */
+#define PCI_CHIP_HASWELL_SDV_S3      0x0C2A /* SDV GT3 server */
+#define PCI_CHIP_HASWELL_SDV_M1      0x0C06 /* SDV GT1 mobile */
+#define PCI_CHIP_HASWELL_SDV_M2      0x0C16 /* SDV GT2 mobile */
+#define PCI_CHIP_HASWELL_SDV_M3      0x0C26 /* SDV GT3 mobile */
+#define PCI_CHIP_HASWELL_SDV_B1      0x0C0B /* SDV GT1 */
+#define PCI_CHIP_HASWELL_SDV_B2      0x0C1B /* SDV GT2 */
+#define PCI_CHIP_HASWELL_SDV_B3      0x0C2B /* SDV GT3 */
+#define PCI_CHIP_HASWELL_SDV_E1      0x0C0E /* SDV GT1 */
+#define PCI_CHIP_HASWELL_SDV_E2      0x0C1E /* SDV GT2 */
+#define PCI_CHIP_HASWELL_SDV_E3      0x0C2E /* SDV GT3 */
+/* Ultrabooks */
+#define PCI_CHIP_HASWELL_ULT_D1      0x0A02 /* ULT GT1 desktop */
+#define PCI_CHIP_HASWELL_ULT_D2      0x0A12 /* ULT GT2 desktop */
+#define PCI_CHIP_HASWELL_ULT_D3      0x0A22 /* ULT GT3 desktop */
+#define PCI_CHIP_HASWELL_ULT_S1      0x0A0A /* ULT GT1 server */
+#define PCI_CHIP_HASWELL_ULT_S2      0x0A1A /* ULT GT2 server */
+#define PCI_CHIP_HASWELL_ULT_S3      0x0A2A /* ULT GT3 server */
+#define PCI_CHIP_HASWELL_ULT_M1      0x0A06 /* ULT GT1 mobile */
+#define PCI_CHIP_HASWELL_ULT_M2      0x0A16 /* ULT GT2 mobile */
+#define PCI_CHIP_HASWELL_ULT_M3      0x0A26 /* ULT GT3 mobile */
+#define PCI_CHIP_HASWELL_ULT_B1      0x0A0B /* ULT GT1 */
+#define PCI_CHIP_HASWELL_ULT_B2      0x0A1B /* ULT GT2 */
+#define PCI_CHIP_HASWELL_ULT_B3      0x0A2B /* ULT GT3 */
+#define PCI_CHIP_HASWELL_ULT_E1      0x0A0E /* ULT GT1 */
+#define PCI_CHIP_HASWELL_ULT_E2      0x0A1E /* ULT GT2 */
+#define PCI_CHIP_HASWELL_ULT_E3      0x0A2E /* ULT GT3 */
+/* CRW */
+#define PCI_CHIP_HASWELL_CRW_D1      0x0D02 /* CRW GT1 desktop */
+#define PCI_CHIP_HASWELL_CRW_D2      0x0D12 /* CRW GT2 desktop */
+#define PCI_CHIP_HASWELL_CRW_D3      0x0D22 /* CRW GT3 desktop */
+#define PCI_CHIP_HASWELL_CRW_S1      0x0D0A /* CRW GT1 server */
+#define PCI_CHIP_HASWELL_CRW_S2      0x0D1A /* CRW GT2 server */
+#define PCI_CHIP_HASWELL_CRW_S3      0x0D2A /* CRW GT3 server */
+#define PCI_CHIP_HASWELL_CRW_M1      0x0D06 /* CRW GT1 mobile */
+#define PCI_CHIP_HASWELL_CRW_M2      0x0D16 /* CRW GT2 mobile */
+#define PCI_CHIP_HASWELL_CRW_M3      0x0D26 /* CRW GT3 mobile */
+#define PCI_CHIP_HASWELL_CRW_B1      0x0D0B /* CRW GT1 */
+#define PCI_CHIP_HASWELL_CRW_B2      0x0D1B /* CRW GT2 */
+#define PCI_CHIP_HASWELL_CRW_B3      0x0D2B /* CRW GT3 */
+#define PCI_CHIP_HASWELL_CRW_E1      0x0D0E /* CRW GT1 */
+#define PCI_CHIP_HASWELL_CRW_E2      0x0D1E /* CRW GT2 */
+#define PCI_CHIP_HASWELL_CRW_E3      0x0D2E /* CRW GT3 */
+
+
+#define IS_HASWELL(devid) (  \
+	(devid) == PCI_CHIP_HASWELL_D1 || (devid) == PCI_CHIP_HASWELL_D2 || \
+	(devid) == PCI_CHIP_HASWELL_D3 || (devid) == PCI_CHIP_HASWELL_S1 || \
+	(devid) == PCI_CHIP_HASWELL_S2 || (devid) == PCI_CHIP_HASWELL_S3 || \
+	(devid) == PCI_CHIP_HASWELL_M1 || (devid) == PCI_CHIP_HASWELL_M2 || \
+	(devid) == PCI_CHIP_HASWELL_M3 || (devid) == PCI_CHIP_HASWELL_B1 || \
+	(devid) == PCI_CHIP_HASWELL_B2 || (devid) == PCI_CHIP_HASWELL_B3 || \
+	(devid) == PCI_CHIP_HASWELL_E1 || (devid) == PCI_CHIP_HASWELL_E2 || \
+	(devid) == PCI_CHIP_HASWELL_E3 || (devid) == PCI_CHIP_HASWELL_SDV_D1 || \
+	(devid) == PCI_CHIP_HASWELL_SDV_D2 || (devid) == PCI_CHIP_HASWELL_SDV_D3 || \
+	(devid) == PCI_CHIP_HASWELL_SDV_S1 || (devid) == PCI_CHIP_HASWELL_SDV_S2 || \
+	(devid) == PCI_CHIP_HASWELL_SDV_S3 || (devid) == PCI_CHIP_HASWELL_SDV_M1 || \
+	(devid) == PCI_CHIP_HASWELL_SDV_M2 || (devid) == PCI_CHIP_HASWELL_SDV_M3 || \
+	(devid) == PCI_CHIP_HASWELL_SDV_B1 || (devid) == PCI_CHIP_HASWELL_SDV_B2 || \
+	(devid) == PCI_CHIP_HASWELL_SDV_B3 || (devid) == PCI_CHIP_HASWELL_SDV_E1 || \
+	(devid) == PCI_CHIP_HASWELL_SDV_E2 || (devid) == PCI_CHIP_HASWELL_SDV_E3 || \
+	(devid) == PCI_CHIP_HASWELL_ULT_D1 || (devid) == PCI_CHIP_HASWELL_ULT_D2 || \
+	(devid) == PCI_CHIP_HASWELL_ULT_D3 || (devid) == PCI_CHIP_HASWELL_ULT_S1 || \
+	(devid) == PCI_CHIP_HASWELL_ULT_S2 || (devid) == PCI_CHIP_HASWELL_ULT_S3 || \
+	(devid) == PCI_CHIP_HASWELL_ULT_M1 || (devid) == PCI_CHIP_HASWELL_ULT_M2 || \
+	(devid) == PCI_CHIP_HASWELL_ULT_M3 || (devid) == PCI_CHIP_HASWELL_ULT_B1 || \
+	(devid) == PCI_CHIP_HASWELL_ULT_B2 || (devid) == PCI_CHIP_HASWELL_ULT_B3 || \
+	(devid) == PCI_CHIP_HASWELL_ULT_E1 || (devid) == PCI_CHIP_HASWELL_ULT_E2 || \
+	(devid) == PCI_CHIP_HASWELL_ULT_E3 || (devid) == PCI_CHIP_HASWELL_CRW_D1 || \
+	(devid) == PCI_CHIP_HASWELL_CRW_D2 || (devid) == PCI_CHIP_HASWELL_CRW_D3 || \
+	(devid) == PCI_CHIP_HASWELL_CRW_S1 || (devid) == PCI_CHIP_HASWELL_CRW_S2 || \
+	(devid) == PCI_CHIP_HASWELL_CRW_S3 || (devid) == PCI_CHIP_HASWELL_CRW_M1 || \
+	(devid) == PCI_CHIP_HASWELL_CRW_M2 || (devid) == PCI_CHIP_HASWELL_CRW_M3 || \
+	(devid) == PCI_CHIP_HASWELL_CRW_B1 || (devid) == PCI_CHIP_HASWELL_CRW_B2 || \
+	(devid) == PCI_CHIP_HASWELL_CRW_B3 || (devid) == PCI_CHIP_HASWELL_CRW_E1 || \
+	(devid) == PCI_CHIP_HASWELL_CRW_E2 || (devid) == PCI_CHIP_HASWELL_CRW_E3)
+
+#define IS_GEN75(devid)  IS_HASWELL(devid)
+
+#endif /* __CL_DEVICE_DATA_H__ */
+
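
The IS_* PCI-ID macros above are what the rest of the patch uses to pick generation-specific code paths. A small, purely illustrative helper (the function name is made up here; the 7/75 numbering matches cl_device_get_version() later in this patch):

    /* Hypothetical helper: map a PCI device id to the Gen version number
     * used elsewhere in this series (7 for Ivybridge/Baytrail, 75 for Haswell). */
    static int gen_version_from_devid(int devid)
    {
      if (IS_GEN75(devid))
        return 75;
      if (IS_GEN7(devid))
        return 7;
      return -1; /* unsupported device */
    }
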
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
new file mode 100644
index 0000000..ee3f2b7
--- /dev/null
+++ b/src/cl_device_id.c
@@ -0,0 +1,617 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_platform_id.h"
+#include "cl_device_id.h"
+#include "cl_internals.h"
+#include "cl_utils.h"
+#include "cl_driver.h"
+#include "cl_device_data.h"
+#include "cl_khr_icd.h"
+#include "cl_thread.h"
+#include "CL/cl.h"
+#include "cl_gbe_loader.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#ifndef CL_VERSION_1_2
+#define CL_DEVICE_BUILT_IN_KERNELS 0x103F
+#endif
+
+static struct _cl_device_id intel_ivb_gt2_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 16,
+  .max_thread_per_unit = 8,
+  .max_work_item_sizes = {1024, 1024, 1024},
+  .max_work_group_size = 1024,
+  .max_clock_frequency = 1000,
+#include "cl_gen7_device.h"
+};
+
+static struct _cl_device_id intel_ivb_gt1_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 6,
+  .max_thread_per_unit = 6,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include "cl_gen7_device.h"
+};
+
+static struct _cl_device_id intel_baytrail_t_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 4,
+  .max_thread_per_unit = 8,
+  .max_work_item_sizes = {512, 512, 512},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include "cl_gen7_device.h"
+};
+
+/* XXX we clone IVB for HSW now */
+static struct _cl_device_id intel_hsw_gt1_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 10,
+  .max_thread_per_unit = 7,
+  .max_work_item_sizes = {1024, 1024, 1024},
+  .max_work_group_size = 1024,
+  .max_clock_frequency = 1000,
+#include "cl_gen75_device.h"
+};
+
+static struct _cl_device_id intel_hsw_gt2_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 20,
+  .max_thread_per_unit = 7,
+  .max_work_item_sizes = {1024, 1024, 1024},
+  .max_work_group_size = 1024,
+  .max_clock_frequency = 1000,
+#include "cl_gen75_device.h"
+};
+
+static struct _cl_device_id intel_hsw_gt3_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 40,
+  .max_thread_per_unit = 7,
+  .max_work_item_sizes = {1024, 1024, 1024},
+  .max_work_group_size = 1024,
+  .max_clock_frequency = 1000,
+#include "cl_gen75_device.h"
+};
+
+LOCAL cl_device_id
+cl_get_gt_device(void)
+{
+  cl_device_id ret = NULL;
+  const int device_id = cl_driver_get_device_id();
+  cl_device_id device = NULL;
+
+#define DECL_INFO_STRING(BREAK, STRUCT, FIELD, STRING) \
+    STRUCT.FIELD = STRING; \
+    STRUCT.JOIN(FIELD,_sz) = sizeof(STRING); \
+    device = &STRUCT; \
+    goto BREAK;
+
+  switch (device_id) {
+    case PCI_CHIP_HASWELL_D1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 Desktop");
+    case PCI_CHIP_HASWELL_D2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 Desktop");
+    case PCI_CHIP_HASWELL_D3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 Desktop");
+    case PCI_CHIP_HASWELL_S1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 Server");
+    case PCI_CHIP_HASWELL_S2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 Server");
+    case PCI_CHIP_HASWELL_S3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 Server");
+    case PCI_CHIP_HASWELL_M1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 Mobile");
+    case PCI_CHIP_HASWELL_M2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 Mobile");
+    case PCI_CHIP_HASWELL_M3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 Mobile");
+    case PCI_CHIP_HASWELL_B1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 reserved");
+    case PCI_CHIP_HASWELL_B2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 reserved");
+    case PCI_CHIP_HASWELL_B3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 reserved");
+    case PCI_CHIP_HASWELL_E1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 reserved");
+    case PCI_CHIP_HASWELL_E2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 reserved");
+    case PCI_CHIP_HASWELL_E3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 reserved");
+    case PCI_CHIP_HASWELL_SDV_D1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT1 Desktop");
+    case PCI_CHIP_HASWELL_SDV_D2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT2 Desktop");
+    case PCI_CHIP_HASWELL_SDV_D3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT3 Desktop");
+    case PCI_CHIP_HASWELL_SDV_S1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT1 Server");
+    case PCI_CHIP_HASWELL_SDV_S2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT2 Server");
+    case PCI_CHIP_HASWELL_SDV_S3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT3 Server");
+    case PCI_CHIP_HASWELL_SDV_M1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT1 Mobile");
+    case PCI_CHIP_HASWELL_SDV_M2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT2 Mobile");
+    case PCI_CHIP_HASWELL_SDV_M3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT3 Mobile");
+    case PCI_CHIP_HASWELL_SDV_B1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT1 reserved");
+    case PCI_CHIP_HASWELL_SDV_B2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT2 reserved");
+    case PCI_CHIP_HASWELL_SDV_B3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT3 reserved");
+    case PCI_CHIP_HASWELL_SDV_E1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT1 reserved");
+    case PCI_CHIP_HASWELL_SDV_E2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT2 reserved");
+    case PCI_CHIP_HASWELL_SDV_E3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
+                                                           " Software Development Vehicle device GT3 reserved");
+    case PCI_CHIP_HASWELL_ULT_D1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 Desktop");
+    case PCI_CHIP_HASWELL_ULT_D2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 Desktop");
+    case PCI_CHIP_HASWELL_ULT_D3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 Desktop");
+    case PCI_CHIP_HASWELL_ULT_S1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 Server");
+    case PCI_CHIP_HASWELL_ULT_S2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 Server");
+    case PCI_CHIP_HASWELL_ULT_S3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 Server");
+    case PCI_CHIP_HASWELL_ULT_M1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 Mobile");
+    case PCI_CHIP_HASWELL_ULT_M2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile");
+    case PCI_CHIP_HASWELL_ULT_M3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 Mobile");
+    case PCI_CHIP_HASWELL_ULT_B1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 reserved");
+    case PCI_CHIP_HASWELL_ULT_B2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 reserved");
+    case PCI_CHIP_HASWELL_ULT_B3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 reserved");
+    case PCI_CHIP_HASWELL_ULT_E1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 reserved");
+    case PCI_CHIP_HASWELL_ULT_E2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 reserved");
+    case PCI_CHIP_HASWELL_ULT_E3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 reserved");
+
+	/* CRW */
+    case PCI_CHIP_HASWELL_CRW_D1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 Desktop");
+    case PCI_CHIP_HASWELL_CRW_D2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 Desktop");
+    case PCI_CHIP_HASWELL_CRW_D3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 Desktop");
+    case PCI_CHIP_HASWELL_CRW_S1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 Server");
+    case PCI_CHIP_HASWELL_CRW_S2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 Server");
+    case PCI_CHIP_HASWELL_CRW_S3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 Server");
+    case PCI_CHIP_HASWELL_CRW_M1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 Mobile");
+    case PCI_CHIP_HASWELL_CRW_M2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 Mobile");
+    case PCI_CHIP_HASWELL_CRW_M3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 Mobile");
+    case PCI_CHIP_HASWELL_CRW_B1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 reserved");
+    case PCI_CHIP_HASWELL_CRW_B2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 reserved");
+    case PCI_CHIP_HASWELL_CRW_B3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 reserved");
+    case PCI_CHIP_HASWELL_CRW_E1:
+      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 reserved");
+    case PCI_CHIP_HASWELL_CRW_E2:
+      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 reserved");
+    case PCI_CHIP_HASWELL_CRW_E3:
+      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 reserved");
+has_break:
+      device->vendor_id = device_id;
+      device->platform = intel_platform;
+      ret = device;
+      break;
+
+    case PCI_CHIP_IVYBRIDGE_GT1:
+      DECL_INFO_STRING(ivb_gt1_break, intel_ivb_gt1_device, name, "Intel(R) HD Graphics IvyBridge GT1");
+    case PCI_CHIP_IVYBRIDGE_M_GT1:
+      DECL_INFO_STRING(ivb_gt1_break, intel_ivb_gt1_device, name, "Intel(R) HD Graphics IvyBridge M GT1");
+    case PCI_CHIP_IVYBRIDGE_S_GT1:
+      DECL_INFO_STRING(ivb_gt1_break, intel_ivb_gt1_device, name, "Intel(R) HD Graphics IvyBridge S GT1");
+ivb_gt1_break:
+      intel_ivb_gt1_device.vendor_id = device_id;
+      intel_ivb_gt1_device.platform = intel_platform;
+      ret = &intel_ivb_gt1_device;
+      break;
+
+    case PCI_CHIP_IVYBRIDGE_GT2:
+      DECL_INFO_STRING(ivb_gt2_break, intel_ivb_gt2_device, name, "Intel(R) HD Graphics IvyBridge GT2");
+    case PCI_CHIP_IVYBRIDGE_M_GT2:
+      DECL_INFO_STRING(ivb_gt2_break, intel_ivb_gt2_device, name, "Intel(R) HD Graphics IvyBridge M GT2");
+    case PCI_CHIP_IVYBRIDGE_S_GT2:
+      DECL_INFO_STRING(ivb_gt2_break, intel_ivb_gt2_device, name, "Intel(R) HD Graphics IvyBridge S GT2");
+ivb_gt2_break:
+      intel_ivb_gt2_device.vendor_id = device_id;
+      intel_ivb_gt2_device.platform = intel_platform;
+      ret = &intel_ivb_gt2_device;
+      break;
+
+    case PCI_CHIP_BAYTRAIL_T:
+      DECL_INFO_STRING(baytrail_t_device_break, intel_baytrail_t_device, name, "Intel(R) HD Graphics Bay Trail-T");
+baytrail_t_device_break:
+      intel_baytrail_t_device.vendor_id = device_id;
+      intel_baytrail_t_device.platform = intel_platform;
+      ret = &intel_baytrail_t_device;
+      break;
+
+    case PCI_CHIP_SANDYBRIDGE_BRIDGE:
+    case PCI_CHIP_SANDYBRIDGE_GT1:
+    case PCI_CHIP_SANDYBRIDGE_GT2:
+    case PCI_CHIP_SANDYBRIDGE_GT2_PLUS:
+    case PCI_CHIP_SANDYBRIDGE_BRIDGE_M:
+    case PCI_CHIP_SANDYBRIDGE_M_GT1:
+    case PCI_CHIP_SANDYBRIDGE_M_GT2:
+    case PCI_CHIP_SANDYBRIDGE_M_GT2_PLUS:
+    case PCI_CHIP_SANDYBRIDGE_BRIDGE_S:
+    case PCI_CHIP_SANDYBRIDGE_S_GT:
+      // Intel(R) HD Graphics SandyBridge not supported yet
+      ret = NULL;
+      break;
+    default:
+      printf("cl_get_gt_device(): error, unknown device: %x\n", device_id);
+  }
+
+  if (!CompilerSupported()) {
+    if (ret != NULL) {
+      ret->compiler_available = CL_FALSE;
+      //ret->linker_available = CL_FALSE;
+      ret->profile = "EMBEDDED_PROFILE";
+      ret->profile_sz = strlen(ret->profile) + 1;
+    }
+  }
+
+  return ret;
+}
+
+LOCAL cl_int
+cl_get_device_ids(cl_platform_id    platform,
+                  cl_device_type    device_type,
+                  cl_uint           num_entries,
+                  cl_device_id *    devices,
+                  cl_uint *         num_devices)
+{
+  cl_device_id device;
+
+  /* Do we have a usable device? */
+  device = cl_get_gt_device();
+  if (!device) {
+    if (num_devices)
+      *num_devices = 0;
+    if (devices)
+      *devices = 0;
+    return CL_DEVICE_NOT_FOUND;
+  } else {
+    if (num_devices)
+      *num_devices = 1;
+    if (devices) {
+      *devices = device;
+      (*devices)->extensions = intel_platform->extensions;
+      (*devices)->extensions_sz = intel_platform->extensions_sz;
+    }
+    return CL_SUCCESS;
+  }
+}
+
+#define DECL_FIELD(CASE,FIELD)                                      \
+  case JOIN(CL_DEVICE_,CASE):                                       \
+    if (param_value_size_ret) {                                     \
+      *param_value_size_ret = sizeof device->FIELD;                 \
+      if (!param_value)                                             \
+        return CL_SUCCESS;                                          \
+    }                                                               \
+    if (param_value_size < sizeof device->FIELD)                    \
+      return CL_INVALID_VALUE;                                      \
+    memcpy(param_value, &device->FIELD, sizeof device->FIELD);      \
+    return CL_SUCCESS;
+
+#define DECL_STRING_FIELD(CASE,FIELD)                               \
+  case JOIN(CL_DEVICE_,CASE):                                       \
+    if (param_value_size_ret) {                                     \
+      *param_value_size_ret = device->JOIN(FIELD,_sz);              \
+      if (!param_value)                                             \
+        return CL_SUCCESS;                                          \
+    }                                                               \
+    if (param_value_size < device->JOIN(FIELD,_sz))                 \
+      return CL_INVALID_VALUE;                                      \
+    memcpy(param_value, device->FIELD, device->JOIN(FIELD,_sz));    \
+    return CL_SUCCESS;
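
To make the macro machinery easier to follow, this is roughly what a single DECL_FIELD expansion looks like once JOIN() has pasted the token (hand-expanded for illustration, not generated code):

    /* DECL_FIELD(VENDOR_ID, vendor_id) expands to approximately: */
    case CL_DEVICE_VENDOR_ID:
      if (param_value_size_ret) {
        *param_value_size_ret = sizeof device->vendor_id;
        if (!param_value)
          return CL_SUCCESS;
      }
      if (param_value_size < sizeof device->vendor_id)
        return CL_INVALID_VALUE;
      memcpy(param_value, &device->vendor_id, sizeof device->vendor_id);
      return CL_SUCCESS;
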
+
+LOCAL cl_int
+cl_get_device_info(cl_device_id     device,
+                   cl_device_info   param_name,
+                   size_t           param_value_size,
+                   void *           param_value,
+                   size_t *         param_value_size_ret)
+{
+  if (UNLIKELY(device != &intel_ivb_gt1_device &&
+               device != &intel_ivb_gt2_device &&
+               device != &intel_baytrail_t_device &&
+               device != &intel_hsw_gt1_device &&
+               device != &intel_hsw_gt2_device &&
+               device != &intel_hsw_gt3_device
+               ))
+    return CL_INVALID_DEVICE;
+
+  /* Find the correct parameter */
+  switch (param_name) {
+    DECL_FIELD(TYPE, device_type)
+    DECL_FIELD(VENDOR_ID, vendor_id)
+    DECL_FIELD(MAX_COMPUTE_UNITS, max_compute_unit)
+    DECL_FIELD(MAX_WORK_ITEM_DIMENSIONS, max_work_item_dimensions)
+    DECL_FIELD(MAX_WORK_ITEM_SIZES, max_work_item_sizes)
+    DECL_FIELD(MAX_WORK_GROUP_SIZE, max_work_group_size)
+    DECL_FIELD(PREFERRED_VECTOR_WIDTH_CHAR, preferred_vector_width_char)
+    DECL_FIELD(PREFERRED_VECTOR_WIDTH_SHORT, preferred_vector_width_short)
+    DECL_FIELD(PREFERRED_VECTOR_WIDTH_INT, preferred_vector_width_int)
+    DECL_FIELD(PREFERRED_VECTOR_WIDTH_LONG, preferred_vector_width_long)
+    DECL_FIELD(PREFERRED_VECTOR_WIDTH_FLOAT, preferred_vector_width_float)
+    DECL_FIELD(PREFERRED_VECTOR_WIDTH_DOUBLE, preferred_vector_width_double)
+    DECL_FIELD(PREFERRED_VECTOR_WIDTH_HALF, preferred_vector_width_half)
+    DECL_FIELD(NATIVE_VECTOR_WIDTH_CHAR, native_vector_width_char)
+    DECL_FIELD(NATIVE_VECTOR_WIDTH_SHORT, native_vector_width_short)
+    DECL_FIELD(NATIVE_VECTOR_WIDTH_INT, native_vector_width_int)
+    DECL_FIELD(NATIVE_VECTOR_WIDTH_LONG, native_vector_width_long)
+    DECL_FIELD(NATIVE_VECTOR_WIDTH_FLOAT, native_vector_width_float)
+    DECL_FIELD(NATIVE_VECTOR_WIDTH_DOUBLE, native_vector_width_double)
+    DECL_FIELD(NATIVE_VECTOR_WIDTH_HALF, native_vector_width_half)
+    DECL_FIELD(MAX_CLOCK_FREQUENCY, max_clock_frequency)
+    DECL_FIELD(ADDRESS_BITS, address_bits)
+    DECL_FIELD(MAX_MEM_ALLOC_SIZE, max_mem_alloc_size)
+    DECL_FIELD(IMAGE_SUPPORT, image_support)
+    DECL_FIELD(MAX_READ_IMAGE_ARGS, max_read_image_args)
+    DECL_FIELD(MAX_WRITE_IMAGE_ARGS, max_write_image_args)
+    DECL_FIELD(IMAGE_MAX_ARRAY_SIZE, image_max_array_size)
+    DECL_FIELD(IMAGE2D_MAX_WIDTH, image2d_max_width)
+    DECL_FIELD(IMAGE2D_MAX_HEIGHT, image2d_max_height)
+    DECL_FIELD(IMAGE3D_MAX_WIDTH, image3d_max_width)
+    DECL_FIELD(IMAGE3D_MAX_HEIGHT, image3d_max_height)
+    DECL_FIELD(IMAGE3D_MAX_DEPTH, image3d_max_depth)
+    DECL_FIELD(MAX_SAMPLERS, max_samplers)
+    DECL_FIELD(MAX_PARAMETER_SIZE, max_parameter_size)
+    DECL_FIELD(MEM_BASE_ADDR_ALIGN, mem_base_addr_align)
+    DECL_FIELD(MIN_DATA_TYPE_ALIGN_SIZE, min_data_type_align_size)
+    DECL_FIELD(SINGLE_FP_CONFIG, single_fp_config)
+    DECL_FIELD(DOUBLE_FP_CONFIG, double_fp_config)
+    DECL_FIELD(GLOBAL_MEM_CACHE_TYPE, global_mem_cache_type)
+    DECL_FIELD(GLOBAL_MEM_CACHELINE_SIZE, global_mem_cache_line_size)
+    DECL_FIELD(GLOBAL_MEM_CACHE_SIZE, global_mem_cache_size)
+    DECL_FIELD(GLOBAL_MEM_SIZE, global_mem_size)
+    DECL_FIELD(MAX_CONSTANT_BUFFER_SIZE, max_constant_buffer_size)
+    DECL_FIELD(IMAGE_MAX_BUFFER_SIZE, image_mem_size)
+    DECL_FIELD(MAX_CONSTANT_ARGS, max_constant_args)
+    DECL_FIELD(LOCAL_MEM_TYPE, local_mem_type)
+    DECL_FIELD(LOCAL_MEM_SIZE, local_mem_size)
+    DECL_FIELD(ERROR_CORRECTION_SUPPORT, error_correction_support)
+    DECL_FIELD(HOST_UNIFIED_MEMORY, host_unified_memory)
+    DECL_FIELD(PROFILING_TIMER_RESOLUTION, profiling_timer_resolution)
+    DECL_FIELD(ENDIAN_LITTLE, endian_little)
+    DECL_FIELD(AVAILABLE, available)
+    DECL_FIELD(COMPILER_AVAILABLE, compiler_available)
+    DECL_FIELD(LINKER_AVAILABLE, linker_available)
+    DECL_FIELD(EXECUTION_CAPABILITIES, execution_capabilities)
+    DECL_FIELD(QUEUE_PROPERTIES, queue_properties)
+    DECL_FIELD(PLATFORM, platform)
+    DECL_FIELD(PRINTF_BUFFER_SIZE, printf_buffer_size)
+    DECL_FIELD(PREFERRED_INTEROP_USER_SYNC, interop_user_sync)
+    DECL_STRING_FIELD(NAME, name)
+    DECL_STRING_FIELD(VENDOR, vendor)
+    DECL_STRING_FIELD(VERSION, version)
+    DECL_STRING_FIELD(PROFILE, profile)
+    DECL_STRING_FIELD(OPENCL_C_VERSION, opencl_c_version)
+    DECL_STRING_FIELD(EXTENSIONS, extensions)
+    DECL_STRING_FIELD(BUILT_IN_KERNELS, built_in_kernels)
+    DECL_FIELD(PARENT_DEVICE, parent_device)
+    DECL_FIELD(PARTITION_MAX_SUB_DEVICES, partition_max_sub_device)
+    DECL_FIELD(PARTITION_PROPERTIES, partition_property)
+    DECL_FIELD(PARTITION_AFFINITY_DOMAIN, affinity_domain)
+    DECL_FIELD(PARTITION_TYPE, partition_type)
+    DECL_FIELD(REFERENCE_COUNT, device_reference_count)
+
+    case CL_DRIVER_VERSION:
+      if (param_value_size_ret) {
+        *param_value_size_ret = device->driver_version_sz;
+        if (!param_value)
+          return CL_SUCCESS;
+      }
+      if (param_value_size < device->driver_version_sz)
+        return CL_INVALID_VALUE;
+      memcpy(param_value, device->driver_version, device->driver_version_sz);
+      return CL_SUCCESS;
+
+    default: return CL_INVALID_VALUE;
+  };
+}
+
+LOCAL cl_int
+cl_device_get_version(cl_device_id device, cl_int *ver)
+{
+  if (UNLIKELY(device != &intel_ivb_gt1_device &&
+               device != &intel_ivb_gt2_device &&
+               device != &intel_baytrail_t_device &&
+               device != &intel_hsw_gt1_device &&
+               device != &intel_hsw_gt2_device &&
+               device != &intel_hsw_gt3_device))
+    return CL_INVALID_DEVICE;
+  if (ver == NULL)
+    return CL_SUCCESS;
+  if (device == &intel_ivb_gt1_device || 
+      device == &intel_ivb_gt2_device ||
+      device == &intel_baytrail_t_device) {
+    *ver = 7;
+  } else if (device == &intel_hsw_gt1_device || device == &intel_hsw_gt2_device
+        || device == &intel_hsw_gt3_device) {
+    *ver = 75;
+  } else
+    return CL_INVALID_VALUE;
+
+  return CL_SUCCESS;
+}
+#undef DECL_FIELD
+
+#define _DECL_FIELD(FIELD)                                 \
+      if (param_value && param_value_size < sizeof(FIELD)) \
+        return CL_INVALID_VALUE;                           \
+      if (param_value_size_ret != NULL)                    \
+        *param_value_size_ret = sizeof(FIELD);             \
+      if (param_value)                                     \
+        memcpy(param_value, &FIELD, sizeof(FIELD));        \
+      return CL_SUCCESS;
+
+#define DECL_FIELD(CASE,FIELD)                             \
+  case JOIN(CL_KERNEL_,CASE):                              \
+  _DECL_FIELD(FIELD)
+
+#include "cl_kernel.h"
+#include "cl_program.h"
+static int
+cl_check_builtin_kernel_dimension(cl_kernel kernel, cl_device_id device)
+{
+  const char * n = cl_kernel_get_name(kernel);
+  const char * builtin_kernels_2d = "__cl_copy_image_2d_to_2d;__cl_copy_image_2d_to_buffer;__cl_copy_buffer_to_image_2d;__cl_fill_image_2d;__cl_fill_image_2d_array;";
+  const char * builtin_kernels_3d = "__cl_copy_image_3d_to_2d;__cl_copy_image_2d_to_3d;__cl_copy_image_3d_to_3d;__cl_copy_image_3d_to_buffer;__cl_copy_buffer_to_image_3d;__cl_fill_image_3d";
+  if (!strstr(device->built_in_kernels, n)) {
+    return 0;
+  } else if (strstr(builtin_kernels_2d, n)) {
+    return 2;
+  } else if (strstr(builtin_kernels_3d, n)) {
+    return 3;
+  } else
+    return 1;
+}
+
+LOCAL size_t
+cl_get_kernel_max_wg_sz(cl_kernel kernel)
+{
+  size_t work_group_size;
+  int simd_width = interp_kernel_get_simd_width(kernel->opaque);
+  int vendor_id = kernel->program->ctx->device->vendor_id;
+  if (!interp_kernel_use_slm(kernel->opaque)) {
+    if (!IS_BAYTRAIL_T(vendor_id) || simd_width == 16)
+      work_group_size = simd_width * 64;
+    else
+      work_group_size = kernel->program->ctx->device->max_compute_unit *
+                        kernel->program->ctx->device->max_thread_per_unit * simd_width;
+  } else
+    work_group_size = kernel->program->ctx->device->max_work_group_size /
+                      (16 / simd_width);
+  return work_group_size;
+}
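
To make the work-group-size formula above concrete: with the IVB GT2 device table (16 compute units, 8 threads per unit, max_work_group_size 1024), a SIMD16 kernel that uses no SLM gets 16 * 64 = 1024 work items; a SIMD8 kernel without SLM on Baytrail-T (4 units, 8 threads per unit) gets 4 * 8 * 8 = 256; and a SIMD8 kernel that does use SLM is capped at 1024 / (16 / 8) = 512. These numbers are simply the formula evaluated against the static device tables earlier in this file.
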
+
+LOCAL cl_int
+cl_get_kernel_workgroup_info(cl_kernel kernel,
+                             cl_device_id device,
+                             cl_kernel_work_group_info param_name,
+                             size_t param_value_size,
+                             void* param_value,
+                             size_t* param_value_size_ret)
+{
+  int err = CL_SUCCESS;
+  int dimension = 0;
+  if (UNLIKELY(device != &intel_ivb_gt1_device &&
+               device != &intel_ivb_gt2_device &&
+               device != &intel_baytrail_t_device &&
+               device != &intel_hsw_gt1_device &&
+               device != &intel_hsw_gt2_device &&
+               device != &intel_hsw_gt3_device))
+    return CL_INVALID_DEVICE;
+
+  CHECK_KERNEL(kernel);
+  switch (param_name) {
+    case CL_KERNEL_WORK_GROUP_SIZE:
+    {
+      if (param_value && param_value_size < sizeof(size_t))
+        return CL_INVALID_VALUE;
+      if (param_value_size_ret != NULL)
+        *param_value_size_ret = sizeof(size_t);
+      if (param_value) {
+        size_t work_group_size = cl_get_kernel_max_wg_sz(kernel);
+        *(size_t*)param_value = work_group_size;
+        return CL_SUCCESS;
+      }
+    }
+    DECL_FIELD(PREFERRED_WORK_GROUP_SIZE_MULTIPLE, device->preferred_wg_sz_mul)
+    case CL_KERNEL_LOCAL_MEM_SIZE:
+    {
+      size_t local_mem_sz =  interp_kernel_get_slm_size(kernel->opaque) + kernel->local_mem_sz;
+      _DECL_FIELD(local_mem_sz)
+    }
+    DECL_FIELD(COMPILE_WORK_GROUP_SIZE, kernel->compile_wg_sz)
+    DECL_FIELD(PRIVATE_MEM_SIZE, kernel->stack_size)
+    case CL_KERNEL_GLOBAL_WORK_SIZE:
+      dimension = cl_check_builtin_kernel_dimension(kernel, device);
+      if ( !dimension ) return CL_INVALID_VALUE;
+      if (param_value_size_ret != NULL)
+        *param_value_size_ret = sizeof(device->max_1d_global_work_sizes);
+      if (param_value) {
+        if (dimension == 1) {
+          memcpy(param_value, device->max_1d_global_work_sizes, sizeof(device->max_1d_global_work_sizes));
+        } else if (dimension == 2) {
+          memcpy(param_value, device->max_2d_global_work_sizes, sizeof(device->max_2d_global_work_sizes));
+        } else if (dimension == 3) {
+          memcpy(param_value, device->max_3d_global_work_sizes, sizeof(device->max_3d_global_work_sizes));
+        } else
+          return CL_INVALID_VALUE;
+
+        return CL_SUCCESS;
+      }
+    default:
+      return CL_INVALID_VALUE;
+  };
+
+error:
+  return err;
+}
+
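
From the application side, the DECL_FIELD/DECL_STRING_FIELD handlers above serve the usual two-call clGetDeviceInfo() pattern: first query the size, then fetch the value. A short host-side sketch (standard OpenCL API, shown only for context):

    #include <CL/cl.h>
    #include <stdio.h>
    #include <stdlib.h>

    static void print_device_name(cl_device_id dev)
    {
      size_t sz = 0;

      /* First call: param_value == NULL, only the required size is returned. */
      clGetDeviceInfo(dev, CL_DEVICE_NAME, 0, NULL, &sz);

      char *name = malloc(sz);
      if (name == NULL)
        return;

      /* Second call: fetch the string into a buffer of exactly that size. */
      clGetDeviceInfo(dev, CL_DEVICE_NAME, sz, name, NULL);
      printf("device name: %s\n", name);
      free(name);
    }
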
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
new file mode 100644
index 0000000..31bce47
--- /dev/null
+++ b/src/cl_device_id.h
@@ -0,0 +1,145 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_DEVICE_ID_H__
+#define __CL_DEVICE_ID_H__
+
+/* Store complete information about the device */
+struct _cl_device_id {
+  DEFINE_ICD(dispatch)
+  cl_device_type device_type;
+  cl_uint  vendor_id;
+  cl_uint  max_compute_unit;               // maximum EU number
+  cl_uint  max_thread_per_unit;            // maximum threads per EU.
+  cl_uint  max_work_item_dimensions;       // should be 3.
+  size_t   max_work_item_sizes[3];         // equal to maximum work group size.
+  size_t   max_work_group_size;            // maximum work group size under simd16 mode.
+  size_t   max_1d_global_work_sizes[3];       // maximum 1d global work size for builtin kernels.
+  size_t   max_2d_global_work_sizes[3];       // maximum 2d global work size for builtin kernels.
+  size_t   max_3d_global_work_sizes[3];       // maximum 3d global work size for builtin kernels.
+  cl_uint  preferred_vector_width_char;
+  cl_uint  preferred_vector_width_short;
+  cl_uint  preferred_vector_width_int;
+  cl_uint  preferred_vector_width_long;
+  cl_uint  preferred_vector_width_float;
+  cl_uint  preferred_vector_width_double;
+  cl_uint  preferred_vector_width_half;
+  cl_uint  native_vector_width_char;
+  cl_uint  native_vector_width_short;
+  cl_uint  native_vector_width_int;
+  cl_uint  native_vector_width_long;
+  cl_uint  native_vector_width_float;
+  cl_uint  native_vector_width_double;
+  cl_uint  native_vector_width_half;
+  cl_uint  max_clock_frequency;
+  cl_uint  address_bits;
+  cl_ulong max_mem_alloc_size;
+  cl_bool  image_support;
+  cl_uint  max_read_image_args;
+  cl_uint  max_write_image_args;
+  size_t   image2d_max_width;
+  size_t   image_max_array_size;
+  size_t   image2d_max_height;
+  size_t   image3d_max_width;
+  size_t   image3d_max_height;
+  size_t   image3d_max_depth;
+  cl_ulong image_mem_size;
+  cl_uint  max_samplers;
+  size_t   max_parameter_size;
+  cl_uint  mem_base_addr_align;
+  cl_uint  min_data_type_align_size;
+  cl_device_fp_config single_fp_config;
+  cl_device_fp_config double_fp_config;
+  cl_device_mem_cache_type global_mem_cache_type;
+  cl_uint  global_mem_cache_line_size;
+  cl_ulong global_mem_cache_size;
+  cl_ulong global_mem_size;
+  cl_ulong max_constant_buffer_size;
+  cl_uint  max_constant_args;
+  cl_device_local_mem_type local_mem_type;
+  cl_ulong local_mem_size;
+  cl_ulong scratch_mem_size;
+  cl_bool  error_correction_support;
+  cl_bool  host_unified_memory;
+  size_t   profiling_timer_resolution;
+  cl_bool  endian_little;
+  cl_bool  available;
+  cl_bool  compiler_available;
+  cl_bool  linker_available;
+  cl_device_exec_capabilities execution_capabilities;
+  cl_command_queue_properties queue_properties;
+  cl_platform_id platform;
+  size_t printf_buffer_size;
+  cl_bool interop_user_sync;
+  const char *name;
+  const char *vendor;
+  const char *version;
+  const char *profile;
+  const char *opencl_c_version;
+  const char *extensions;
+  const char *driver_version;
+  const char *built_in_kernels;
+  size_t name_sz;
+  size_t vendor_sz;
+  size_t version_sz;
+  size_t profile_sz;
+  size_t opencl_c_version_sz;
+  size_t extensions_sz;
+  size_t driver_version_sz;
+  size_t built_in_kernels_sz;
+  /* Kernel specific info that we're assigning statically */
+  size_t preferred_wg_sz_mul;
+  /* SubDevice specific info */
+  cl_device_id parent_device;
+  cl_uint      partition_max_sub_device;
+  cl_device_partition_property partition_property[3];
+  cl_device_affinity_domain    affinity_domain;
+  cl_device_partition_property partition_type[3];
+  cl_uint      device_reference_count;
+};
+
+/* Get a device from the given platform */
+extern cl_int cl_get_device_ids(cl_platform_id    platform,
+                                cl_device_type    device_type,
+                                cl_uint           num_entries,
+                                cl_device_id *    devices,
+                                cl_uint *         num_devices);
+
+/* Get the intel GPU device we currently have in this machine (if any) */
+extern cl_device_id cl_get_gt_device(void);
+
+/* Provide info about the device */
+extern cl_int cl_get_device_info(cl_device_id     device,
+                                 cl_device_info   param_name,
+                                 size_t           param_value_size,
+                                 void *           param_value,
+                                 size_t *         param_value_size_ret);
+
+extern cl_int cl_get_kernel_workgroup_info(cl_kernel kernel,
+                                           cl_device_id     device,
+                                           cl_kernel_work_group_info   param_name,
+                                           size_t           param_value_size,
+                                           void *           param_value,
+                                           size_t *         param_value_size_ret);
+/* Returns the Gen device ID */
+extern cl_int cl_device_get_version(cl_device_id device, cl_int *ver);
+extern size_t cl_get_kernel_max_wg_sz(cl_kernel);
+
+#endif /* __CL_DEVICE_ID_H__ */
+
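
cl_get_device_info() above mirrors the public clGetDeviceInfo() entry point and follows the usual OpenCL convention of reporting the required buffer size through param_value_size_ret. Purely for illustration (this snippet uses only the public OpenCL host API, not code from this tree), the standard two-step query pattern looks roughly like this:

    /* Illustrative two-step CL_DEVICE_NAME query: ask for the size first,
     * then allocate and fetch the value. */
    #include <CL/cl.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
      cl_platform_id platform;
      cl_device_id device;
      size_t sz = 0;
      char *name;

      if (clGetPlatformIDs(1, &platform, NULL) != CL_SUCCESS ||
          clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL) != CL_SUCCESS)
        return 1;

      clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &sz);   /* size only */
      if ((name = malloc(sz)) == NULL)
        return 1;
      clGetDeviceInfo(device, CL_DEVICE_NAME, sz, name, NULL); /* actual value */
      printf("device: %s\n", name);
      free(name);
      return 0;
    }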
diff --git a/src/cl_driver.cpp b/src/cl_driver.cpp
new file mode 100644
index 0000000..19ac4ae
--- /dev/null
+++ b/src/cl_driver.cpp
@@ -0,0 +1,40 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+extern "C" {
+#include "intel/intel_driver.h"
+#include "cl_utils.h"
+#include <stdlib.h>
+#include <string.h>
+}
+
+namespace
+{
+  /*! Use C++ pre-main (static construction) to initialize the callbacks */
+  struct OCLDriverCallBackInitializer
+  {
+    OCLDriverCallBackInitializer(void) {
+      intel_setup_callbacks();
+    }
+  };
+
+  /*! Set the callbacks at pre-main time */
+  static OCLDriverCallBackInitializer cbInitializer;
+} /* namespace */
+
diff --git a/src/cl_driver.h b/src/cl_driver.h
new file mode 100644
index 0000000..9cdba98
--- /dev/null
+++ b/src/cl_driver.h
@@ -0,0 +1,383 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_DRIVER_H__
+#define __CL_DRIVER_H__
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "cl_driver_type.h"
+/* Various limitations we should eventually remove */
+#define GEN_MAX_SURFACES 256
+#define GEN_MAX_SAMPLERS 16
+
+/**************************************************************************
+ * cl_driver:
+ * Hide buffer allocation / deallocation and the other low-level operations
+ * behind callbacks. This makes it easier to plug in a software performance
+ * simulator and keeps the HW-specific and simulator-specific code minimal.
+ **************************************************************************/
+/* Create a new driver */
+typedef cl_driver (cl_driver_new_cb)(cl_context_prop);
+extern cl_driver_new_cb *cl_driver_new;
+
+/* Delete the driver */
+typedef void (cl_driver_delete_cb)(cl_driver);
+extern cl_driver_delete_cb *cl_driver_delete;
+
+/* Get the buffer manager from the driver */
+typedef cl_buffer_mgr (cl_driver_get_bufmgr_cb)(cl_driver);
+extern cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr;
+
+/* Get the Gen version from the driver */
+typedef uint32_t (cl_driver_get_ver_cb)(cl_driver);
+extern cl_driver_get_ver_cb *cl_driver_get_ver;
+
+/**************************************************************************
+ * GPGPU command streamer
+ **************************************************************************/
+/* Describe texture tiling */
+typedef enum cl_gpgpu_tiling {
+  GPGPU_NO_TILE = 0,
+  GPGPU_TILE_X  = 1,
+  GPGPU_TILE_Y  = 2,
+} cl_gpgpu_tiling;
+
+/* Cache control options for gen7 */
+typedef enum cl_cache_control {
+  cc_gtt      = 0x0,
+  cc_l3       = 0x1,
+  cc_llc      = 0x2,
+  cc_llc_l3   = 0x3
+} cl_cache_control;
+
+/* L3 Cache control options for gen75 */
+typedef enum cl_l3_cache_control {
+  l3cc_uc      = 0x0,
+  l3cc_ec       = 0x1
+} cl_l3_cache_control;
+
+/* LLCCC Cache control options for gen75 */
+typedef enum cl_llccc_cache_control {
+  llccc_pte      = 0x0<<1,
+  llccc_uc       = 0x1<<1,
+  llccc_ec       = 0x2<<1,
+  llccc_ucllc    = 0x3<<1
+} cl_llccc_cache_control;
+
+typedef enum gpu_command_status {
+  command_queued    = 3,
+  command_submitted = 2,
+  command_running   = 1,
+  command_complete  = 0
+} gpu_command_status;
+
+/* Use this structure to bind kernels in the gpgpu state */
+typedef struct cl_gpgpu_kernel {
+  const char *name;        /* kernel name and bo name */
+  uint32_t grf_blocks;     /* register blocks the kernel wants (in blocks of 8 registers) */
+  uint32_t curbe_sz;         /* total size of all curbes */
+  cl_buffer bo;            /* kernel code in the proper addr space */
+  int32_t barrierID;       /* barrierID for _this_ kernel */
+  uint32_t use_slm:1;      /* For gen7 (automatic barrier management) */
+  uint32_t thread_n:15;    /* For gen7 (automatic barrier management) */
+  uint32_t slm_sz;         /* For gen7 (automatic SLM allocation) */
+} cl_gpgpu_kernel;
+
+/* Create a new gpgpu state */
+typedef cl_gpgpu (cl_gpgpu_new_cb)(cl_driver);
+extern cl_gpgpu_new_cb *cl_gpgpu_new;
+
+/* Delete the gpgpu state */
+typedef void (cl_gpgpu_delete_cb)(cl_gpgpu);
+extern cl_gpgpu_delete_cb *cl_gpgpu_delete;
+
+/* Synchronize the GPU with the CPU */
+typedef void (cl_gpgpu_sync_cb)(void*);
+extern cl_gpgpu_sync_cb *cl_gpgpu_sync;
+
+/* Bind a regular unformatted buffer */
+typedef void (cl_gpgpu_bind_buf_cb)(cl_gpgpu, cl_buffer, uint32_t offset, uint32_t internal_offset, uint32_t size, uint8_t bti);
+extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
+
+/* Bind samplers defined in both the kernel and the kernel args. */
+typedef void (cl_gpgpu_bind_sampler_cb)(cl_gpgpu, uint32_t *samplers, size_t sampler_sz);
+extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler;
+
+/* Get the default cache control value. */
+typedef uint32_t (cl_gpgpu_get_cache_ctrl_cb)();
+extern cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl;
+
+/* Set a 2d texture */
+typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state,
+                                      uint32_t id,
+                                      cl_buffer obj_bo,
+                                      uint32_t obj_bo_offset,
+                                      uint32_t format,
+                                      uint32_t type,
+                                      int32_t w,
+                                      int32_t h,
+                                      int32_t depth,
+                                      int pitch,
+                                      cl_gpgpu_tiling tiling);
+
+extern cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image;
+
+/* Setup a stack */
+typedef void (cl_gpgpu_set_stack_cb)(cl_gpgpu, uint32_t offset, uint32_t size, uint32_t cchint);
+extern cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack;
+
+/* Setup scratch */
+typedef int (cl_gpgpu_set_scratch_cb)(cl_gpgpu, uint32_t per_thread_size);
+extern cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch;
+
+/* Configure internal state */
+typedef int (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry, int profiling);
+extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
+
+/* Set the buffer object where performance counters are reported */
+typedef void (cl_gpgpu_set_perf_counters_cb)(cl_gpgpu, cl_buffer perf);
+extern cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters;
+
+/* Fills current curbe buffer with data */
+typedef int (cl_gpgpu_upload_curbes_cb)(cl_gpgpu, const void* data, uint32_t size);
+extern cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes;
+
+typedef cl_buffer (cl_gpgpu_alloc_constant_buffer_cb)(cl_gpgpu, uint32_t size, uint8_t bti);
+extern cl_gpgpu_alloc_constant_buffer_cb *cl_gpgpu_alloc_constant_buffer;
+
+/* Setup all indirect states */
+typedef void (cl_gpgpu_states_setup_cb)(cl_gpgpu, cl_gpgpu_kernel *kernel);
+extern cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup;
+
+/* Upload the constant samplers as specified inside the OCL kernel */
+typedef void (cl_gpgpu_upload_samplers_cb)(cl_gpgpu *state, const void *data, uint32_t n);
+extern cl_gpgpu_upload_samplers_cb *cl_gpgpu_upload_samplers;
+
+/* Set a sampler */
+typedef void (cl_gpgpu_set_sampler_cb)(cl_gpgpu, uint32_t index, uint32_t non_normalized);
+extern cl_gpgpu_set_sampler_cb *cl_gpgpu_set_sampler;
+
+/* Allocate the batch buffer and return the BO used for the batch buffer */
+typedef int (cl_gpgpu_batch_reset_cb)(cl_gpgpu, size_t sz);
+extern cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset;
+
+/* Atomic begin, pipeline select, urb, pipeline state and constant buffer */
+typedef void (cl_gpgpu_batch_start_cb)(cl_gpgpu);
+extern cl_gpgpu_batch_start_cb *cl_gpgpu_batch_start;
+
+/* atomic end with possibly inserted flush */
+typedef void (cl_gpgpu_batch_end_cb)(cl_gpgpu, int32_t flush_mode);
+extern cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end;
+
+/* Flush the command buffer */
+typedef void (cl_gpgpu_flush_cb)(cl_gpgpu);
+extern cl_gpgpu_flush_cb *cl_gpgpu_flush;
+
+/* Create a new event for a batch buffer */
+typedef cl_gpgpu_event (cl_gpgpu_event_new_cb)(cl_gpgpu);
+extern cl_gpgpu_event_new_cb *cl_gpgpu_event_new;
+
+/* update the batch buffer of this event */
+typedef int (cl_gpgpu_event_update_status_cb)(cl_gpgpu_event, int);
+extern cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status;
+
+/* flush the batch buffer of this event */
+typedef void (cl_gpgpu_event_flush_cb)(cl_gpgpu_event);
+extern cl_gpgpu_event_flush_cb *cl_gpgpu_event_flush;
+
+/* Cancel execution of this event's batch buffer */
+typedef void (cl_gpgpu_event_cancel_cb)(cl_gpgpu_event);
+extern cl_gpgpu_event_cancel_cb *cl_gpgpu_event_cancel;
+
+/* delete a gpgpu event */
+typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event);
+extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete;
+
+/* Get an event's execution timestamp */
+typedef void (cl_gpgpu_event_get_exec_timestamp_cb)(cl_gpgpu, cl_gpgpu_event, int, uint64_t*);
+extern cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp;
+
+/* Get current GPU time stamp */
+typedef void (cl_gpgpu_event_get_gpu_cur_timestamp_cb)(cl_gpgpu, uint64_t*);
+extern cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp;
+
+/* Get current batch buffer handle */
+typedef void* (cl_gpgpu_ref_batch_buf_cb)(cl_gpgpu);
+extern cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf;
+
+/* Release a batch buffer handle */
+typedef void (cl_gpgpu_unref_batch_buf_cb)(void*);
+extern cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf;
+
+/* Set the printf buffer */
+typedef int (cl_gpgpu_set_printf_buffer_cb)(cl_gpgpu, uint32_t, uint32_t, uint32_t, uint8_t);
+extern cl_gpgpu_set_printf_buffer_cb *cl_gpgpu_set_printf_buffer;
+
+/* Get the printf buffer offset in the aperture */
+typedef unsigned long (cl_gpgpu_reloc_printf_buffer_cb)(cl_gpgpu, uint32_t, uint32_t);
+extern cl_gpgpu_reloc_printf_buffer_cb *cl_gpgpu_reloc_printf_buffer;
+
+/* map the printf buffer */
+typedef void* (cl_gpgpu_map_printf_buffer_cb)(cl_gpgpu, uint32_t);
+extern cl_gpgpu_map_printf_buffer_cb *cl_gpgpu_map_printf_buffer;
+
+/* unmap the printf buffer */
+typedef void (cl_gpgpu_unmap_printf_buffer_cb)(cl_gpgpu, uint32_t);
+extern cl_gpgpu_unmap_printf_buffer_cb *cl_gpgpu_unmap_printf_buffer;
+
+/* release the printf buffer */
+typedef unsigned long (cl_gpgpu_release_printf_buffer_cb)(cl_gpgpu, uint32_t);
+extern cl_gpgpu_release_printf_buffer_cb *cl_gpgpu_release_printf_buffer;
+
+/* Set the last printfset pointer */
+typedef int (cl_gpgpu_set_printf_info_cb)(cl_gpgpu, void *, size_t*);
+extern cl_gpgpu_set_printf_info_cb *cl_gpgpu_set_printf_info;
+
+/* Get the last printfset pointer */
+typedef void* (cl_gpgpu_get_printf_info_cb)(cl_gpgpu, size_t*);
+extern cl_gpgpu_get_printf_info_cb *cl_gpgpu_get_printf_info;
+
+/* Will spawn all threads */
+typedef void (cl_gpgpu_walker_cb)(cl_gpgpu,
+                                  uint32_t simd_sz,
+                                  uint32_t thread_n,
+                                  const size_t global_wk_off[3],
+                                  const size_t global_wk_sz[3],
+                                  const size_t local_wk_sz[3]);
+extern cl_gpgpu_walker_cb *cl_gpgpu_walker;
+
+/**************************************************************************
+ * Buffer
+ **************************************************************************/
+/* Allocate a buffer */
+typedef cl_buffer (cl_buffer_alloc_cb)(cl_buffer_mgr, const char*, size_t, size_t);
+extern cl_buffer_alloc_cb *cl_buffer_alloc;
+
+/* Set a buffer's tiling mode */
+typedef cl_buffer (cl_buffer_set_tiling_cb)(cl_buffer, int tiling, size_t stride);
+extern cl_buffer_set_tiling_cb *cl_buffer_set_tiling;
+
+#include "cl_context.h"
+#include "cl_mem.h"
+typedef struct _cl_context *cl_context;
+
+typedef cl_buffer (cl_buffer_alloc_from_texture_cb)(cl_context, unsigned int, int, unsigned int,
+                                                    struct _cl_mem_image *gl_image);
+extern cl_buffer_alloc_from_texture_cb *cl_buffer_alloc_from_texture;
+
+typedef void (cl_buffer_release_from_texture_cb)(cl_context, unsigned int, int, unsigned int);
+extern cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture;
+
+typedef cl_buffer (cl_buffer_get_buffer_from_libva_cb)(cl_context ctx, unsigned int bo_name, size_t *sz);
+extern cl_buffer_get_buffer_from_libva_cb *cl_buffer_get_buffer_from_libva;
+
+typedef cl_buffer (cl_buffer_get_image_from_libva_cb)(cl_context ctx, unsigned int bo_name, struct _cl_mem_image *image, unsigned int offset);
+extern cl_buffer_get_image_from_libva_cb *cl_buffer_get_image_from_libva;
+
+/* Unref a buffer and destroy it once the last reference is gone */
+typedef int (cl_buffer_unreference_cb)(cl_buffer);
+extern cl_buffer_unreference_cb *cl_buffer_unreference;
+
+/* Add one more ref on a buffer */
+typedef void (cl_buffer_reference_cb)(cl_buffer);
+extern cl_buffer_reference_cb *cl_buffer_reference;
+
+/* Map a buffer */
+typedef int (cl_buffer_map_cb)(cl_buffer, uint32_t write_enable);
+extern cl_buffer_map_cb *cl_buffer_map;
+
+/* Unmap a buffer */
+typedef int (cl_buffer_unmap_cb)(cl_buffer);
+extern cl_buffer_unmap_cb *cl_buffer_unmap;
+
+/* Map a buffer in the GTT domain */
+typedef int (cl_buffer_map_gtt_cb)(cl_buffer);
+extern cl_buffer_map_gtt_cb *cl_buffer_map_gtt;
+
+/* Map a buffer in the GTT domain without waiting for pending GPU reads or writes */
+typedef int (cl_buffer_map_gtt_unsync_cb)(cl_buffer);
+extern cl_buffer_map_gtt_unsync_cb *cl_buffer_map_gtt_unsync;
+
+/* Unmap a buffer in the GTT domain */
+typedef int (cl_buffer_unmap_gtt_cb)(cl_buffer);
+extern cl_buffer_unmap_gtt_cb *cl_buffer_unmap_gtt;
+
+/* Get the virtual address (when mapped) */
+typedef void* (cl_buffer_get_virtual_cb)(cl_buffer);
+extern cl_buffer_get_virtual_cb *cl_buffer_get_virtual;
+
+/* Get the size of the buffer */
+typedef size_t (cl_buffer_get_size_cb)(cl_buffer);
+extern cl_buffer_get_size_cb *cl_buffer_get_size;
+
+/* Pin a buffer */
+typedef int (cl_buffer_pin_cb)(cl_buffer, uint32_t alignment);
+extern cl_buffer_pin_cb *cl_buffer_pin;
+
+/* Unpin a buffer */
+typedef int (cl_buffer_unpin_cb)(cl_buffer);
+extern cl_buffer_unpin_cb *cl_buffer_unpin;
+
+/* Fill data in the buffer */
+typedef int (cl_buffer_subdata_cb)(cl_buffer, unsigned long, unsigned long, const void*);
+extern cl_buffer_subdata_cb *cl_buffer_subdata;
+
+/* Wait for all pending rendering for this buffer to complete */
+typedef int (cl_buffer_wait_rendering_cb) (cl_buffer);
+extern cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering;
+
+typedef int (cl_buffer_get_fd_cb)(cl_buffer, int *fd);
+extern cl_buffer_get_fd_cb *cl_buffer_get_fd;
+
+/* Get the device id */
+typedef int (cl_driver_get_device_id_cb)(void);
+extern cl_driver_get_device_id_cb *cl_driver_get_device_id;
+
+/**************************************************************************
+ * cl_khr_gl_sharing.
+ **************************************************************************/
+typedef int (cl_gl_acquire_texture_cb)(void *driver, void *ctx, int target,
+                                       int level, int texture, void*user_data);
+extern cl_gl_acquire_texture_cb *cl_gl_acquire_texture;
+
+typedef int (cl_gl_release_texture_cb)(void *driver, void *ctx, int target,
+                                       int level, int texture);
+extern cl_gl_release_texture_cb *cl_gl_release_texture;
+
+typedef int (cl_gl_acquire_buffer_object_cb)(void *driver, void *ctx,
+                                             int bufobj, void* user_data);
+extern cl_gl_acquire_buffer_object_cb *cl_gl_acquire_buffer_object;
+
+typedef int (cl_gl_release_buffer_object_cb)(void *driver, void *ctx, int bufobj);
+extern cl_gl_release_buffer_object_cb *cl_gl_release_buffer_object;
+
+typedef int (cl_gl_acquire_render_buffer_cb)(void *driver, void *ctx,
+                                             int rb, void* user_data);
+extern cl_gl_acquire_render_buffer_cb *cl_gl_acquire_render_buffer;
+
+typedef int (cl_gl_release_render_buffer_cb)(void *driver, void *ctx, int rb);
+extern cl_gl_release_render_buffer_cb *cl_gl_release_render_buffer;
+
+#ifndef DEFAULT_DRIVER_DIR
+/* this is normally defined in Mesa/configs/default with DRI_DRIVER_SEARCH_PATH */
+#define DEFAULT_DRIVER_DIR "/usr/local/lib/dri"
+#endif
+
+#endif /* __CL_DRIVER_H__ */
+
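
Every entry point declared in cl_driver.h above is a bare function pointer that a backend fills in at load time (the Intel backend does this through intel_setup_callbacks(), wired up from cl_driver.cpp above, with the pointers themselves defined in cl_driver_defs.c below). As a rough, illustrative sketch of that dispatch scheme, not code from this tree, a hypothetical backend could register itself as follows; the my_* names are invented:

    /* Hypothetical backend registration sketch; the my_* names are invented. */
    #include "cl_driver.h"

    static cl_driver my_driver_new(cl_context_prop props)
    {
      (void) props;
      return NULL; /* a real backend would allocate its driver state here */
    }

    static cl_buffer my_buffer_alloc(cl_buffer_mgr mgr, const char *name,
                                     size_t size, size_t alignment)
    {
      (void) mgr; (void) name; (void) size; (void) alignment;
      return NULL; /* a real backend would create a buffer object here */
    }

    void my_setup_callbacks(void)
    {
      /* Point the global dispatch pointers at this backend's implementations. */
      cl_driver_new   = my_driver_new;
      cl_buffer_alloc = my_buffer_alloc;
      /* ... and likewise for the remaining cl_* callbacks ... */
    }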
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
new file mode 100644
index 0000000..72f25d9
--- /dev/null
+++ b/src/cl_driver_defs.c
@@ -0,0 +1,95 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "cl_driver.h"
+#include "cl_utils.h"
+#include <stdlib.h>
+
+/* Driver */
+LOCAL cl_driver_new_cb *cl_driver_new = NULL;
+LOCAL cl_driver_delete_cb *cl_driver_delete = NULL;
+LOCAL cl_driver_get_bufmgr_cb *cl_driver_get_bufmgr = NULL;
+LOCAL cl_driver_get_ver_cb *cl_driver_get_ver = NULL;
+LOCAL cl_driver_get_device_id_cb *cl_driver_get_device_id = NULL;
+
+/* Buffer */
+LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL;
+LOCAL cl_buffer_set_tiling_cb *cl_buffer_set_tiling = NULL;
+LOCAL cl_buffer_alloc_from_texture_cb *cl_buffer_alloc_from_texture = NULL;
+LOCAL cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture = NULL;
+LOCAL cl_buffer_reference_cb *cl_buffer_reference = NULL;
+LOCAL cl_buffer_unreference_cb *cl_buffer_unreference = NULL;
+LOCAL cl_buffer_map_cb *cl_buffer_map = NULL;
+LOCAL cl_buffer_unmap_cb *cl_buffer_unmap = NULL;
+LOCAL cl_buffer_map_gtt_cb *cl_buffer_map_gtt = NULL;
+LOCAL cl_buffer_map_gtt_unsync_cb *cl_buffer_map_gtt_unsync = NULL;
+LOCAL cl_buffer_unmap_gtt_cb *cl_buffer_unmap_gtt = NULL;
+LOCAL cl_buffer_get_virtual_cb *cl_buffer_get_virtual = NULL;
+LOCAL cl_buffer_get_size_cb *cl_buffer_get_size = NULL;
+LOCAL cl_buffer_pin_cb *cl_buffer_pin = NULL;
+LOCAL cl_buffer_unpin_cb *cl_buffer_unpin = NULL;
+LOCAL cl_buffer_subdata_cb *cl_buffer_subdata = NULL;
+LOCAL cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering = NULL;
+LOCAL cl_buffer_get_buffer_from_libva_cb *cl_buffer_get_buffer_from_libva = NULL;
+LOCAL cl_buffer_get_image_from_libva_cb *cl_buffer_get_image_from_libva = NULL;
+LOCAL cl_buffer_get_fd_cb *cl_buffer_get_fd = NULL;
+
+/* cl_khr_gl_sharing */
+LOCAL cl_gl_acquire_texture_cb *cl_gl_acquire_texture = NULL;
+LOCAL cl_gl_release_texture_cb *cl_gl_release_texture = NULL;
+LOCAL cl_gl_acquire_buffer_object_cb *cl_gl_acquire_buffer_object = NULL;
+LOCAL cl_gl_release_buffer_object_cb *cl_gl_release_buffer_object = NULL;
+LOCAL cl_gl_acquire_render_buffer_cb *cl_gl_acquire_render_buffer = NULL;
+LOCAL cl_gl_release_render_buffer_cb *cl_gl_release_render_buffer = NULL;
+/* GPGPU */
+LOCAL cl_gpgpu_new_cb *cl_gpgpu_new = NULL;
+LOCAL cl_gpgpu_delete_cb *cl_gpgpu_delete = NULL;
+LOCAL cl_gpgpu_sync_cb *cl_gpgpu_sync = NULL;
+LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
+LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
+LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
+LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
+LOCAL cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl = NULL;
+LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
+LOCAL cl_gpgpu_alloc_constant_buffer_cb *cl_gpgpu_alloc_constant_buffer = NULL;
+LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
+LOCAL cl_gpgpu_upload_curbes_cb *cl_gpgpu_upload_curbes = NULL;
+LOCAL cl_gpgpu_states_setup_cb *cl_gpgpu_states_setup = NULL;
+LOCAL cl_gpgpu_upload_samplers_cb *cl_gpgpu_upload_samplers = NULL;
+LOCAL cl_gpgpu_batch_reset_cb *cl_gpgpu_batch_reset = NULL;
+LOCAL cl_gpgpu_batch_start_cb *cl_gpgpu_batch_start = NULL;
+LOCAL cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end = NULL;
+LOCAL cl_gpgpu_flush_cb *cl_gpgpu_flush = NULL;
+LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL;
+LOCAL cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler = NULL;
+LOCAL cl_gpgpu_event_new_cb *cl_gpgpu_event_new = NULL;
+LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL;
+LOCAL cl_gpgpu_event_flush_cb *cl_gpgpu_event_flush = NULL;
+LOCAL cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete = NULL;
+LOCAL cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp = NULL;
+LOCAL cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp = NULL;
+LOCAL cl_gpgpu_ref_batch_buf_cb *cl_gpgpu_ref_batch_buf = NULL;
+LOCAL cl_gpgpu_unref_batch_buf_cb *cl_gpgpu_unref_batch_buf = NULL;
+LOCAL cl_gpgpu_set_printf_buffer_cb *cl_gpgpu_set_printf_buffer = NULL;
+LOCAL cl_gpgpu_reloc_printf_buffer_cb *cl_gpgpu_reloc_printf_buffer = NULL;
+LOCAL cl_gpgpu_map_printf_buffer_cb *cl_gpgpu_map_printf_buffer = NULL;
+LOCAL cl_gpgpu_unmap_printf_buffer_cb *cl_gpgpu_unmap_printf_buffer = NULL;
+LOCAL cl_gpgpu_set_printf_info_cb *cl_gpgpu_set_printf_info = NULL;
+LOCAL cl_gpgpu_get_printf_info_cb *cl_gpgpu_get_printf_info = NULL;
+LOCAL cl_gpgpu_release_printf_buffer_cb *cl_gpgpu_release_printf_buffer = NULL;
+
diff --git a/src/cl_driver_type.h b/src/cl_driver_type.h
new file mode 100644
index 0000000..891a33c
--- /dev/null
+++ b/src/cl_driver_type.h
@@ -0,0 +1,24 @@
+/**************************************************************************
+ * cl_driver:
+ * Hide buffer allocation / deallocation and the other low-level operations
+ * behind callbacks. This makes it easier to plug in a software performance
+ * simulator and keeps the HW-specific and simulator-specific code minimal.
+ **************************************************************************/
+
+/* Encapsulates command buffer / data buffer / kernels */
+typedef struct _cl_buffer *cl_buffer;
+
+/* Encapsulates buffer manager */
+typedef struct _cl_buffer_mgr *cl_buffer_mgr;
+
+/* Encapsulates the driver backend functionalities */
+typedef struct _cl_driver *cl_driver;
+
+/* Encapsulates the gpgpu stream of commands */
+typedef struct _cl_gpgpu *cl_gpgpu;
+
+/* Encapsulates an event of a command stream */
+typedef struct _cl_gpgpu_event *cl_gpgpu_event;
+
+typedef struct _cl_context_prop *cl_context_prop;
+typedef struct _cl_sampler *cl_sampler;
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
new file mode 100644
index 0000000..af118ad
--- /dev/null
+++ b/src/cl_enqueue.c
@@ -0,0 +1,472 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Rong Yang <rong.r.yang at intel.com>
+ */
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <pthread.h>
+
+#include "cl_enqueue.h"
+#include "cl_image.h"
+#include "cl_driver.h"
+#include "cl_event.h"
+#include "cl_command_queue.h"
+#include "cl_utils.h"
+
+
+cl_int cl_enqueue_read_buffer(enqueue_data* data)
+{
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = data->mem_obj;
+  assert(mem->type == CL_MEM_BUFFER_TYPE ||
+         mem->type == CL_MEM_SUBBUFFER_TYPE);
+  void* src_ptr;
+  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+
+  if (!(src_ptr = cl_mem_map_auto(data->mem_obj))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+  memcpy(data->ptr, (char*)src_ptr + data->offset + buffer->sub_offset, data->size);
+
+  err = cl_mem_unmap_auto(data->mem_obj);
+
+error:
+  return err;
+}
+
+cl_int cl_enqueue_read_buffer_rect(enqueue_data* data)
+{
+  cl_int err = CL_SUCCESS;
+  void* src_ptr;
+  void* dst_ptr;
+
+  const size_t* origin = data->origin;
+  const size_t* host_origin = data->host_origin;
+  const size_t* region = data->region;
+
+  cl_mem mem = data->mem_obj;
+  assert(mem->type == CL_MEM_BUFFER_TYPE ||
+         mem->type == CL_MEM_SUBBUFFER_TYPE);
+  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+
+  if (!(src_ptr = cl_mem_map_auto(mem))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
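+  /* Linearize the 3-D origins: byte offset = x + row_pitch*y + slice_pitch*z
+     (the host side uses the host pitches). */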
+  size_t offset = origin[0] + data->row_pitch*origin[1] + data->slice_pitch*origin[2];
+  src_ptr = (char*)src_ptr + offset + buffer->sub_offset;
+
+  offset = host_origin[0] + data->host_row_pitch*host_origin[1] + data->host_slice_pitch*host_origin[2];
+  dst_ptr = (char *)data->ptr + offset;
+
+  if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch &&
+      (region[2] == 1 || (data->slice_pitch == region[0]*region[1] && data->slice_pitch == data->host_slice_pitch)))
+  {
+    memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
+  }
+  else {
+    cl_uint y, z;
+    for (z = 0; z < region[2]; z++) {
+      const char* src = src_ptr;
+      char* dst = dst_ptr;
+      for (y = 0; y < region[1]; y++) {
+        memcpy(dst, src, region[0]);
+        src += data->row_pitch;
+        dst += data->host_row_pitch;
+      }
+      src_ptr = (char*)src_ptr + data->slice_pitch;
+      dst_ptr = (char*)dst_ptr + data->host_slice_pitch;
+    }
+  }
+
+  err = cl_mem_unmap_auto(mem);
+
+error:
+  return err;
+}
+
+cl_int cl_enqueue_write_buffer(enqueue_data *data)
+{
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = data->mem_obj;
+  assert(mem->type == CL_MEM_BUFFER_TYPE ||
+         mem->type == CL_MEM_SUBBUFFER_TYPE);
+  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+  void* dst_ptr;
+
+  if (!(dst_ptr = cl_mem_map_auto(data->mem_obj))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+  memcpy((char*)dst_ptr + data->offset + buffer->sub_offset, data->const_ptr, data->size);
+
+  err = cl_mem_unmap_auto(data->mem_obj);
+
+error:
+  return err;
+}
+
+cl_int cl_enqueue_write_buffer_rect(enqueue_data *data)
+{
+  cl_int err = CL_SUCCESS;
+  void* src_ptr;
+  void* dst_ptr;
+
+  const size_t* origin = data->origin;
+  const size_t* host_origin = data->host_origin;
+  const size_t* region = data->region;
+
+  cl_mem mem = data->mem_obj;
+  assert(mem->type == CL_MEM_BUFFER_TYPE ||
+         mem->type == CL_MEM_SUBBUFFER_TYPE);
+  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+
+  if (!(dst_ptr = cl_mem_map_auto(mem))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+  size_t offset = origin[0] + data->row_pitch*origin[1] + data->slice_pitch*origin[2];
+  dst_ptr = (char *)dst_ptr + offset + buffer->sub_offset;
+
+  offset = host_origin[0] + data->host_row_pitch*host_origin[1] + data->host_slice_pitch*host_origin[2];
+  src_ptr = (char*)data->const_ptr + offset;
+
+  if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch &&
+      (region[2] == 1 || (data->slice_pitch == region[0]*region[1] && data->slice_pitch == data->host_slice_pitch)))
+  {
+    memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
+  }
+  else {
+    cl_uint y, z;
+    for (z = 0; z < region[2]; z++) {
+      const char* src = src_ptr;
+      char* dst = dst_ptr;
+      for (y = 0; y < region[1]; y++) {
+        memcpy(dst, src, region[0]);
+        src += data->host_row_pitch;
+        dst += data->row_pitch;
+      }
+      src_ptr = (char*)src_ptr + data->host_slice_pitch;
+      dst_ptr = (char*)dst_ptr + data->slice_pitch;
+    }
+  }
+
+  err = cl_mem_unmap_auto(mem);
+
+error:
+  return err;
+}
+
+
+cl_int cl_enqueue_read_image(enqueue_data *data)
+{
+  cl_int err = CL_SUCCESS;
+  void* src_ptr;
+
+  cl_mem mem = data->mem_obj;
+  CHECK_IMAGE(mem, image);
+  const size_t* origin = data->origin;
+  const size_t* region = data->region;
+
+  if (!(src_ptr = cl_mem_map_auto(mem))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+
+  size_t offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
+  src_ptr = (char*)src_ptr + offset;
+
+  if (!origin[0] && region[0] == image->w && data->row_pitch == image->row_pitch &&
+      (region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch)))
+  {
+    memcpy(data->ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
+  }
+  else {
+    cl_uint y, z;
+    for (z = 0; z < region[2]; z++) {
+      const char* src = src_ptr;
+      char* dst = data->ptr;
+      for (y = 0; y < region[1]; y++) {
+        memcpy(dst, src, image->bpp*region[0]);
+        src += image->row_pitch;
+        dst += data->row_pitch;
+      }
+      src_ptr = (char*)src_ptr + image->slice_pitch;
+      data->ptr = (char*)data->ptr + data->slice_pitch;
+    }
+  }
+
+  err = cl_mem_unmap_auto(mem);
+
+error:
+  return err;
+
+}
+
+cl_int cl_enqueue_write_image(enqueue_data *data)
+{
+  cl_int err = CL_SUCCESS;
+  void* dst_ptr;
+
+  cl_mem mem = data->mem_obj;
+  CHECK_IMAGE(mem, image);
+
+  if (!(dst_ptr = cl_mem_map_auto(mem))) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+  //dst still needs the offset added
+  cl_mem_copy_image_region(data->origin, data->region, dst_ptr,
+                           image->row_pitch, image->slice_pitch,
+                           data->const_ptr, data->row_pitch,
+                           data->slice_pitch, image, CL_TRUE, CL_FALSE);
+  err = cl_mem_unmap_auto(mem);
+
+error:
+  return err;
+
+}
+
+cl_int cl_enqueue_map_buffer(enqueue_data *data)
+{
+  void *ptr = NULL;
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = data->mem_obj;
+  assert(mem->type == CL_MEM_BUFFER_TYPE ||
+         mem->type == CL_MEM_SUBBUFFER_TYPE);
+  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+
+  if(data->unsync_map == 1)
+    //an unsynchronized map was requested (clEnqueueMapBuffer/Image), so force map_gtt here
+    ptr = cl_mem_map_gtt(mem);
+  else
+    ptr = cl_mem_map_auto(mem);
+
+  if (ptr == NULL) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+  data->ptr = ptr;
+
+  if(mem->flags & CL_MEM_USE_HOST_PTR) {
+    assert(mem->host_ptr);
+    ptr = (char*)ptr + data->offset + buffer->sub_offset;
+    memcpy(mem->host_ptr + data->offset + buffer->sub_offset, ptr, data->size);
+  }
+
+error:
+  return err;
+}
+
+cl_int cl_enqueue_map_image(enqueue_data *data)
+{
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = data->mem_obj;
+  void *ptr = NULL;
+  size_t row_pitch = 0;
+  CHECK_IMAGE(mem, image);
+
+  if(data->unsync_map == 1)
+    //an unsynchronized map was requested (clEnqueueMapBuffer/Image), so force map_gtt here
+    ptr = cl_mem_map_gtt(mem);
+  else
+    ptr = cl_mem_map_auto(mem);
+
+  if (ptr == NULL) {
+    err = CL_MAP_FAILURE;
+    goto error;
+  }
+  data->ptr = ptr;
+  if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+    row_pitch = image->slice_pitch;
+  else
+    row_pitch = image->row_pitch;
+
+  if(mem->flags & CL_MEM_USE_HOST_PTR) {
+    assert(mem->host_ptr);
+    //the src and dst offsets are added inside cl_mem_copy_image_region
+    cl_mem_copy_image_region(data->origin, data->region,
+                             mem->host_ptr, image->host_row_pitch, image->host_slice_pitch,
+                             data->ptr, row_pitch, image->slice_pitch, image, CL_TRUE, CL_TRUE);
+  }
+
+error:
+  return err;
+}
+
+cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
+{
+  cl_int err = CL_SUCCESS;
+  int i, j;
+  size_t mapped_size = 0;
+  size_t origin[3], region[3];
+  void * v_ptr = NULL;
+  void * mapped_ptr = data->ptr;
+  cl_mem memobj = data->mem_obj;
+  size_t row_pitch = 0;
+
+  assert(memobj->mapped_ptr_sz >= memobj->map_ref);
+  INVALID_VALUE_IF(!mapped_ptr);
+  for (i = 0; i < memobj->mapped_ptr_sz; i++) {
+    if (memobj->mapped_ptr[i].ptr == mapped_ptr) {
+      memobj->mapped_ptr[i].ptr = NULL;
+      mapped_size = memobj->mapped_ptr[i].size;
+      v_ptr = memobj->mapped_ptr[i].v_ptr;
+      for(j=0; j<3; j++) {
+        region[j] = memobj->mapped_ptr[i].region[j];
+        origin[j] = memobj->mapped_ptr[i].origin[j];
+        memobj->mapped_ptr[i].region[j] = 0;
+        memobj->mapped_ptr[i].origin[j] = 0;
+      }
+      memobj->mapped_ptr[i].size = 0;
+      memobj->mapped_ptr[i].v_ptr = NULL;
+      memobj->map_ref--;
+      break;
+    }
+  }
+  /* If no matching mapped address was found, the pointer is invalid. */
+  INVALID_VALUE_IF(i == memobj->mapped_ptr_sz);
+
+  if (memobj->flags & CL_MEM_USE_HOST_PTR) {
+    if(memobj->type == CL_MEM_BUFFER_TYPE ||
+       memobj->type == CL_MEM_SUBBUFFER_TYPE) {
+      assert(mapped_ptr >= memobj->host_ptr &&
+        mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size);
+      /* Sync the data. */
+      memcpy(v_ptr, mapped_ptr, mapped_size);
+    } else {
+      CHECK_IMAGE(memobj, image);
+
+      if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+        row_pitch = image->slice_pitch;
+      else
+        row_pitch = image->row_pitch;
+      //v_ptr already includes the offset; host_ptr does not.
+      cl_mem_copy_image_region(origin, region, v_ptr, row_pitch, image->slice_pitch,
+                               memobj->host_ptr, image->host_row_pitch, image->host_slice_pitch,
+                               image, CL_FALSE, CL_TRUE);
+    }
+  } else {
+    assert(v_ptr == mapped_ptr);
+  }
+
+  cl_mem_unmap_auto(memobj);
+
+  /* Shrink the mapped-pointer array once it is less than half used. */
+  if (memobj->mapped_ptr_sz/2 > memobj->map_ref) {
+    int j = 0;
+    cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
+                             sizeof(cl_mapped_ptr) * (memobj->mapped_ptr_sz/2));
+    if (!new_ptr) {
+      /* Just do nothing. */
+      goto error;
+    }
+    memset(new_ptr, 0, (memobj->mapped_ptr_sz/2) * sizeof(cl_mapped_ptr));
+
+    for (i = 0; i < memobj->mapped_ptr_sz; i++) {
+      if (memobj->mapped_ptr[i].ptr) {
+        new_ptr[j] = memobj->mapped_ptr[i];
+        j++;
+        assert(j < memobj->mapped_ptr_sz/2);
+      }
+    }
+    memobj->mapped_ptr_sz = memobj->mapped_ptr_sz/2;
+    free(memobj->mapped_ptr);
+    memobj->mapped_ptr = new_ptr;
+  }
+
+error:
+  return err;
+}
+
+cl_int cl_enqueue_native_kernel(enqueue_data *data)
+{
+  cl_int err = CL_SUCCESS;
+  cl_uint num_mem_objects = (cl_uint)data->offset;
+  const cl_mem *mem_list = data->mem_list;
+  const void **args_mem_loc = (const void **)data->const_ptr;
+  cl_uint i;
+
+  for (i=0; i<num_mem_objects; ++i)
+  {
+      const cl_mem buffer = mem_list[i];
+      CHECK_MEM(buffer);
+
+      *((void **)args_mem_loc[i]) = cl_mem_map_auto(buffer);
+  }
+  data->user_func(data->ptr);
+
+  for (i=0; i<num_mem_objects; ++i)
+  {
+      cl_mem_unmap_auto(mem_list[i]);
+  }
+
+  free(data->ptr);
+error:
+  return err;
+}
+
+cl_int cl_enqueue_handle(cl_event event, enqueue_data* data)
+{
+  /* If profiling is enabled, record the submit timestamp here. */
+  if (event && event->type != CL_COMMAND_USER
+           && event->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+    cl_event_get_timestamp(event, CL_PROFILING_COMMAND_SUBMIT);
+  }
+
+  switch(data->type) {
+    case EnqueueReadBuffer:
+      return cl_enqueue_read_buffer(data);
+    case EnqueueReadBufferRect:
+      return cl_enqueue_read_buffer_rect(data);
+    case EnqueueWriteBuffer:
+      return cl_enqueue_write_buffer(data);
+    case EnqueueWriteBufferRect:
+      return cl_enqueue_write_buffer_rect(data);
+    case EnqueueReadImage:
+      return cl_enqueue_read_image(data);
+    case EnqueueWriteImage:
+      return cl_enqueue_write_image(data);
+    case EnqueueMapBuffer:
+      return cl_enqueue_map_buffer(data);
+    case EnqueueMapImage:
+      return cl_enqueue_map_image(data);
+    case EnqueueUnmapMemObject:
+      return cl_enqueue_unmap_mem_object(data);
+    case EnqueueCopyBufferRect:
+    case EnqueueCopyBuffer:
+    case EnqueueCopyImage:
+    case EnqueueCopyBufferToImage:
+    case EnqueueCopyImageToBuffer:
+    case EnqueueNDRangeKernel:
+    case EnqueueFillBuffer:
+    case EnqueueFillImage:
+      cl_event_flush(event);
+      return CL_SUCCESS;
+    case EnqueueNativeKernel:
+      return cl_enqueue_native_kernel(data);
+    case EnqueueMigrateMemObj:
+    default:
+      return CL_SUCCESS;
+  }
+}
diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h
new file mode 100644
index 0000000..a9b3601
--- /dev/null
+++ b/src/cl_enqueue.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Rong Yang <rong.r.yang at intel.com>
+ */
+#ifndef __CL_ENQUEUE_H__
+#define __CL_ENQUEUE_H__
+
+#include "cl_internals.h"
+#include "cl_driver.h"
+#include "CL/cl.h"
+
+typedef enum {
+  EnqueueReadBuffer = 0,
+  EnqueueReadBufferRect,
+  EnqueueWriteBuffer,
+  EnqueueWriteBufferRect,
+  EnqueueCopyBuffer,
+  EnqueueCopyBufferRect,
+  EnqueueReadImage,
+  EnqueueWriteImage,
+  EnqueueCopyImage,
+  EnqueueCopyImageToBuffer,
+  EnqueueCopyBufferToImage,
+  EnqueueMapBuffer,
+  EnqueueMapImage,
+  EnqueueUnmapMemObject,
+  EnqueueNDRangeKernel,
+  EnqueueNativeKernel,
+  EnqueueMarker,
+  EnqueueBarrier,
+  EnqueueFillBuffer,
+  EnqueueFillImage,
+  EnqueueMigrateMemObj,
+  EnqueueInvalid
+} enqueue_type;
+
+typedef struct _enqueue_data {
+  enqueue_type      type;             /* Command type */
+  cl_mem            mem_obj;          /* Enqueue's cl_mem */
+  cl_command_queue  queue;            /* Command queue */
+  size_t            offset;           /* Mem object's offset */
+  size_t            size;             /* Size */
+  size_t            origin[3];        /* Origin */
+  size_t            host_origin[3];   /* Host origin */
+  size_t            region[3];        /* Region */
+  size_t            row_pitch;        /* Row pitch */
+  size_t            slice_pitch;      /* Slice pitch */
+  size_t            host_row_pitch;   /* Host row pitch, used in read/write buffer rect */
+  size_t            host_slice_pitch; /* Host slice pitch, used in read/write buffer rect */
+  const void *      const_ptr;        /* Const ptr for memory read */
+  void *            ptr;              /* Ptr for write and return value */
+  const cl_mem*     mem_list;         /* mem_list of clEnqueueNativeKernel */
+  uint8_t           unsync_map;       /* Indicates clEnqueueMapBuffer/Image used an unsynchronized map */
+  void (*user_func)(void *);          /* pointer to a host-callable user function */
+} enqueue_data;
+
+/* Do real enqueue commands */
+cl_int cl_enqueue_handle(cl_event event, enqueue_data* data);
+#endif /* __CL_ENQUEUE_H__ */
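
Each command is described by one of these enqueue_data records and then executed through cl_enqueue_handle(), either immediately or later when a deferred enqueue fires. A minimal sketch of that flow, illustrative only and assuming buf is a valid cl_mem created elsewhere (passing a NULL event simply skips the profiling path):

    /* Sketch: describe a buffer read as an enqueue_data record and run it now. */
    #include <string.h>
    #include "cl_enqueue.h"

    static cl_int read_buffer_now(cl_mem buf, size_t offset, size_t size, void *host_dst)
    {
      enqueue_data data;

      memset(&data, 0, sizeof(data));
      data.type    = EnqueueReadBuffer;  /* which operation to perform */
      data.mem_obj = buf;                /* source buffer */
      data.offset  = offset;             /* byte offset inside the buffer */
      data.size    = size;               /* number of bytes to copy */
      data.ptr     = host_dst;           /* destination in host memory */

      return cl_enqueue_handle(NULL, &data);
    }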
diff --git a/src/cl_event.c b/src/cl_event.c
new file mode 100644
index 0000000..99e60eb
--- /dev/null
+++ b/src/cl_event.c
@@ -0,0 +1,650 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Rong Yang <rong.r.yang at intel.com>
+ */
+
+#include "cl_event.h"
+#include "cl_context.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_khr_icd.h"
+#include "cl_kernel.h"
+#include "cl_command_queue.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+inline cl_bool
+cl_event_is_gpu_command_type(cl_command_type type)
+{
+  switch(type) {
+    case CL_COMMAND_COPY_BUFFER:
+    case CL_COMMAND_FILL_BUFFER:
+    case CL_COMMAND_COPY_IMAGE:
+    case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
+    case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
+    case CL_COMMAND_COPY_BUFFER_RECT:
+    case CL_COMMAND_TASK:
+    case CL_COMMAND_NDRANGE_KERNEL:
+      return CL_TRUE;
+    default:
+      return CL_FALSE;
+  }
+}
+
+void cl_event_flush(cl_event event)
+{
+  assert(event->gpgpu_event != NULL);
+  if (event->gpgpu) {
+    cl_command_queue_flush_gpgpu(event->queue, event->gpgpu);
+    cl_gpgpu_delete(event->gpgpu);
+    event->gpgpu = NULL;
+  }
+  cl_gpgpu_event_flush(event->gpgpu_event);
+  event->queue->last_event = event;
+}
+
+cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type, cl_bool emplict)
+{
+  cl_event event = NULL;
+  GET_QUEUE_THREAD_GPGPU(queue);
+
+  /* Allocate and initialize the structure itself */
+  TRY_ALLOC_NO_ERR (event, CALLOC(struct _cl_event));
+  SET_ICD(event->dispatch)
+  event->magic = CL_MAGIC_EVENT_HEADER;
+  event->ref_n = 1;
+
+  /* Append the event to the context's event list */
+  pthread_mutex_lock(&ctx->event_lock);
+    event->next = ctx->events;
+    if (ctx->events != NULL)
+      ctx->events->prev = event;
+    ctx->events = event;
+  pthread_mutex_unlock(&ctx->event_lock);
+  event->ctx   = ctx;
+  cl_context_add_ref(ctx);
+
+  /* Initialize all members and create GPGPU event object */
+  event->queue = queue;
+  event->type  = type;
+  event->gpgpu_event = NULL;
+  if(type == CL_COMMAND_USER) {
+    event->status = CL_SUBMITTED;
+  }
+  else {
+    event->status = CL_QUEUED;
+    if(cl_event_is_gpu_command_type(event->type))
+      event->gpgpu_event = cl_gpgpu_event_new(gpgpu);
+  }
+  cl_event_add_ref(event);       //extra reference, dropped when the event completes
+  event->user_cb = NULL;
+  event->enqueue_cb = NULL;
+  event->waits_head = NULL;
+  event->emplict = emplict;
+
+exit:
+  return event;
+error:
+  cl_event_delete(event);
+  event = NULL;
+  goto exit;
+}
+
+void cl_event_delete(cl_event event)
+{
+  if (UNLIKELY(event == NULL))
+    return;
+
+  cl_event_update_status(event, 0);
+
+  if (atomic_dec(&event->ref_n) > 1)
+    return;
+
+  if(event->queue && event->queue->last_event == event)
+    event->queue->last_event = NULL;
+
+  /* Call any user callbacks that have not been executed yet */
+  user_callback *cb = event->user_cb;
+  while(event->user_cb) {
+    cb = event->user_cb;
+    if(cb->executed == CL_FALSE) {
+      cb->executed = CL_TRUE;
+      cb->pfn_notify(event, event->status, cb->user_data);
+    }
+    event->user_cb = cb->next;
+    cl_free(cb);
+  }
+
+  /* delete gpgpu event object */
+  if(event->gpgpu_event)
+    cl_gpgpu_event_delete(event->gpgpu_event);
+
+  /* Remove it from the list */
+  assert(event->ctx);
+  pthread_mutex_lock(&event->ctx->event_lock);
+
+  if (event->prev)
+    event->prev->next = event->next;
+  if (event->next)
+    event->next->prev = event->prev;
+  /* if this is the head, update head pointer ctx->events */
+  if (event->ctx->events == event)
+    event->ctx->events = event->next;
+
+  pthread_mutex_unlock(&event->ctx->event_lock);
+  cl_context_delete(event->ctx);
+
+  if (event->gpgpu) {
+    fprintf(stderr, "Warning: a event is deleted with a pending enqueued task.\n");
+    cl_gpgpu_delete(event->gpgpu);
+    event->gpgpu = NULL;
+  }
+  cl_free(event);
+}
+
+void cl_event_add_ref(cl_event event)
+{
+  assert(event);
+  atomic_inc(&event->ref_n);
+}
+
+cl_int cl_event_set_callback(cl_event event ,
+                                  cl_int command_exec_callback_type,
+                                  EVENT_NOTIFY pfn_notify,
+                                  void* user_data)
+{
+  assert(event);
+  assert(pfn_notify);
+
+  cl_int err = CL_SUCCESS;
+  user_callback *cb;
+  TRY_ALLOC(cb, CALLOC(user_callback));
+
+  cb->pfn_notify  = pfn_notify;
+  cb->user_data   = user_data;
+  cb->status      = command_exec_callback_type;
+  cb->executed    = CL_FALSE;
+
+  cb->next        = event->user_cb;
+  event->user_cb  = cb;
+
+exit:
+  return err;
+error:
+  err = CL_OUT_OF_HOST_MEMORY;
+  cl_free(cb);
+  goto exit;
+}
+
+cl_int cl_event_check_waitlist(cl_uint num_events_in_wait_list,
+                                    const cl_event *event_wait_list,
+                                    cl_event *event,cl_context ctx)
+{
+  cl_int err = CL_SUCCESS;
+  cl_int i;
+  /* check the event_wait_list and num_events_in_wait_list */
+  if((event_wait_list == NULL) &&
+     (num_events_in_wait_list > 0))
+    goto error;
+
+  if ((event_wait_list != NULL) &&
+      (num_events_in_wait_list == 0)){
+    goto error;
+  }
+
+  /* check the event and context */
+  for(i=0; i<num_events_in_wait_list; i++) {
+    CHECK_EVENT(event_wait_list[i]);
+    if(event_wait_list[i]->status < CL_COMPLETE) {
+      err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+      goto exit;
+    }
+    if(event && *event == event_wait_list[i])
+      goto error;
+    if(event_wait_list[i]->ctx != ctx)
+      goto error;
+  }
+
+exit:
+  return err;
+error:
+  err = CL_INVALID_EVENT_WAIT_LIST;  //reset error
+  goto exit;
+}
+
+cl_int cl_event_wait_events(cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+                            cl_command_queue queue)
+{
+  cl_int i;
+
+  /* Check whether we need to wait on any user events */
+  for(i=0; i<num_events_in_wait_list; i++) {
+    if(event_wait_list[i]->status <= CL_COMPLETE)
+      continue;
+
+    /* Need to wait on a user event; return and defer the enqueue */
+    if((event_wait_list[i]->type == CL_COMMAND_USER) ||
+       (event_wait_list[i]->enqueue_cb &&
+       (event_wait_list[i]->enqueue_cb->wait_user_events != NULL))){
+      return CL_ENQUEUE_EXECUTE_DEFER;
+    }
+  }
+
+  if(queue && queue->barrier_events_num )
+      return CL_ENQUEUE_EXECUTE_DEFER;
+
+  /* No user events (or all user events finished); wait for all enqueued events to finish */
+  for(i=0; i<num_events_in_wait_list; i++) {
+    if(event_wait_list[i]->status <= CL_COMPLETE)
+      continue;
+
+    //the enqueue callback has not finished yet (it runs in another thread), so defer
+    if(event_wait_list[i]->enqueue_cb != NULL)
+      return CL_ENQUEUE_EXECUTE_DEFER;
+    if(event_wait_list[i]->gpgpu_event)
+      cl_gpgpu_event_update_status(event_wait_list[i]->gpgpu_event, 1);
+    cl_event_set_status(event_wait_list[i], CL_COMPLETE);  //Execute user's callback
+  }
+  return CL_ENQUEUE_EXECUTE_IMM;
+}
+
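+/* Record a deferred enqueue: when a command depends on user events (or on
+ * queue barrier events), the operation is captured in an enqueue_callback
+ * together with its enqueue_data and wait list, and is replayed later from
+ * cl_event_set_status() once every user event it waits on has completed. */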
+void cl_event_new_enqueue_callback(cl_event event,
+                                            enqueue_data *data,
+                                            cl_uint num_events_in_wait_list,
+                                            const cl_event *event_wait_list)
+{
+  enqueue_callback *cb, *node;
+  user_event *user_events, *u_ev;
+  cl_command_queue queue = event->queue;
+  cl_int i;
+  cl_int err = CL_SUCCESS;
+
+  /* Allocate and initialize the structure itself */
+  TRY_ALLOC_NO_ERR (cb, CALLOC(enqueue_callback));
+  cb->num_events = 0;
+  TRY_ALLOC_NO_ERR (cb->wait_list, CALLOC_ARRAY(cl_event, num_events_in_wait_list));
+  for(i=0; i<num_events_in_wait_list; i++) {
+    //user events are inserted into cb->wait_user_events instead of the wait list, to avoid referencing them twice
+    if(event_wait_list[i]->type != CL_COMMAND_USER) {
+      cb->wait_list[cb->num_events++] = event_wait_list[i];
+      cl_event_add_ref(event_wait_list[i]);  //take a reference for the deferred enqueue's wait list
+    }
+  }
+  cb->event = event;
+  cb->next = NULL;
+  cb->wait_user_events = NULL;
+
+  if(queue && queue->barrier_events_num > 0) {
+    for(i=0; i<queue->barrier_events_num; i++) {
+      /* Insert the enqueue_callback to user event list */
+      node = queue->wait_events[i]->waits_head;
+      if(node == NULL)
+        queue->wait_events[i]->waits_head = cb;
+      else{
+        while((node != cb) && node->next)
+          node = node->next;
+        if(node == cb)   //wait on dup user event
+          continue;
+        node->next = cb;
+      }
+
+      /* Insert the user event to enqueue_callback's wait_user_events */
+      TRY(cl_event_insert_user_event, &cb->wait_user_events, queue->wait_events[i]);
+      cl_event_add_ref(queue->wait_events[i]);
+    }
+  }
+
+  /* Find all user events that this command's event_wait_list waits on */
+  for(i=0; i<num_events_in_wait_list; i++) {
+    if(event_wait_list[i]->status <= CL_COMPLETE)
+      continue;
+
+    if(event_wait_list[i]->type == CL_COMMAND_USER) {
+      /* Insert the enqueue_callback to user event list */
+      node = event_wait_list[i]->waits_head;
+      if(node == NULL)
+        event_wait_list[i]->waits_head = cb;
+      else {
+        while((node != cb) && node->next)
+          node = node->next;
+        if(node == cb)   //wait on dup user event
+          continue;
+        node->next = cb;
+      }
+      /* Insert the user event to enqueue_callback's wait_user_events */
+      TRY(cl_event_insert_user_event, &cb->wait_user_events, event_wait_list[i]);
+      cl_event_add_ref(event_wait_list[i]);
+      cl_command_queue_insert_event(event->queue, event_wait_list[i]);
+      if(data->type == EnqueueBarrier){
+        cl_command_queue_insert_barrier_event(event->queue, event_wait_list[i]);
+      }
+    } else if(event_wait_list[i]->enqueue_cb != NULL) {
+      user_events = event_wait_list[i]->enqueue_cb->wait_user_events;
+      while(user_events != NULL) {
+        /* Append the enqueue_callback to the tail of the user event's waits list */
+        node = user_events->event->waits_head;
+        if(node == NULL)
+          user_events->event->waits_head = cb;
+        else{
+          while((node != cb) && node->next)
+            node = node->next;
+          if(node == cb) {  //wait on dup user event
+            user_events = user_events->next;
+            continue;
+          }
+          node->next = cb;
+        }
+
+        /* Insert the user event to enqueue_callback's wait_user_events */
+        TRY(cl_event_insert_user_event, &cb->wait_user_events, user_events->event);
+        cl_event_add_ref(user_events->event);
+        cl_command_queue_insert_event(event->queue, user_events->event);
+        if(data->type == EnqueueBarrier){
+          cl_command_queue_insert_barrier_event(event->queue, user_events->event);
+        }
+        user_events = user_events->next;
+      }
+    }
+  }
+  if(data->queue != NULL && event->gpgpu_event != NULL) {
+    event->gpgpu = cl_thread_gpgpu_take(event->queue);
+    data->ptr = (void *)event->gpgpu_event;
+  }
+  cb->data = *data;
+  event->enqueue_cb = cb;
+
+exit:
+  return;
+error:
+  if(cb) {
+    while(cb->wait_user_events) {
+      u_ev = cb->wait_user_events;
+      cb->wait_user_events = cb->wait_user_events->next;
+      cl_event_delete(u_ev->event);
+      cl_free(u_ev);
+    }
+    for(i=0; i<cb->num_events; i++) {
+      if(cb->wait_list[i]) {
+        cl_event_delete(cb->wait_list[i]);
+      }
+    }
+    cl_free(cb);
+  }
+  goto exit;
+}
+
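+/* Propagate a new execution status to the event while holding the context's
+   event_lock.  A status can only decrease (CL_QUEUED -> ... -> CL_COMPLETE or
+   a negative error code).  When the status reaches CL_COMPLETE the deferred
+   enqueue stored in event->enqueue_cb is replayed, registered user callbacks
+   whose threshold matches are invoked, and, for user events, every
+   enqueue_callback parked on waits_head is re-examined and executed once it
+   no longer waits on any user event. */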
+void cl_event_set_status(cl_event event, cl_int status)
+{
+  user_callback *user_cb;
+  cl_int ret, i;
+  cl_event evt;
+
+  pthread_mutex_lock(&event->ctx->event_lock);
+  if(status >= event->status) {
+    pthread_mutex_unlock(&event->ctx->event_lock);
+    return;
+  }
+  if(event->status <= CL_COMPLETE) {
+    event->status = status;    //the enqueue was already handled, either earlier or by another thread
+    pthread_mutex_unlock(&event->ctx->event_lock);
+    return;
+  }
+
+  if(status <= CL_COMPLETE) {
+    if(event->enqueue_cb) {
+      if(status == CL_COMPLETE) {
+        cl_enqueue_handle(event, &event->enqueue_cb->data);
+        if(event->gpgpu_event)
+          cl_gpgpu_event_update_status(event->gpgpu_event, 1);  //now mark it complete; this still needs refinement
+      } else {
+        if(event->gpgpu_event) {
+          // Error then cancel the enqueued event.
+          cl_gpgpu_delete(event->gpgpu);
+          event->gpgpu = NULL;
+        }
+      }
+
+      event->status = status;  //Change the event status after the enqueue and before unlocking
+
+      pthread_mutex_unlock(&event->ctx->event_lock);
+      for(i=0; i<event->enqueue_cb->num_events; i++)
+        cl_event_delete(event->enqueue_cb->wait_list[i]);
+      pthread_mutex_lock(&event->ctx->event_lock);
+
+      if(event->enqueue_cb->wait_list)
+        cl_free(event->enqueue_cb->wait_list);
+      cl_free(event->enqueue_cb);
+      event->enqueue_cb = NULL;
+    }
+  }
+  if(event->status >= status)  //maybe changed in other threads
+    event->status = status;
+  pthread_mutex_unlock(&event->ctx->event_lock);
+
+  if(event->status <= CL_COMPLETE)
+    cl_event_delete(event);
+
+  /* Call user callback */
+  user_cb = event->user_cb;
+  while(user_cb) {
+    if(user_cb->status >= status) {
+      user_cb->executed = CL_TRUE;
+      user_cb->pfn_notify(event, event->status, user_cb->user_data);
+    }
+    user_cb = user_cb->next;
+  }
+
+  if(event->type != CL_COMMAND_USER)
+    return;
+
+  /* Check all defer enqueue */
+  enqueue_callback *cb, *enqueue_cb = event->waits_head;
+  while(enqueue_cb) {
+    /* Remove this user event in enqueue_cb, update the header if needed. */
+    cl_event_remove_user_event(&enqueue_cb->wait_user_events, event);
+    cl_event_delete(event);
+
+    /* Still wait on other user events */
+    if(enqueue_cb->wait_user_events != NULL) {
+      enqueue_cb = enqueue_cb->next;
+      continue;
+    }
+
+    //remove the user event from enqueue_cb's command queue
+    cl_command_queue_remove_event(enqueue_cb->event->queue, event);
+    cl_command_queue_remove_barrier_event(enqueue_cb->event->queue, event);
+
+    /* All user events are complete, now wait on the remaining enqueued events */
+    ret = cl_event_wait_events(enqueue_cb->num_events, enqueue_cb->wait_list,
+        enqueue_cb->event->queue);
+    ret = ret;   //referenced only by the assert below, which may be compiled out
+    assert(ret != CL_ENQUEUE_EXECUTE_DEFER);
+
+    cb = enqueue_cb;
+    enqueue_cb = enqueue_cb->next;
+
+    /* Call the pending operation */
+    evt = cb->event;
+    /* TODO: if this event waits on several events and one of them
+       reports an error while the others complete, what is the status
+       of this event? The OpenCL spec does not describe this case, so
+       simply propagate the status of the last wait event to finish. */
+    cl_event_set_status(cb->event, status);
+    if(evt->emplict == CL_FALSE) {
+      cl_event_delete(evt);
+    }
+  }
+  event->waits_head = NULL;
+}
+
+void cl_event_update_status(cl_event event, int wait)
+{
+  if(event->status <= CL_COMPLETE)
+    return;
+  if((event->gpgpu_event) &&
+     (cl_gpgpu_event_update_status(event->gpgpu_event, wait) == command_complete))
+    cl_event_set_status(event, CL_COMPLETE);
+}
+
+cl_int cl_event_marker_with_wait_list(cl_command_queue queue,
+                cl_uint num_events_in_wait_list,
+                const cl_event *event_wait_list,
+                cl_event* event)
+{
+  enqueue_data data = { 0 };
+  cl_event e;
+
+  e = cl_event_new(queue->ctx, queue, CL_COMMAND_MARKER, CL_TRUE);
+  if(e == NULL)
+    return CL_OUT_OF_HOST_MEMORY;
+
+  if(event != NULL ){
+    *event = e;
+  }
+
+  //Enqueues a marker command which waits either for the given list of events to
+  //complete or, if the list is empty, for all commands previously enqueued in
+  //command_queue to complete before it completes.
+  if(num_events_in_wait_list > 0){
+    if(cl_event_wait_events(num_events_in_wait_list, event_wait_list, queue) == CL_ENQUEUE_EXECUTE_DEFER) {
+      data.type = EnqueueMarker;
+      cl_event_new_enqueue_callback(*event, &data, num_events_in_wait_list, event_wait_list);
+      return CL_SUCCESS;
+    }
+  } else if(queue->wait_events_num > 0) {
+    data.type = EnqueueMarker;
+    cl_event_new_enqueue_callback(*event, &data, queue->wait_events_num, queue->wait_events);
+    return CL_SUCCESS;
+  }
+
+  if(queue->last_event && queue->last_event->gpgpu_event) {
+    cl_gpgpu_event_update_status(queue->last_event->gpgpu_event, 1);
+  }
+
+  cl_event_set_status(e, CL_COMPLETE);
+  return CL_SUCCESS;
+}
+
+cl_int cl_event_barrier_with_wait_list(cl_command_queue queue,
+                cl_uint num_events_in_wait_list,
+                const cl_event *event_wait_list,
+                cl_event* event)
+{
+  enqueue_data data = { 0 };
+  cl_event e;
+
+  e = cl_event_new(queue->ctx, queue, CL_COMMAND_BARRIER, CL_TRUE);
+  if(e == NULL)
+    return CL_OUT_OF_HOST_MEMORY;
+
+  if(event != NULL ){
+    *event = e;
+  }
+  //Enqueues a barrier command which waits either for the given list of events to
+  //complete or, if the list is empty, for all commands previously enqueued in
+  //command_queue to complete before it completes.
+  if(num_events_in_wait_list > 0){
+    if(cl_event_wait_events(num_events_in_wait_list, event_wait_list, queue) == CL_ENQUEUE_EXECUTE_DEFER) {
+      data.type = EnqueueBarrier;
+      cl_event_new_enqueue_callback(e, &data, num_events_in_wait_list, event_wait_list);
+      return CL_SUCCESS;
+    }
+  } else if(queue->wait_events_num > 0) {
+    data.type = EnqueueBarrier;
+    cl_event_new_enqueue_callback(e, &data, queue->wait_events_num, queue->wait_events);
+    return CL_SUCCESS;
+  }
+
+  if(queue->last_event && queue->last_event->gpgpu_event) {
+    cl_gpgpu_event_update_status(queue->last_event->gpgpu_event, 1);
+  }
+
+  cl_event_set_status(e, CL_COMPLETE);
+  return CL_SUCCESS;
+}
+
+cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name)
+{
+  cl_ulong ret_val = 0;
+  GET_QUEUE_THREAD_GPGPU(event->queue);
+
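+  /* Profiling values are cached in event->timestamp[], indexed by
+     param_name - CL_PROFILING_COMMAND_QUEUED (QUEUED, SUBMIT, START, END).
+     QUEUED/SUBMIT are sampled from the current GPU timestamp; START/END come
+     from the command's execution timestamps once a gpgpu_event exists. */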
+  if (!event->gpgpu_event) {
+    cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val);
+    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
+    return CL_SUCCESS;
+  }
+
+  if(param_name == CL_PROFILING_COMMAND_SUBMIT ||
+         param_name == CL_PROFILING_COMMAND_QUEUED) {
+    cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val);
+    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
+    return CL_SUCCESS;
+  } else if(param_name == CL_PROFILING_COMMAND_START) {
+    cl_gpgpu_event_get_exec_timestamp(gpgpu, event->gpgpu_event, 0, &ret_val);
+    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
+    return CL_SUCCESS;
+  } else if (param_name == CL_PROFILING_COMMAND_END) {
+    cl_gpgpu_event_get_exec_timestamp(gpgpu, event->gpgpu_event, 1, &ret_val);
+    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
+    return CL_SUCCESS;
+  }
+  return CL_INVALID_VALUE;
+}
+
+cl_int cl_event_insert_user_event(user_event** p_u_ev, cl_event event)
+{
+  user_event * u_iter = *p_u_ev;
+  user_event * u_ev;
+
+  while(u_iter)
+  {
+    if(u_iter->event == event)
+      return CL_SUCCESS;
+    u_iter = u_iter->next;
+  }
+
+  TRY_ALLOC_NO_ERR (u_ev, CALLOC(user_event));
+  u_ev->event = event;
+  u_ev->next = *p_u_ev;
+  *p_u_ev = u_ev;
+
+  return CL_SUCCESS;
+error:
+  return CL_OUT_OF_HOST_MEMORY;  /* the CALLOC above failed */
+}
+
+cl_int cl_event_remove_user_event(user_event** p_u_ev, cl_event event)
+{
+  user_event * u_iter = *p_u_ev;
+  user_event * u_prev = *p_u_ev;
+
+  while(u_iter){
+    if(u_iter->event == event ){
+      if(u_iter == *p_u_ev){
+        *p_u_ev = u_iter->next;
+      }else{
+        u_prev->next = u_iter->next;
+      }
+      cl_free(u_iter);
+      break;
+    }
+    u_prev = u_iter;
+    u_iter = u_iter->next;
+  }
+
+  return CL_SUCCESS;
+}
diff --git a/src/cl_event.h b/src/cl_event.h
new file mode 100644
index 0000000..cfe5ddd
--- /dev/null
+++ b/src/cl_event.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_EVENT_H__
+#define __CL_EVENT_H__
+
+#include <semaphore.h>
+
+#include "cl_internals.h"
+#include "cl_driver.h"
+#include "cl_enqueue.h"
+#include "CL/cl.h"
+
+#define CL_ENQUEUE_EXECUTE_IMM   0
+#define CL_ENQUEUE_EXECUTE_DEFER 1
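+/* Return values of cl_event_wait_events(): IMM means the operation can be
+   executed right away; DEFER means some event in the wait list (a user event,
+   or an event that itself waits on one) is still pending, so the operation is
+   recorded as an enqueue_callback and replayed later from cl_event_set_status(). */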
+
+typedef struct _user_event {
+  cl_event            event;   /* The user event */
+  struct _user_event* next;    /* Next user event in list */
+} user_event;
+
+typedef struct _enqueue_callback {
+  cl_event           event;            /* The event this enqueue callback belongs to */
+  enqueue_data       data;             /* Holds all of the enqueue callback's information */
+  cl_uint            num_events;       /* Number of events in the wait list */
+  cl_event*          wait_list;        /* The events in the wait list this callback waits on */
+  user_event*        wait_user_events; /* Head of the user event list this callback waits on */
+  struct _enqueue_callback*  next;     /* The next enqueue callback in the wait list */
+} enqueue_callback;
+
+typedef void (CL_CALLBACK *EVENT_NOTIFY)(cl_event event, cl_int event_command_exec_status, void *user_data);
+
+typedef struct _user_callback {
+  cl_int            status;     /* The execution status */
+  cl_bool           executed;   /* Indicates whether the callback function has been called */
+  EVENT_NOTIFY      pfn_notify; /* Callback function */
+  void*             user_data;  /* Callback user data */
+  struct _user_callback*    next;       /* Next event callback in list */
+} user_callback;
+
+struct _cl_event {
+  DEFINE_ICD(dispatch)
+  uint64_t           magic;       /* To identify it as an event object */
+  volatile int       ref_n;       /* We reference count this object */
+  cl_context         ctx;         /* The context associated with the event */
+  cl_event           prev, next;  /* We chain the events together */
+  cl_command_queue   queue;       /* The command queue associated with the event */
+  cl_command_type    type;        /* The command type associated with the event */
+  cl_int             status;      /* The execution status */
+  cl_gpgpu           gpgpu;       /* Current gpgpu, owned by this structure. */
+  cl_gpgpu_event     gpgpu_event; /* The event object used to communicate with the hardware */
+  user_callback*     user_cb;     /* The event callback functions */
+  enqueue_callback*  enqueue_cb;  /* This event's deferred enqueue */
+  enqueue_callback*  waits_head;  /* Head of the list of enqueues waiting on this event */
+  cl_bool            emplict;     /* Whether this event was explicitly created through the API */
+  cl_ulong           timestamp[4];/* The time stamps for profiling. */
+};
+
+/* Create a new event object */
+cl_event cl_event_new(cl_context, cl_command_queue, cl_command_type, cl_bool);
+/* Unref the object and delete it if no more reference on it */
+void cl_event_delete(cl_event);
+/* Add one more reference to this object */
+void cl_event_add_ref(cl_event);
+/* Register a user callback function for a specific command execution status */
+cl_int cl_event_set_callback(cl_event, cl_int, EVENT_NOTIFY, void *);
+/* Check the event wait list of an enqueue command */
+cl_int cl_event_check_waitlist(cl_uint, const cl_event *, cl_event *, cl_context);
+/* Wait for all events in the wait list to complete */
+cl_int cl_event_wait_events(cl_uint, const cl_event *, cl_command_queue);
+/* Create a deferred (suspended) enqueue task */
+void cl_event_new_enqueue_callback(cl_event, enqueue_data *, cl_uint, const cl_event *);
+/* Set the event status and call all callbacks */
+void cl_event_set_status(cl_event, cl_int);
+/* Check and update event status */
+void cl_event_update_status(cl_event, cl_int);
+/* Create the marker event */
+cl_int cl_event_marker_with_wait_list(cl_command_queue, cl_uint, const cl_event *,  cl_event*);
+/* Create the barrier event */
+cl_int cl_event_barrier_with_wait_list(cl_command_queue, cl_uint, const cl_event *,  cl_event*);
+/* Do the event profiling */
+cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name);
+/* insert the user event */
+cl_int cl_event_insert_user_event(user_event** p_u_ev, cl_event event);
+/* remove the user event */
+cl_int cl_event_remove_user_event(user_event** p_u_ev, cl_event event);
+/* flush the event's pending gpgpu batch buffer and notify driver this gpgpu event has been flushed. */
+void cl_event_flush(cl_event event);
+#endif /* __CL_EVENT_H__ */
+
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
new file mode 100644
index 0000000..d07a525
--- /dev/null
+++ b/src/cl_extensions.c
@@ -0,0 +1,107 @@
+#ifdef HAS_EGL
+#include "EGL/egl.h"
+#include "EGL/eglext.h"
+#endif
+
+#include "cl_platform_id.h"
+#include "cl_internals.h"
+#include "CL/cl.h"
+#include "cl_utils.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+static struct cl_extensions intel_extensions =
+{
+  {
+#define DECL_EXT(name) \
+  {(struct cl_extension_base){.ext_id = cl_##name##_ext_id, .ext_name = "cl_" #name, .ext_enabled = 0}},
+  DECL_ALL_EXTENSIONS
+  },
+#undef DECL_EXT
+  {""}
+};
+
+void check_basic_extension(cl_extensions_t *extensions)
+{
+  int id;
+  for(id = BASE_EXT_START_ID; id <= BASE_EXT_END_ID; id++)
+    if (id != EXT_ID(khr_fp64))
+      extensions->extensions[id].base.ext_enabled = 1;
+}
+
+void check_opt1_extension(cl_extensions_t *extensions)
+{
+  int id;
+  for(id = OPT1_EXT_START_ID; id <= OPT1_EXT_END_ID; id++)
+    if (id == EXT_ID(khr_icd))
+      extensions->extensions[id].base.ext_enabled = 1;
+}
+
+void
+check_gl_extension(cl_extensions_t *extensions) {
+#if defined(HAS_EGL)
+  int id;
+  /* For now, we only support cl_khr_gl_sharing. */
+  for(id = GL_EXT_START_ID; id <= GL_EXT_END_ID; id++)
+    if (id == EXT_ID(khr_gl_sharing))
+      extensions->extensions[id].base.ext_enabled = 1;
+#endif
+}
+
+void
+check_intel_extension(cl_extensions_t *extensions)
+{
+  /* Should put those map/unmap extensions here. */
+}
+
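+/* Build the space-separated extension string advertised by the platform:
+   walk every extension id and append the name of each enabled one to
+   ext_str, truncating when the buffer would overflow. */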
+void
+process_extension_str(cl_extensions_t *extensions)
+{
+  int str_max = sizeof(extensions->ext_str);
+  int str_offset = 0;
+  int id;
+
+  extensions->ext_str[str_max - 1] = '\0';  /* keep within the bounds of ext_str */
+
+  for(id = 0; id < cl_khr_extension_id_max; id++)
+  {
+    if (extensions->extensions[id].base.ext_enabled) {
+      int copy_len;
+      char *ext_name = extensions->extensions[id].base.ext_name;
+      if (str_offset + 1 >= str_max)
+        return;
+
+      if (str_offset != 0)
+        extensions->ext_str[str_offset - 1] = ' ';
+      copy_len = (strlen(ext_name) + 1 + str_offset) < str_max
+                 ? (strlen(ext_name) + 1) : (str_max - str_offset - 1);
+      strncpy(&extensions->ext_str[str_offset],
+              extensions->extensions[id].base.ext_name, copy_len);
+      str_offset += copy_len;
+    }
+  }
+}
+
+LOCAL void
+cl_intel_platform_extension_init(cl_platform_id intel_platform)
+{
+  static int initialized = 0;
+
+  if (initialized) {
+    intel_platform->internal_extensions = &intel_extensions;
+    intel_platform->extensions = intel_extensions.ext_str;
+    return;
+  }
+  check_basic_extension(&intel_extensions);
+  check_opt1_extension(&intel_extensions);
+  check_gl_extension(&intel_extensions);
+  check_intel_extension(&intel_extensions);
+  process_extension_str(&intel_extensions);
+
+  intel_platform->internal_extensions = &intel_extensions;
+  intel_platform->extensions = intel_extensions.ext_str;
+
+  initialized = 1;
+  return;
+}
diff --git a/src/cl_extensions.h b/src/cl_extensions.h
new file mode 100644
index 0000000..52ee0a4
--- /dev/null
+++ b/src/cl_extensions.h
@@ -0,0 +1,99 @@
+/* The following approved Khronos extension
+ * names must be returned by all devices that
+ * support OpenCL C 1.2. */
+#define DECL_BASE_EXTENSIONS \
+  DECL_EXT(khr_global_int32_base_atomics) \
+  DECL_EXT(khr_global_int32_extended_atomics) \
+  DECL_EXT(khr_local_int32_base_atomics) \
+  DECL_EXT(khr_local_int32_extended_atomics) \
+  DECL_EXT(khr_byte_addressable_store) \
+  DECL_EXT(khr_fp64)
+
+/* The OPT1 extensions are those optional extensions
+ * which don't have external dependencies. */
+#define DECL_OPT1_EXTENSIONS \
+  DECL_EXT(khr_int64_base_atomics)\
+  DECL_EXT(khr_int64_extended_atomics)\
+  DECL_EXT(khr_3d_image_writes)\
+  DECL_EXT(khr_fp16)\
+  DECL_EXT(khr_image2d_from_buffer)\
+  DECL_EXT(khr_initialize_memory)\
+  DECL_EXT(khr_context_abort)\
+  DECL_EXT(khr_depth_images)\
+  DECL_EXT(khr_spir) \
+  DECL_EXT(khr_icd)
+
+#define DECL_GL_EXTENSIONS \
+  DECL_EXT(khr_gl_sharing)\
+  DECL_EXT(khr_gl_event)\
+  DECL_EXT(khr_gl_depth_images)\
+  DECL_EXT(khr_gl_msaa_sharing)
+
+#define DECL_D3D_EXTENSIONS \
+  DECL_EXT(khr_d3d10_sharing)\
+  DECL_EXT(khr_dx9_media_sharing)\
+  DECL_EXT(khr_d3d11_sharing)\
+
+#define DECL_ALL_EXTENSIONS \
+  DECL_BASE_EXTENSIONS \
+  DECL_OPT1_EXTENSIONS \
+  DECL_GL_EXTENSIONS \
+  DECL_D3D_EXTENSIONS
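+/* DECL_EXT() is an X-macro: it is (re)defined before each expansion of the
+   lists above to generate, in turn, the extension enum ids, one struct per
+   extension, and the members of extension_union below. */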
+
+#define EXT_ID(name) cl_ ## name ## _ext_id
+#define EXT_STRUCT_NAME(name) cl_ ## name ## ext
+/* Declare the enum ids */
+typedef enum {
+#define DECL_EXT(name) EXT_ID(name),
+DECL_ALL_EXTENSIONS
+#undef DECL_EXT
+cl_khr_extension_id_max
+} cl_extension_enum;
+
+#define BASE_EXT_START_ID EXT_ID(khr_global_int32_base_atomics)
+#define BASE_EXT_END_ID EXT_ID(khr_fp64)
+#define OPT1_EXT_START_ID EXT_ID(khr_int64_base_atomics)
+#define OPT1_EXT_END_ID EXT_ID(khr_icd)
+#define GL_EXT_START_ID EXT_ID(khr_gl_sharing)
+#define GL_EXT_END_ID EXT_ID(khr_gl_msaa_sharing)
+
+#define IS_BASE_EXTENSION(id)  (id >= BASE_EXT_START_ID && id <= BASE_EXT_END_ID)
+#define IS_OPT1_EXTENSION(id)  (id >= OPT1_EXT_START_ID && id <= OPT1_EXT_END_ID)
+#define IS_GL_EXTENSION(id)    (id >= GL_EXT_START_ID && id <= GL_EXT_END_ID)
+
+struct cl_extension_base {
+  cl_extension_enum ext_id;
+  int  ext_enabled;
+  char *ext_name;
+};
+
+/* Declare each extension structure. */
+#define DECL_EXT(name) \
+struct EXT_STRUCT_NAME(name) { \
+  struct cl_extension_base base;\
+};
+
+DECL_BASE_EXTENSIONS
+DECL_OPT1_EXTENSIONS
+DECL_D3D_EXTENSIONS
+DECL_GL_EXTENSIONS
+#undef DECL_EXT
+
+/* Union all extensions together. */
+typedef union {
+  struct cl_extension_base base;
+  #define DECL_EXT(name) struct EXT_STRUCT_NAME(name) EXT_STRUCT_NAME(name);
+  DECL_ALL_EXTENSIONS
+  #undef DECL_EXT
+} extension_union;
+
+typedef struct cl_extensions {
+  extension_union extensions[cl_khr_extension_id_max];
+  char ext_str[256];
+} cl_extensions_t;
+
+struct _cl_platform_id;
+typedef struct _cl_platform_id * cl_platform_id;
+
+extern void
+cl_intel_platform_extension_init(cl_platform_id intel_platform);
diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp
new file mode 100644
index 0000000..7da0475
--- /dev/null
+++ b/src/cl_gbe_loader.cpp
@@ -0,0 +1,328 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include <iostream>
+#include <dlfcn.h>
+#include <string.h>
+#include <stdio.h>
+#include "cl_gbe_loader.h"
+#include "backend/src/GBEConfig.h"
+
+//function pointers resolved from libgbe.so (the compiler)
+gbe_program_new_from_source_cb *compiler_program_new_from_source = NULL;
+gbe_program_compile_from_source_cb *compiler_program_compile_from_source = NULL;
+gbe_program_new_gen_program_cb *compiler_program_new_gen_program = NULL;
+gbe_program_link_program_cb *compiler_program_link_program = NULL;
+gbe_program_build_from_llvm_cb *compiler_program_build_from_llvm = NULL;
+gbe_program_new_from_llvm_binary_cb *compiler_program_new_from_llvm_binary = NULL;
+gbe_program_serialize_to_binary_cb *compiler_program_serialize_to_binary = NULL;
+gbe_program_new_from_llvm_cb *compiler_program_new_from_llvm = NULL;
+gbe_program_clean_llvm_resource_cb *compiler_program_clean_llvm_resource = NULL;
+
+//function pointers resolved from libgbeinterp.so (the interpreter)
+gbe_program_new_from_binary_cb *interp_program_new_from_binary = NULL;
+gbe_program_get_global_constant_size_cb *interp_program_get_global_constant_size = NULL;
+gbe_program_get_global_constant_data_cb *interp_program_get_global_constant_data = NULL;
+gbe_program_delete_cb *interp_program_delete = NULL;
+gbe_program_get_kernel_num_cb *interp_program_get_kernel_num = NULL;
+gbe_program_get_kernel_by_name_cb *interp_program_get_kernel_by_name = NULL;
+gbe_program_get_kernel_cb *interp_program_get_kernel = NULL;
+gbe_kernel_get_name_cb *interp_kernel_get_name = NULL;
+gbe_kernel_get_attributes_cb *interp_kernel_get_attributes = NULL;
+gbe_kernel_get_code_cb *interp_kernel_get_code = NULL;
+gbe_kernel_get_code_size_cb *interp_kernel_get_code_size = NULL;
+gbe_kernel_get_arg_num_cb *interp_kernel_get_arg_num = NULL;
+gbe_kernel_get_arg_size_cb *interp_kernel_get_arg_size = NULL;
+gbe_kernel_get_arg_bti_cb *interp_kernel_get_arg_bti = NULL;
+gbe_kernel_get_arg_type_cb *interp_kernel_get_arg_type = NULL;
+gbe_kernel_get_arg_align_cb *interp_kernel_get_arg_align = NULL;
+gbe_kernel_get_simd_width_cb *interp_kernel_get_simd_width = NULL;
+gbe_kernel_get_curbe_offset_cb *interp_kernel_get_curbe_offset = NULL;
+gbe_kernel_get_curbe_size_cb *interp_kernel_get_curbe_size = NULL;
+gbe_kernel_get_stack_size_cb *interp_kernel_get_stack_size = NULL;
+gbe_kernel_get_scratch_size_cb *interp_kernel_get_scratch_size = NULL;
+gbe_kernel_get_required_work_group_size_cb *interp_kernel_get_required_work_group_size = NULL;
+gbe_kernel_use_slm_cb *interp_kernel_use_slm = NULL;
+gbe_kernel_get_slm_size_cb *interp_kernel_get_slm_size = NULL;
+gbe_kernel_get_sampler_size_cb *interp_kernel_get_sampler_size = NULL;
+gbe_kernel_get_sampler_data_cb *interp_kernel_get_sampler_data = NULL;
+gbe_kernel_get_compile_wg_size_cb *interp_kernel_get_compile_wg_size = NULL;
+gbe_kernel_get_image_size_cb *interp_kernel_get_image_size = NULL;
+gbe_kernel_get_image_data_cb *interp_kernel_get_image_data = NULL;
+gbe_get_printf_num_cb* interp_get_printf_num = NULL;
+gbe_get_printf_buf_bti_cb* interp_get_printf_buf_bti = NULL;
+gbe_get_printf_indexbuf_bti_cb* interp_get_printf_indexbuf_bti = NULL;
+gbe_dup_printfset_cb* interp_dup_printfset = NULL;
+gbe_get_printf_sizeof_size_cb* interp_get_printf_sizeof_size = NULL;
+gbe_release_printf_info_cb* interp_release_printf_info = NULL;
+gbe_output_printf_cb* interp_output_printf = NULL;
+gbe_kernel_get_arg_info_cb *interp_kernel_get_arg_info = NULL;
+
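+/* Both libgbe (the compiler) and libgbeinterp (the interpreter) export their
+   entry points as global function-pointer variables, so each dlsym() below
+   returns the address of such a variable and is dereferenced once to obtain
+   the callback itself.  A missing interpreter is reported as an error; the
+   compiler is optional and can be skipped by setting OCL_NON_COMPILER=1. */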
+struct GbeLoaderInitializer
+{
+  GbeLoaderInitializer()
+  {
+    dlhCompiler = NULL;  /* the destructor dlcloses these, even if loading bails out early */
+    dlhInterp = NULL;
+
+    LoadCompiler();
+
+    const char* path;
+    if (!LoadInterp(path))
+      std::cerr << "unable to load " << path << " which is part of the driver, please check!" << std::endl;
+  }
+
+  bool LoadInterp(const char*& path)
+  {
+    const char* interpPath = getenv("OCL_INTERP_PATH");
+    if (interpPath == NULL)
+      interpPath = INTERP_OBJECT_DIR;
+
+    path = interpPath;
+
+    dlhInterp = dlopen(interpPath, RTLD_LAZY | RTLD_LOCAL);
+    if (dlhInterp == NULL) {
+      return false;
+    }
+
+    interp_program_new_from_binary = *(gbe_program_new_from_binary_cb**)dlsym(dlhInterp, "gbe_program_new_from_binary");
+    if (interp_program_new_from_binary == NULL)
+      return false;
+
+    interp_program_get_global_constant_size = *(gbe_program_get_global_constant_size_cb**)dlsym(dlhInterp, "gbe_program_get_global_constant_size");
+    if (interp_program_get_global_constant_size == NULL)
+      return false;
+
+    interp_program_get_global_constant_data = *(gbe_program_get_global_constant_data_cb**)dlsym(dlhInterp, "gbe_program_get_global_constant_data");
+    if (interp_program_get_global_constant_data == NULL)
+      return false;
+
+    interp_program_delete = *(gbe_program_delete_cb**)dlsym(dlhInterp, "gbe_program_delete");
+    if (interp_program_delete == NULL)
+      return false;
+
+    interp_program_get_kernel_num = *(gbe_program_get_kernel_num_cb**)dlsym(dlhInterp, "gbe_program_get_kernel_num");
+    if (interp_program_get_kernel_num == NULL)
+      return false;
+
+    interp_program_get_kernel_by_name = *(gbe_program_get_kernel_by_name_cb**)dlsym(dlhInterp, "gbe_program_get_kernel_by_name");
+    if (interp_program_get_kernel_by_name == NULL)
+      return false;
+
+    interp_program_get_kernel = *(gbe_program_get_kernel_cb**)dlsym(dlhInterp, "gbe_program_get_kernel");
+    if (interp_program_get_kernel == NULL)
+      return false;
+
+    interp_kernel_get_name = *(gbe_kernel_get_name_cb**)dlsym(dlhInterp, "gbe_kernel_get_name");
+    if (interp_kernel_get_name == NULL)
+      return false;
+
+    interp_kernel_get_attributes = *(gbe_kernel_get_attributes_cb**)dlsym(dlhInterp, "gbe_kernel_get_attributes");
+    if (interp_kernel_get_attributes == NULL)
+      return false;
+
+    interp_kernel_get_code = *(gbe_kernel_get_code_cb**)dlsym(dlhInterp, "gbe_kernel_get_code");
+    if (interp_kernel_get_code == NULL)
+      return false;
+
+    interp_kernel_get_code_size = *(gbe_kernel_get_code_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_code_size");
+    if (interp_kernel_get_code_size == NULL)
+      return false;
+
+    interp_kernel_get_arg_num = *(gbe_kernel_get_arg_num_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_num");
+    if (interp_kernel_get_arg_num == NULL)
+      return false;
+
+    interp_kernel_get_arg_size = *(gbe_kernel_get_arg_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_size");
+    if (interp_kernel_get_arg_size == NULL)
+      return false;
+
+    interp_kernel_get_arg_bti = *(gbe_kernel_get_arg_bti_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_bti");
+    if (interp_kernel_get_arg_bti == NULL)
+      return false;
+
+    interp_kernel_get_arg_type = *(gbe_kernel_get_arg_type_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_type");
+    if (interp_kernel_get_arg_type == NULL)
+      return false;
+
+    interp_kernel_get_arg_align = *(gbe_kernel_get_arg_align_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_align");
+    if (interp_kernel_get_arg_align == NULL)
+      return false;
+
+    interp_kernel_get_simd_width = *(gbe_kernel_get_simd_width_cb**)dlsym(dlhInterp, "gbe_kernel_get_simd_width");
+    if (interp_kernel_get_simd_width == NULL)
+      return false;
+
+    interp_kernel_get_curbe_offset = *(gbe_kernel_get_curbe_offset_cb**)dlsym(dlhInterp, "gbe_kernel_get_curbe_offset");
+    if (interp_kernel_get_curbe_offset == NULL)
+      return false;
+
+    interp_kernel_get_curbe_size = *(gbe_kernel_get_curbe_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_curbe_size");
+    if (interp_kernel_get_curbe_size == NULL)
+      return false;
+
+    interp_kernel_get_stack_size = *(gbe_kernel_get_stack_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_stack_size");
+    if (interp_kernel_get_stack_size == NULL)
+      return false;
+
+    interp_kernel_get_scratch_size = *(gbe_kernel_get_scratch_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_scratch_size");
+    if (interp_kernel_get_scratch_size == NULL)
+      return false;
+
+    interp_kernel_get_required_work_group_size = *(gbe_kernel_get_required_work_group_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_required_work_group_size");
+    if (interp_kernel_get_required_work_group_size == NULL)
+      return false;
+
+    interp_kernel_use_slm = *(gbe_kernel_use_slm_cb**)dlsym(dlhInterp, "gbe_kernel_use_slm");
+    if (interp_kernel_use_slm == NULL)
+      return false;
+
+    interp_kernel_get_slm_size = *(gbe_kernel_get_slm_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_slm_size");
+    if (interp_kernel_get_slm_size == NULL)
+      return false;
+
+    interp_kernel_get_sampler_size = *(gbe_kernel_get_sampler_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_sampler_size");
+    if (interp_kernel_get_sampler_size == NULL)
+      return false;
+
+    interp_kernel_get_sampler_data = *(gbe_kernel_get_sampler_data_cb**)dlsym(dlhInterp, "gbe_kernel_get_sampler_data");
+    if (interp_kernel_get_sampler_data == NULL)
+      return false;
+
+    interp_kernel_get_compile_wg_size = *(gbe_kernel_get_compile_wg_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_compile_wg_size");
+    if (interp_kernel_get_compile_wg_size == NULL)
+      return false;
+
+    interp_kernel_get_image_size = *(gbe_kernel_get_image_size_cb**)dlsym(dlhInterp, "gbe_kernel_get_image_size");
+    if (interp_kernel_get_image_size == NULL)
+      return false;
+
+    interp_kernel_get_image_data = *(gbe_kernel_get_image_data_cb**)dlsym(dlhInterp, "gbe_kernel_get_image_data");
+    if (interp_kernel_get_image_data == NULL)
+      return false;
+
+    interp_get_printf_num = *(gbe_get_printf_num_cb**)dlsym(dlhInterp, "gbe_get_printf_num");
+    if (interp_get_printf_num == NULL)
+      return false;
+
+    interp_get_printf_buf_bti = *(gbe_get_printf_buf_bti_cb**)dlsym(dlhInterp, "gbe_get_printf_buf_bti");
+    if (interp_get_printf_buf_bti == NULL)
+      return false;
+
+    interp_get_printf_indexbuf_bti = *(gbe_get_printf_indexbuf_bti_cb**)dlsym(dlhInterp, "gbe_get_printf_indexbuf_bti");
+    if (interp_get_printf_indexbuf_bti == NULL)
+      return false;
+
+    interp_dup_printfset = *(gbe_dup_printfset_cb**)dlsym(dlhInterp, "gbe_dup_printfset");
+    if (interp_dup_printfset == NULL)
+      return false;
+
+    interp_get_printf_sizeof_size = *(gbe_get_printf_sizeof_size_cb**)dlsym(dlhInterp, "gbe_get_printf_sizeof_size");
+    if (interp_get_printf_sizeof_size == NULL)
+      return false;
+
+    interp_release_printf_info = *(gbe_release_printf_info_cb**)dlsym(dlhInterp, "gbe_release_printf_info");
+    if (interp_release_printf_info == NULL)
+      return false;
+
+    interp_output_printf = *(gbe_output_printf_cb**)dlsym(dlhInterp, "gbe_output_printf");
+    if (interp_output_printf == NULL)
+      return false;
+
+    interp_kernel_get_arg_info = *(gbe_kernel_get_arg_info_cb**)dlsym(dlhInterp, "gbe_kernel_get_arg_info");
+    if (interp_kernel_get_arg_info == NULL)
+      return false;
+
+    return true;
+  }
+
+  void LoadCompiler()
+  {
+    compilerLoaded = false;
+
+    const char* nonCompiler = getenv("OCL_NON_COMPILER");
+    if (nonCompiler != NULL) {
+      if (strcmp(nonCompiler, "1") == 0)
+        return;
+    }
+
+    const char* gbePath = getenv("OCL_GBE_PATH");
+    if (gbePath == NULL)
+      gbePath = GBE_OBJECT_DIR;
+
+    dlhCompiler = dlopen(gbePath, RTLD_LAZY | RTLD_LOCAL);
+    if (dlhCompiler != NULL) {
+      compiler_program_new_from_source = *(gbe_program_new_from_source_cb **)dlsym(dlhCompiler, "gbe_program_new_from_source");
+      if (compiler_program_new_from_source == NULL)
+        return;
+
+      compiler_program_compile_from_source = *(gbe_program_compile_from_source_cb **)dlsym(dlhCompiler, "gbe_program_compile_from_source");
+      if (compiler_program_compile_from_source == NULL)
+        return;
+
+      compiler_program_new_gen_program = *(gbe_program_new_gen_program_cb **)dlsym(dlhCompiler, "gbe_program_new_gen_program");
+      if (compiler_program_new_gen_program == NULL)
+        return;
+
+      compiler_program_link_program = *(gbe_program_link_program_cb **)dlsym(dlhCompiler, "gbe_program_link_program");
+      if (compiler_program_link_program == NULL)
+        return;
+
+      compiler_program_build_from_llvm = *(gbe_program_build_from_llvm_cb **)dlsym(dlhCompiler, "gbe_program_build_from_llvm");
+      if (compiler_program_build_from_llvm == NULL)
+        return;
+
+      compiler_program_new_from_llvm_binary = *(gbe_program_new_from_llvm_binary_cb **)dlsym(dlhCompiler, "gbe_program_new_from_llvm_binary");
+      if (compiler_program_new_from_llvm_binary == NULL)
+        return;
+
+      compiler_program_serialize_to_binary = *(gbe_program_serialize_to_binary_cb **)dlsym(dlhCompiler, "gbe_program_serialize_to_binary");
+      if (compiler_program_serialize_to_binary == NULL)
+        return;
+
+      compiler_program_new_from_llvm = *(gbe_program_new_from_llvm_cb **)dlsym(dlhCompiler, "gbe_program_new_from_llvm");
+      if (compiler_program_new_from_llvm == NULL)
+        return;
+
+      compiler_program_clean_llvm_resource = *(gbe_program_clean_llvm_resource_cb **)dlsym(dlhCompiler, "gbe_program_clean_llvm_resource");
+      if (compiler_program_clean_llvm_resource == NULL)
+        return;
+
+      compilerLoaded = true;
+    }
+  }
+
+  ~GbeLoaderInitializer()
+  {
+    if (dlhCompiler != NULL)
+      dlclose(dlhCompiler);
+
+    if (dlhInterp != NULL)
+      dlclose(dlhInterp);
+  }
+
+  bool compilerLoaded;
+  void *dlhCompiler;
+  void *dlhInterp;
+};
+
+static struct GbeLoaderInitializer gbeLoader;
+
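+/* Reports whether the optional compiler half (libgbe) was resolved by the
+   static gbeLoader instance above; an interpreter-only configuration is still
+   expected to run pre-built binaries but cannot build kernels from source. */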
+int CompilerSupported()
+{
+  if (gbeLoader.compilerLoaded)
+    return 1;
+  else
+    return 0;
+}
diff --git a/src/cl_gbe_loader.h b/src/cl_gbe_loader.h
new file mode 100644
index 0000000..da9d034
--- /dev/null
+++ b/src/cl_gbe_loader.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef __CL_GBE_LOADER_H__
+#define __CL_GBE_LOADER_H__
+
+#include "program.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern gbe_program_new_from_source_cb *compiler_program_new_from_source;
+extern gbe_program_compile_from_source_cb *compiler_program_compile_from_source;
+extern gbe_program_new_gen_program_cb *compiler_program_new_gen_program;
+extern gbe_program_link_program_cb *compiler_program_link_program;
+extern gbe_program_build_from_llvm_cb *compiler_program_build_from_llvm;
+extern gbe_program_new_from_llvm_binary_cb *compiler_program_new_from_llvm_binary;
+extern gbe_program_serialize_to_binary_cb *compiler_program_serialize_to_binary;
+extern gbe_program_new_from_llvm_cb *compiler_program_new_from_llvm;
+extern gbe_program_clean_llvm_resource_cb *compiler_program_clean_llvm_resource;
+
+extern gbe_program_new_from_binary_cb *interp_program_new_from_binary;
+extern gbe_program_get_global_constant_size_cb *interp_program_get_global_constant_size;
+extern gbe_program_get_global_constant_data_cb *interp_program_get_global_constant_data;
+extern gbe_program_delete_cb *interp_program_delete;
+extern gbe_program_get_kernel_num_cb *interp_program_get_kernel_num;
+extern gbe_program_get_kernel_by_name_cb *interp_program_get_kernel_by_name;
+extern gbe_program_get_kernel_cb *interp_program_get_kernel;
+extern gbe_kernel_get_name_cb *interp_kernel_get_name;
+extern gbe_kernel_get_attributes_cb *interp_kernel_get_attributes;
+extern gbe_kernel_get_code_cb *interp_kernel_get_code;
+extern gbe_kernel_get_code_size_cb *interp_kernel_get_code_size;
+extern gbe_kernel_get_arg_num_cb *interp_kernel_get_arg_num;
+extern gbe_kernel_get_arg_size_cb *interp_kernel_get_arg_size;
+extern gbe_kernel_get_arg_bti_cb *interp_kernel_get_arg_bti;
+extern gbe_kernel_get_arg_type_cb *interp_kernel_get_arg_type;
+extern gbe_kernel_get_arg_align_cb *interp_kernel_get_arg_align;
+extern gbe_kernel_get_simd_width_cb *interp_kernel_get_simd_width;
+extern gbe_kernel_get_curbe_offset_cb *interp_kernel_get_curbe_offset;
+extern gbe_kernel_get_curbe_size_cb *interp_kernel_get_curbe_size;
+extern gbe_kernel_get_stack_size_cb *interp_kernel_get_stack_size;
+extern gbe_kernel_get_scratch_size_cb *interp_kernel_get_scratch_size;
+extern gbe_kernel_get_required_work_group_size_cb *interp_kernel_get_required_work_group_size;
+extern gbe_kernel_use_slm_cb *interp_kernel_use_slm;
+extern gbe_kernel_get_slm_size_cb *interp_kernel_get_slm_size;
+extern gbe_kernel_get_sampler_size_cb *interp_kernel_get_sampler_size;
+extern gbe_kernel_get_sampler_data_cb *interp_kernel_get_sampler_data;
+extern gbe_kernel_get_compile_wg_size_cb *interp_kernel_get_compile_wg_size;
+extern gbe_kernel_get_image_size_cb *interp_kernel_get_image_size;
+extern gbe_kernel_get_image_data_cb *interp_kernel_get_image_data;
+extern gbe_get_printf_num_cb* interp_get_printf_num;
+extern gbe_get_printf_buf_bti_cb* interp_get_printf_buf_bti;
+extern gbe_get_printf_indexbuf_bti_cb* interp_get_printf_indexbuf_bti;
+extern gbe_dup_printfset_cb* interp_dup_printfset;
+extern gbe_get_printf_sizeof_size_cb* interp_get_printf_sizeof_size;
+extern gbe_release_printf_info_cb* interp_release_printf_info;
+extern gbe_output_printf_cb* interp_output_printf;
+extern gbe_kernel_get_arg_info_cb *interp_kernel_get_arg_info;
+
+int CompilerSupported();
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CL_GBE_LOADER_H__ */
diff --git a/src/cl_gen75_device.h b/src/cl_gen75_device.h
new file mode 100644
index 0000000..682ee06
--- /dev/null
+++ b/src/cl_gen75_device.h
@@ -0,0 +1,30 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/* Common fields for both HSW devices (either GT1 or GT2)
+ */
+.max_parameter_size = 1024, 
+.global_mem_cache_line_size = 128, /* XXX */
+.global_mem_cache_size = 8 << 10, /* XXX */
+.local_mem_type = CL_GLOBAL,
+.local_mem_size = 64 << 10,
+.scratch_mem_size = 2 << 20,
+
+#include "cl_gt_device.h"
+
diff --git a/src/cl_gen7_device.h b/src/cl_gen7_device.h
new file mode 100644
index 0000000..69cc0b9
--- /dev/null
+++ b/src/cl_gen7_device.h
@@ -0,0 +1,29 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/* Common fields for both IVB devices (either GT1 or GT2) */
+.max_parameter_size = 1024, 
+.global_mem_cache_line_size = 128, /* XXX */
+.global_mem_cache_size = 8 << 10, /* XXX */
+.local_mem_type = CL_GLOBAL,
+.local_mem_size = 64 << 10,
+.scratch_mem_size = 12 << 10,
+
+#include "cl_gt_device.h"
+
diff --git a/src/cl_gl_api.c b/src/cl_gl_api.c
new file mode 100644
index 0000000..04dde5a
--- /dev/null
+++ b/src/cl_gl_api.c
@@ -0,0 +1,153 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Zhigang Gong <zhigang.gong at intel.com>
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#ifdef HAS_EGL
+#include <GL/gl.h>
+#endif
+
+#include "cl_platform_id.h"
+#include "cl_device_id.h" 
+#include "cl_context.h"
+#include "cl_command_queue.h"
+#include "cl_program.h"
+#include "cl_kernel.h"
+#include "cl_mem.h"
+#include "cl_image.h"
+#include "cl_sampler.h"
+#include "cl_alloc.h"
+#include "cl_utils.h"
+
+#include "CL/cl.h"
+#include "CL/cl_gl.h"
+#include "CL/cl_intel.h"
+#include "cl_mem_gl.h"
+
+#define CHECK_GL_CONTEXT(CTX)                             \
+do {                                                      \
+  if (UNLIKELY(CTX->props.gl_type == CL_GL_NOSHARE)) {    \
+    err = CL_INVALID_CONTEXT;                             \
+    goto error;                                           \
+  }                                                       \
+} while (0)
+
+cl_mem
+clCreateFromGLBuffer(cl_context    context,
+                     cl_mem_flags  flags,
+                     GLuint        bufobj,
+                     cl_int *      errcode_ret)
+{
+  cl_mem mem = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+  CHECK_GL_CONTEXT (context);
+
+  mem = cl_mem_new_gl_buffer(context, flags, bufobj, &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem;
+}
+
+cl_mem
+clCreateFromGLTexture2D(cl_context    context,
+                        cl_mem_flags  flags,
+                        GLenum texture_target,
+                        GLint miplevel,
+                        GLuint texture,
+                        cl_int *      errcode_ret)
+{
+  cl_mem mem = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+  CHECK_GL_CONTEXT (context);
+
+  mem = cl_mem_new_gl_texture(context, flags, texture_target, miplevel, texture, &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem;
+}
+
+cl_mem
+clCreateFromGLTexture3D(cl_context    context,
+                        cl_mem_flags  flags,
+                        GLenum texture_target,
+                        GLint miplevel,
+                        GLuint texture,
+                        cl_int *      errcode_ret)
+{
+  cl_mem mem = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+  CHECK_GL_CONTEXT (context);
+
+  mem = cl_mem_new_gl_texture(context, flags, texture_target, miplevel, texture, &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem;
+}
+
+cl_mem
+clCreateFromGLTexture(cl_context      context,
+                      cl_mem_flags    flags,
+                      cl_GLenum       target,
+                      cl_GLint        miplevel,
+                      cl_GLuint       texture,
+                      cl_int *        errcode_ret)
+{
+  cl_mem mem = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT (context);
+  CHECK_GL_CONTEXT (context);
+
+  mem = cl_mem_new_gl_texture(context, flags, target, miplevel, texture, &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem;
+
+}
+
+/* XXX NULL function currently. */
+cl_int clEnqueueAcquireGLObjects (cl_command_queue command_queue,
+                                  cl_uint num_objects,
+                                  const cl_mem *mem_objects,
+                                  cl_uint num_events_in_wait_list,
+                                  const cl_event *event_wait_list,
+                                  cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  return err;
+}
+
+/* XXX NULL function currently. */
+cl_int clEnqueueReleaseGLObjects (cl_command_queue command_queue,
+                                  cl_uint num_objects,
+                                  const cl_mem *mem_objects,
+                                  cl_uint num_events_in_wait_list,
+                                  const cl_event *event_wait_list,
+                                  cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  return err;
+}
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
new file mode 100644
index 0000000..e2fcee3
--- /dev/null
+++ b/src/cl_gt_device.h
@@ -0,0 +1,124 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/* Common fields for all GT devices (IVB / SNB) */
+.device_type = CL_DEVICE_TYPE_GPU,
+.vendor_id = 0, /* == device_id (set when requested) */
+.max_work_item_dimensions = 3,
+.max_1d_global_work_sizes = {1024 * 1024 * 256, 1, 1},
+.max_2d_global_work_sizes = {8192, 8192, 1},
+.max_3d_global_work_sizes = {8192, 8192, 2048},
+.preferred_vector_width_char = 8,
+.preferred_vector_width_short = 8,
+.preferred_vector_width_int = 4,
+.preferred_vector_width_long = 2,
+.preferred_vector_width_float = 4,
+.preferred_vector_width_double = 0,
+.preferred_vector_width_half = 0,
+.native_vector_width_char = 8,
+.native_vector_width_short = 8,
+.native_vector_width_int = 4,
+.native_vector_width_long = 2,
+.native_vector_width_float = 4,
+.native_vector_width_double = 2,
+.native_vector_width_half = 8,
+.preferred_wg_sz_mul = 16,
+.address_bits = 32,
+.max_mem_alloc_size = 256 * 1024 * 1024,
+.image_support = CL_TRUE,
+.max_read_image_args = 128,
+.max_write_image_args = 8,
+.image_max_array_size = 2048,
+.image2d_max_width = 8192,
+.image2d_max_height = 8192,
+.image3d_max_width = 8192,
+.image3d_max_height = 8192,
+.image3d_max_depth = 2048,
+.image_mem_size = 8192,
+.max_samplers = 16,
+.mem_base_addr_align = sizeof(cl_long) * 16 * 8,
+.min_data_type_align_size = sizeof(cl_long) * 16,
+.single_fp_config = 0, /* XXX */
+.double_fp_config = 0,
+.global_mem_cache_type = CL_READ_WRITE_CACHE,
+.global_mem_size = 1024 * 1024 * 1024,
+.max_constant_buffer_size = 512 << 10,
+.max_constant_args = 8,
+.error_correction_support = CL_FALSE,
+.host_unified_memory = CL_FALSE,
+.profiling_timer_resolution = 80, /* ns */
+.endian_little = CL_TRUE,
+.available = CL_TRUE,
+.compiler_available = CL_TRUE,
+.linker_available = CL_TRUE,
+.execution_capabilities = CL_EXEC_KERNEL | CL_EXEC_NATIVE_KERNEL,
+.queue_properties = CL_QUEUE_PROFILING_ENABLE,
+.platform = NULL, /* == intel_platform (set when requested) */
+/* IEEE 754, XXX does IVB support CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT? */
+.single_fp_config = CL_FP_INF_NAN | CL_FP_ROUND_TO_NEAREST , /* IEEE 754. */
+.printf_buffer_size = 1 * 1024 * 1024,
+.interop_user_sync = CL_TRUE,
+
+#define DECL_INFO_STRING(FIELD, STRING) \
+    .FIELD = STRING,                    \
+    .JOIN(FIELD,_sz) = sizeof(STRING),
+DECL_INFO_STRING(name, "Intel HD Graphics Family")
+DECL_INFO_STRING(vendor, "Intel")
+DECL_INFO_STRING(version, LIBCL_VERSION_STRING)
+DECL_INFO_STRING(profile, "FULL_PROFILE")
+DECL_INFO_STRING(opencl_c_version, LIBCL_C_VERSION_STRING)
+DECL_INFO_STRING(extensions, "")
+DECL_INFO_STRING(built_in_kernels, "__cl_copy_region_align4;"
+                                   "__cl_copy_region_align16;"
+                                   "__cl_cpy_region_unalign_same_offset;"
+                                   "__cl_copy_region_unalign_dst_offset;"
+                                   "__cl_copy_region_unalign_src_offset;"
+                                   "__cl_copy_buffer_rect;"
+                                   "__cl_copy_image_1d_to_1d;"
+                                   "__cl_copy_image_2d_to_2d;"
+                                   "__cl_copy_image_3d_to_2d;"
+                                   "__cl_copy_image_2d_to_3d;"
+                                   "__cl_copy_image_3d_to_3d;"
+                                   "__cl_copy_image_2d_to_buffer;"
+                                   "__cl_copy_image_3d_to_buffer;"
+                                   "__cl_copy_buffer_to_image_2d;"
+                                   "__cl_copy_buffer_to_image_3d;"
+                                   "__cl_fill_region_unalign;"
+                                   "__cl_fill_region_align2;"
+                                   "__cl_fill_region_align4;"
+                                   "__cl_fill_region_align8_2;"
+                                   "__cl_fill_region_align8_4;"
+                                   "__cl_fill_region_align8_8;"
+                                   "__cl_fill_region_align8_16;"
+                                   "__cl_fill_region_align128;"
+                                   "__cl_fill_image_1d;"
+                                   "__cl_fill_image_1d_array;"
+                                   "__cl_fill_image_2d;"
+                                   "__cl_fill_image_2d_array;"
+                                   "__cl_fill_image_3d;")
+
+DECL_INFO_STRING(driver_version, LIBCL_DRIVER_VERSION_STRING)
+#undef DECL_INFO_STRING
+.parent_device = NULL,
+.partition_max_sub_device = 1,
+.partition_property = {0},
+.affinity_domain = 0,
+.partition_type = {0},
+.device_reference_count = 1,
+
diff --git a/src/cl_image.c b/src/cl_image.c
new file mode 100644
index 0000000..ced9789
--- /dev/null
+++ b/src/cl_image.c
@@ -0,0 +1,229 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_image.h"
+#include "cl_utils.h"
+#include "intel/intel_defines.h"
+
+#include <assert.h>
+
+LOCAL cl_int
+cl_image_byte_per_pixel(const cl_image_format *fmt, uint32_t *bpp)
+{
+  assert(bpp);
+
+  if(fmt == NULL)
+    return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+
+  const uint32_t type = fmt->image_channel_data_type;
+  const uint32_t order = fmt->image_channel_order;
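+  /* Two steps: the first switch sets *bpp from the channel data type (for the
+     packed 565/555/101010 types this is already the full pixel size), the
+     second switch multiplies by the channel count implied by the order where
+     applicable and validates the order/type combination. */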
+  switch (type) {
+#define DECL_BPP(DATA_TYPE, VALUE) case DATA_TYPE: *bpp = VALUE;
+    DECL_BPP(CL_SNORM_INT8, 1); break;
+    DECL_BPP(CL_SNORM_INT16, 2); break;
+    DECL_BPP(CL_UNORM_INT8, 1); break;
+    DECL_BPP(CL_UNORM_INT16, 2); break;
+    DECL_BPP(CL_UNORM_SHORT_565, 2);
+      if (order != CL_RGBx && order != CL_RGB)
+        return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+    break;
+    DECL_BPP(CL_UNORM_SHORT_555, 2);
+      if (order != CL_RGBx && order != CL_RGB)
+        return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+    break;
+    DECL_BPP(CL_UNORM_INT_101010, 4);
+      if (order != CL_RGBx && order != CL_RGB)
+        return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+    break;
+    DECL_BPP(CL_SIGNED_INT8, 1); break;
+    DECL_BPP(CL_SIGNED_INT16, 2); break;
+    DECL_BPP(CL_SIGNED_INT32, 4); break;
+    DECL_BPP(CL_UNSIGNED_INT8, 1); break;
+    DECL_BPP(CL_UNSIGNED_INT16, 2); break;
+    DECL_BPP(CL_UNSIGNED_INT32, 4); break;
+    DECL_BPP(CL_HALF_FLOAT, 2); break;
+    DECL_BPP(CL_FLOAT, 4); break;
+#undef DECL_BPP
+    default: return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+  };
+
+  switch (order) {
+    case CL_Rx: break;
+    case CL_R: break;
+    case CL_A: break;
+    case CL_RA: *bpp *= 2; break;
+    case CL_RG: *bpp *= 2; break;
+    case CL_INTENSITY:
+    case CL_LUMINANCE:
+      if (type != CL_UNORM_INT8 && type != CL_UNORM_INT16 &&
+          type != CL_SNORM_INT8 && type != CL_SNORM_INT16 &&
+          type != CL_HALF_FLOAT && type != CL_FLOAT)
+        return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+    break;
+    case CL_RGB:
+    case CL_RGBx:
+      if (type != CL_UNORM_SHORT_555 &&
+          type != CL_UNORM_SHORT_565 &&
+          type != CL_UNORM_INT_101010)
+        return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+    break;
+    case CL_RGBA: *bpp *= 4; break;
+    case CL_ARGB:
+    case CL_BGRA:
+      if (type != CL_UNORM_INT8 && type != CL_SIGNED_INT8 &&
+          type != CL_SNORM_INT8 && type != CL_UNSIGNED_INT8)
+        return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+      *bpp *= 4;
+    break;
+    default: return CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+  };
+
+  return CL_SUCCESS;
+}
+
+LOCAL uint32_t
+cl_image_get_intel_format(const cl_image_format *fmt)
+{
+  const uint32_t type = fmt->image_channel_data_type;
+  const uint32_t order = fmt->image_channel_order;
+  switch (order) {
+    case CL_R:
+#if 0
+    case CL_Rx:
+    case CL_A:
+    case CL_INTENSITY:
+    case CL_LUMINANCE:
+      if ((order == CL_INTENSITY || order == CL_LUMINANCE)
+          && (type != CL_UNORM_INT8 && type != CL_UNORM_INT16
+              && type != CL_SNORM_INT8 && type != CL_SNORM_INT16
+              && type != CL_HALF_FLOAT && type != CL_FLOAT))
+        return INTEL_UNSUPPORTED_FORMAT;
+#endif
+
+/* XXX it seems we have an accuracy compatibility issue with snorm_int8/16,
+ * so those formats have to be disabled for now. */
+
+      switch (type) {
+        case CL_HALF_FLOAT:     return I965_SURFACEFORMAT_R16_FLOAT;
+        case CL_FLOAT:          return I965_SURFACEFORMAT_R32_FLOAT;
+//        case CL_SNORM_INT16:    return I965_SURFACEFORMAT_R16_SNORM;
+//        case CL_SNORM_INT8:     return I965_SURFACEFORMAT_R8_SNORM;
+        case CL_UNORM_INT8:     return I965_SURFACEFORMAT_R8_UNORM;
+        case CL_UNORM_INT16:    return I965_SURFACEFORMAT_R16_UNORM;
+        case CL_SIGNED_INT8:    return I965_SURFACEFORMAT_R8_SINT;
+        case CL_SIGNED_INT16:   return I965_SURFACEFORMAT_R16_SINT;
+        case CL_SIGNED_INT32:   return I965_SURFACEFORMAT_R32_SINT;
+        case CL_UNSIGNED_INT8:  return I965_SURFACEFORMAT_R8_UINT;
+        case CL_UNSIGNED_INT16: return I965_SURFACEFORMAT_R16_UINT;
+        case CL_UNSIGNED_INT32: return I965_SURFACEFORMAT_R32_UINT;
+        default: return INTEL_UNSUPPORTED_FORMAT;
+      };
+#if 0
+    case CL_RG:
+    case CL_RA:
+      switch (type) {
+        case CL_HALF_FLOAT:     return I965_SURFACEFORMAT_R16G16_FLOAT;
+        case CL_FLOAT:          return I965_SURFACEFORMAT_R32G32_FLOAT;
+        case CL_SNORM_INT16:    return I965_SURFACEFORMAT_R16G16_SNORM;
+        case CL_SNORM_INT8:     return I965_SURFACEFORMAT_R8G8_SNORM;
+        case CL_UNORM_INT8:     return I965_SURFACEFORMAT_R8G8_UNORM;
+        case CL_UNORM_INT16:    return I965_SURFACEFORMAT_R16G16_UNORM;
+        case CL_SIGNED_INT8:    return I965_SURFACEFORMAT_R8G8_SINT;
+        case CL_SIGNED_INT16:   return I965_SURFACEFORMAT_R16G16_SINT;
+        case CL_SIGNED_INT32:   return I965_SURFACEFORMAT_R32G32_SINT;
+        case CL_UNSIGNED_INT8:  return I965_SURFACEFORMAT_R8G8_UINT;
+        case CL_UNSIGNED_INT16: return I965_SURFACEFORMAT_R16G16_UINT;
+        case CL_UNSIGNED_INT32: return I965_SURFACEFORMAT_R32G32_UINT;
+        default: return INTEL_UNSUPPORTED_FORMAT;
+      };
+    case CL_RGB:
+    case CL_RGBx:
+      switch (type) {
+        case CL_UNORM_INT_101010: return I965_SURFACEFORMAT_R10G10B10A2_UNORM;
+        case CL_UNORM_SHORT_565:
+        case CL_UNORM_SHORT_555:
+        default: return INTEL_UNSUPPORTED_FORMAT;
+      };
+#endif
+    case CL_RGBA:
+      switch (type) {
+        case CL_HALF_FLOAT:     return I965_SURFACEFORMAT_R16G16B16A16_FLOAT;
+        case CL_FLOAT:          return I965_SURFACEFORMAT_R32G32B32A32_FLOAT;
+//        case CL_SNORM_INT16:    return I965_SURFACEFORMAT_R16G16B16A16_SNORM;
+//        case CL_SNORM_INT8:     return I965_SURFACEFORMAT_R8G8B8A8_SNORM;
+        case CL_UNORM_INT8:     return I965_SURFACEFORMAT_R8G8B8A8_UNORM;
+        case CL_UNORM_INT16:    return I965_SURFACEFORMAT_R16G16B16A16_UNORM;
+        case CL_SIGNED_INT8:    return I965_SURFACEFORMAT_R8G8B8A8_SINT;
+        case CL_SIGNED_INT16:   return I965_SURFACEFORMAT_R16G16B16A16_SINT;
+        case CL_SIGNED_INT32:   return I965_SURFACEFORMAT_R32G32B32A32_SINT;
+        case CL_UNSIGNED_INT8:  return I965_SURFACEFORMAT_R8G8B8A8_UINT;
+        case CL_UNSIGNED_INT16: return I965_SURFACEFORMAT_R16G16B16A16_UINT;
+        case CL_UNSIGNED_INT32: return I965_SURFACEFORMAT_R32G32B32A32_UINT;
+        default: return INTEL_UNSUPPORTED_FORMAT;
+      };
+    case CL_ARGB: return INTEL_UNSUPPORTED_FORMAT;
+    case CL_BGRA:
+      switch (type) {
+        case CL_UNORM_INT8:     return I965_SURFACEFORMAT_B8G8R8A8_UNORM;
+        default: return INTEL_UNSUPPORTED_FORMAT;
+      };
+    default: return INTEL_UNSUPPORTED_FORMAT;
+  };
+}
+
+static const uint32_t cl_image_order[] = {
+  CL_R, CL_A, CL_RG, CL_RA, CL_RGB, CL_RGBA, CL_BGRA, CL_ARGB,
+  CL_INTENSITY, CL_LUMINANCE, CL_Rx, CL_RGx, CL_RGBx
+};
+
+static const uint32_t cl_image_type[] = {
+  CL_SNORM_INT8, CL_SNORM_INT16, CL_UNORM_INT8, CL_UNORM_INT16,
+  CL_UNORM_SHORT_565, CL_UNORM_SHORT_555, CL_UNORM_INT_101010,
+  CL_SIGNED_INT8, CL_SIGNED_INT16, CL_SIGNED_INT32,
+  CL_UNSIGNED_INT8, CL_UNSIGNED_INT16, CL_UNSIGNED_INT32,
+  CL_HALF_FLOAT, CL_FLOAT
+};
+
+static const size_t cl_image_order_n = SIZEOF32(cl_image_order);
+static const size_t cl_image_type_n = SIZEOF32(cl_image_type);
+
+cl_int
+cl_image_get_supported_fmt(cl_context ctx,
+                           cl_mem_object_type image_type,
+                           cl_uint num_entries,
+                           cl_image_format *image_formats,
+                           cl_uint *num_image_formats)
+{
+  size_t i, j, n = 0;
+  for (i = 0; i < cl_image_order_n; ++i)
+  for (j = 0; j < cl_image_type_n; ++j) {
+    const cl_image_format fmt = {
+      .image_channel_order = cl_image_order[i],
+      .image_channel_data_type = cl_image_type[j]
+    };
+    const uint32_t intel_fmt = cl_image_get_intel_format(&fmt);
+    if (intel_fmt == INTEL_UNSUPPORTED_FORMAT)
+      continue;
+    if (n < num_entries && image_formats) image_formats[n] = fmt;
+    n++;
+  }
+  if (num_image_formats) *num_image_formats = n;
+  return CL_SUCCESS;
+}
+
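For reference, the table built by cl_image_get_supported_fmt() above is what an
application sees through clGetSupportedImageFormats(); a minimal sketch of the
usual two-pass query (assuming a valid cl_context):

  #include <CL/cl.h>
  #include <stdio.h>
  #include <stdlib.h>

  static void list_image_formats(cl_context ctx)
  {
    cl_uint n = 0;
    /* First pass: ask only for the number of supported formats. */
    clGetSupportedImageFormats(ctx, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D,
                               0, NULL, &n);
    cl_image_format *fmts = malloc(n * sizeof(*fmts));
    /* Second pass: fetch the formats themselves. */
    clGetSupportedImageFormats(ctx, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D,
                               n, fmts, NULL);
    for (cl_uint i = 0; i < n; i++)
      printf("order=0x%x type=0x%x\n", fmts[i].image_channel_order,
             fmts[i].image_channel_data_type);
    free(fmts);
  }
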
diff --git a/src/cl_image.h b/src/cl_image.h
new file mode 100644
index 0000000..86cc76a
--- /dev/null
+++ b/src/cl_image.h
@@ -0,0 +1,44 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_IMAGE_H__
+#define __CL_IMAGE_H__
+
+#include "cl_internals.h"
+#include "CL/cl.h"
+#include <stdint.h>
+
+/* Returned when the OCL format is not supported */
+#define INTEL_UNSUPPORTED_FORMAT ((uint32_t) ~0x0u)
+
+/* Compute the number of bytes per pixel if the format is supported */
+extern cl_int cl_image_byte_per_pixel(const cl_image_format *fmt, uint32_t *bpp);
+
+/* Return the intel format for the given OCL format */
+extern uint32_t cl_image_get_intel_format(const cl_image_format *fmt);
+
+/* Return the list of formats supported by the API */
+extern cl_int cl_image_get_supported_fmt(cl_context context,
+                                         cl_mem_object_type image_type,
+                                         cl_uint num_entries,
+                                         cl_image_format *image_formats,
+                                         cl_uint *num_image_formats);
+
+#endif /* __CL_IMAGE_H__ */
+
diff --git a/src/cl_internals.h b/src/cl_internals.h
new file mode 100644
index 0000000..693de1d
--- /dev/null
+++ b/src/cl_internals.h
@@ -0,0 +1,36 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_INTERNALS_H__
+#define __CL_INTERNALS_H__
+
+/* We put a magic header on each object so it can be identified. This makes the
+ * programmer's life easier when objects are wrongly used in the API
+ */
+#define CL_MAGIC_KERNEL_HEADER    0x1234567890abcdefLL
+#define CL_MAGIC_CONTEXT_HEADER   0x0ab123456789cdefLL
+#define CL_MAGIC_PROGRAM_HEADER   0x34560ab12789cdefLL
+#define CL_MAGIC_QUEUE_HEADER     0x83650a12b79ce4dfLL
+#define CL_MAGIC_SAMPLER_HEADER   0x686a0ecba79ce33fLL
+#define CL_MAGIC_EVENT_HEADER     0x8324a9c810ebf90fLL
+#define CL_MAGIC_MEM_HEADER       0x381a27b9ce6504dfLL
+#define CL_MAGIC_DEAD_HEADER      0xdeaddeaddeaddeadLL
+
+#endif /* __CL_INTERNALS_H__ */
+
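The magic headers above are meant to be checked before an object is
dereferenced. A purely illustrative sketch of that pattern (the CHECK_KERNEL
name is hypothetical, not taken from this tree):

  /* Reject a stale or foreign handle before using it. */
  #define CHECK_KERNEL(K)                                        \
    do {                                                         \
      if ((K) == NULL || (K)->magic != CL_MAGIC_KERNEL_HEADER)   \
        return CL_INVALID_KERNEL;                                \
    } while (0)
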
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
new file mode 100644
index 0000000..55b707a
--- /dev/null
+++ b/src/cl_kernel.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_kernel.h"
+#include "cl_program.h"
+#include "cl_device_id.h"
+#include "cl_context.h"
+#include "cl_mem.h"
+#include "cl_alloc.h"
+#include "cl_utils.h"
+#include "cl_khr_icd.h"
+#include "CL/cl.h"
+#include "cl_sampler.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+
+LOCAL void
+cl_kernel_delete(cl_kernel k)
+{
+  uint32_t i;
+  if (k == NULL) return;
+
+  /* We are not done with the kernel */
+  if (atomic_dec(&k->ref_n) > 1) return;
+  /* Release one reference on all bos we own */
+  if (k->bo)       cl_buffer_unreference(k->bo);
+  /* This will be true for kernels created by clCreateKernel */
+  if (k->ref_its_program) cl_program_delete(k->program);
+  /* Release the curbe if allocated */
+  if (k->curbe) cl_free(k->curbe);
+  /* Release the argument array if required */
+  if (k->args) {
+    for (i = 0; i < k->arg_n; ++i)
+      if (k->args[i].mem != NULL)
+        cl_mem_delete(k->args[i].mem);
+    cl_free(k->args);
+  }
+  if (k->image_sz)
+    cl_free(k->images);
+  k->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
+  cl_free(k);
+}
+
+LOCAL cl_kernel
+cl_kernel_new(cl_program p)
+{
+  cl_kernel k = NULL;
+  TRY_ALLOC_NO_ERR (k, CALLOC(struct _cl_kernel));
+  SET_ICD(k->dispatch)
+  k->ref_n = 1;
+  k->magic = CL_MAGIC_KERNEL_HEADER;
+  k->program = p;
+
+exit:
+  return k;
+error:
+  cl_kernel_delete(k);
+  k = NULL;
+  goto exit;
+}
+
+LOCAL const char*
+cl_kernel_get_name(cl_kernel k)
+{
+  if (UNLIKELY(k == NULL)) return NULL;
+  return interp_kernel_get_name(k->opaque);
+}
+
+LOCAL const char*
+cl_kernel_get_attributes(cl_kernel k)
+{
+  if (UNLIKELY(k == NULL)) return NULL;
+  return interp_kernel_get_attributes(k->opaque);
+}
+
+LOCAL void
+cl_kernel_add_ref(cl_kernel k)
+{
+  atomic_inc(&k->ref_n);
+}
+
+LOCAL cl_int
+cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
+{
+  uint32_t offset;            /* where to patch */
+  enum gbe_arg_type arg_type; /* kind of argument */
+  size_t arg_sz;              /* size of the argument */
+  cl_mem mem = NULL;          /* for __global, __constant and image arguments */
+  cl_context ctx = k->program->ctx;
+
+  if (UNLIKELY(index >= k->arg_n))
+    return CL_INVALID_ARG_INDEX;
+  arg_type = interp_kernel_get_arg_type(k->opaque, index);
+  arg_sz = interp_kernel_get_arg_size(k->opaque, index);
+
+  if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz)) {
+    if (arg_sz == 2 && arg_type == GBE_ARG_VALUE && sz == sizeof(cl_sampler)) {
+      /* FIXME: this is a workaround for the case where a kernel arg
+         defines a sampler_t but does not use it. */
+      arg_type = GBE_ARG_SAMPLER;
+    } else
+      return CL_INVALID_ARG_SIZE;
+  }
+
+  if(UNLIKELY(arg_type == GBE_ARG_LOCAL_PTR && sz == 0))
+    return CL_INVALID_ARG_SIZE;
+  if(arg_type == GBE_ARG_VALUE) {
+    if(UNLIKELY(value == NULL))
+      return CL_INVALID_ARG_VALUE;
+  } else if(arg_type == GBE_ARG_LOCAL_PTR) {
+    if(UNLIKELY(value != NULL))
+      return CL_INVALID_ARG_VALUE;
+  } else if(arg_type == GBE_ARG_SAMPLER) {
+    if (UNLIKELY(value == NULL))
+      return CL_INVALID_ARG_VALUE;
+
+    cl_sampler s = *(cl_sampler*)value;
+    if(s->magic != CL_MAGIC_SAMPLER_HEADER)
+      return CL_INVALID_SAMPLER;
+  } else {
+    // should be image, GLOBAL_PTR, CONSTANT_PTR
+    if (UNLIKELY(value == NULL && arg_type == GBE_ARG_IMAGE))
+      return CL_INVALID_ARG_VALUE;
+    if(value != NULL)
+      mem = *(cl_mem*)value;
+    if(value != NULL && mem) {
+      if( CL_SUCCESS != is_valid_mem(mem, ctx->buffers))
+        return CL_INVALID_MEM_OBJECT;
+
+      if (UNLIKELY((arg_type == GBE_ARG_IMAGE && !IS_IMAGE(mem))
+         || (arg_type != GBE_ARG_IMAGE && IS_IMAGE(mem))))
+          return CL_INVALID_ARG_VALUE;
+    }
+  }
+
+  /* Copy the structure or the value directly into the curbe */
+  if (arg_type == GBE_ARG_VALUE) {
+    offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+    assert(offset + sz <= k->curbe_sz);
+    memcpy(k->curbe + offset, value, sz);
+    k->args[index].local_sz = 0;
+    k->args[index].is_set = 1;
+    k->args[index].mem = NULL;
+    return CL_SUCCESS;
+  }
+
+  /* For a local pointer just save the size */
+  if (arg_type == GBE_ARG_LOCAL_PTR) {
+    k->args[index].local_sz = sz;
+    k->args[index].is_set = 1;
+    k->args[index].mem = NULL;
+    return CL_SUCCESS;
+  }
+
+  /* Is it a sampler? */
+  if (arg_type == GBE_ARG_SAMPLER) {
+    cl_sampler sampler;
+    memcpy(&sampler, value, sz);
+    k->args[index].local_sz = 0;
+    k->args[index].is_set = 1;
+    k->args[index].mem = NULL;
+    k->args[index].sampler = sampler;
+    cl_set_sampler_arg_slot(k, index, sampler);
+    offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+    assert(offset + 2 <= k->curbe_sz);
+    memcpy(k->curbe + offset, &sampler->clkSamplerValue, 2);
+    return CL_SUCCESS;
+  }
+
+  if(value != NULL)
+    mem = *(cl_mem*) value;
+
+  if(value == NULL || mem == NULL) {
+    /* for buffer object GLOBAL_PTR CONSTANT_PTR, it maybe NULL */
+    int32_t offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+    *((uint32_t *)(k->curbe + offset)) = 0;
+    assert(arg_type == GBE_ARG_GLOBAL_PTR || arg_type == GBE_ARG_CONSTANT_PTR);
+
+    if (k->args[index].mem)
+      cl_mem_delete(k->args[index].mem);
+    k->args[index].mem = NULL;
+    k->args[index].is_set = 1;
+    k->args[index].local_sz = 0;
+    return CL_SUCCESS;
+  }
+
+  mem = *(cl_mem*) value;
+
+  cl_mem_add_ref(mem);
+  if (k->args[index].mem)
+    cl_mem_delete(k->args[index].mem);
+  k->args[index].mem = mem;
+  k->args[index].is_set = 1;
+  k->args[index].local_sz = 0;
+  k->args[index].bti = interp_kernel_get_arg_bti(k->opaque, index);
+  return CL_SUCCESS;
+}
+
+LOCAL int
+cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_name,
+                       size_t param_value_size, void *param_value, size_t *param_value_size_ret)
+{
+  assert(k != NULL);
+  void *ret_info = interp_kernel_get_arg_info(k->opaque, arg_index,
+                           param_name - CL_KERNEL_ARG_ADDRESS_QUALIFIER);
+  int str_len = 0;
+  cl_kernel_arg_type_qualifier type_qual = CL_KERNEL_ARG_TYPE_NONE;
+
+  switch (param_name) {
+  case CL_KERNEL_ARG_ADDRESS_QUALIFIER:
+    if (param_value_size < sizeof(cl_kernel_arg_address_qualifier))
+      return CL_INVALID_VALUE;
+    if (param_value_size_ret)
+      *param_value_size_ret = sizeof(cl_kernel_arg_address_qualifier);
+    if (!param_value) return CL_SUCCESS;
+    if ((cl_ulong)ret_info == 0) {
+      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_PRIVATE;
+    } else if ((cl_ulong)ret_info == 1 || (cl_ulong)ret_info == 4) {
+      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_GLOBAL;
+    } else if ((cl_ulong)ret_info == 2) {
+      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_CONSTANT;
+    } else if ((cl_ulong)ret_info == 3) {
+      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_LOCAL;
+    } else {
+      /* If no address qualifier is specified, the default address qualifier
+         which is CL_KERNEL_ARG_ADDRESS_PRIVATE is returned. */
+      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_PRIVATE;
+    }
+    return CL_SUCCESS;
+
+  case CL_KERNEL_ARG_ACCESS_QUALIFIER:
+    if (param_value_size < sizeof(cl_kernel_arg_access_qualifier))
+      return CL_INVALID_VALUE;
+    if (param_value_size_ret)
+      *param_value_size_ret = sizeof(cl_kernel_arg_access_qualifier);
+    if (!param_value) return CL_SUCCESS;
+    if (!strcmp((char*)ret_info, "write_only")) {
+      *(cl_kernel_arg_access_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
+    } else if (!strcmp((char*)ret_info, "read_only")) {
+      *(cl_kernel_arg_access_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_READ_ONLY;
+    } else if (!strcmp((char*)ret_info, "read_write")) {
+      *(cl_kernel_arg_access_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_READ_WRITE;
+    } else {
+      *(cl_kernel_arg_access_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_NONE;
+    }
+    return CL_SUCCESS;
+
+  case CL_KERNEL_ARG_TYPE_NAME:
+  case CL_KERNEL_ARG_NAME:
+    str_len = strlen(ret_info);
+    if (param_value_size < str_len + 1)
+      return CL_INVALID_VALUE;
+    if (param_value_size_ret)
+      *param_value_size_ret = str_len + 1;
+    if (!param_value) return CL_SUCCESS;
+    memcpy(param_value, ret_info, str_len);
+    ((char *)param_value)[str_len] = 0;
+    return CL_SUCCESS;
+
+  case CL_KERNEL_ARG_TYPE_QUALIFIER:
+    if (param_value_size < sizeof(cl_kernel_arg_type_qualifier))
+      return CL_INVALID_VALUE;
+    if (param_value_size_ret)
+      *param_value_size_ret = sizeof(cl_kernel_arg_type_qualifier);
+    if (!param_value) return CL_SUCCESS;
+    if (strstr((char*)ret_info, "const"))
+      type_qual = type_qual | CL_KERNEL_ARG_TYPE_CONST;
+    if (strstr((char*)ret_info, "volatile"))
+      type_qual = type_qual | CL_KERNEL_ARG_TYPE_VOLATILE;
+    if (strstr((char*)ret_info, "restrict"))
+      type_qual = type_qual | CL_KERNEL_ARG_TYPE_RESTRICT;
+    *(cl_kernel_arg_type_qualifier *)param_value = type_qual;
+    return CL_SUCCESS;
+
+  default:
+    assert(0);
+  }
+
+  return CL_SUCCESS;
+}
+
+LOCAL uint32_t
+cl_kernel_get_simd_width(cl_kernel k)
+{
+  assert(k != NULL);
+  return interp_kernel_get_simd_width(k->opaque);
+}
+
+LOCAL void
+cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
+{
+  cl_context ctx = k->program->ctx;
+  cl_buffer_mgr bufmgr = cl_context_get_bufmgr(ctx);
+
+  if(k->bo != NULL)
+    cl_buffer_unreference(k->bo);
+
+  /* Allocate the gen code here */
+  const uint32_t code_sz = interp_kernel_get_code_size(opaque);
+  const char *code = interp_kernel_get_code(opaque);
+  k->bo = cl_buffer_alloc(bufmgr, "CL kernel", code_sz, 64u);
+  k->arg_n = interp_kernel_get_arg_num(opaque);
+
+  /* Upload the code */
+  cl_buffer_subdata(k->bo, 0, code_sz, code);
+  k->opaque = opaque;
+
+  /* Create the curbe */
+  k->curbe_sz = interp_kernel_get_curbe_size(k->opaque);
+
+  /* Get sampler data & size */
+  k->sampler_sz = interp_kernel_get_sampler_size(k->opaque);
+  assert(k->sampler_sz <= GEN_MAX_SAMPLERS);
+  if (k->sampler_sz > 0)
+    interp_kernel_get_sampler_data(k->opaque, k->samplers);
+  interp_kernel_get_compile_wg_size(k->opaque, k->compile_wg_sz);
+  k->stack_size = interp_kernel_get_stack_size(k->opaque);
+  /* Get image data & size */
+  k->image_sz = interp_kernel_get_image_size(k->opaque);
+  assert(k->image_sz <= GEN_MAX_SURFACES);
+  if (k->image_sz > 0) {
+    TRY_ALLOC_NO_ERR(k->images, cl_calloc(k->image_sz, sizeof(k->images[0])));
+    interp_kernel_get_image_data(k->opaque, k->images);
+  } else
+    k->images = NULL;
+  return;
+error:
+  cl_buffer_unreference(k->bo);
+  k->bo = NULL;
+}
+
+LOCAL cl_kernel
+cl_kernel_dup(cl_kernel from)
+{
+  cl_kernel to = NULL;
+
+  if (UNLIKELY(from == NULL))
+    return NULL;
+  TRY_ALLOC_NO_ERR (to, CALLOC(struct _cl_kernel));
+  SET_ICD(to->dispatch)
+  to->bo = from->bo;
+  to->opaque = from->opaque;
+  to->ref_n = 1;
+  to->magic = CL_MAGIC_KERNEL_HEADER;
+  to->program = from->program;
+  to->arg_n = from->arg_n;
+  to->curbe_sz = from->curbe_sz;
+  to->sampler_sz = from->sampler_sz;
+  to->image_sz = from->image_sz;
+  memcpy(to->compile_wg_sz, from->compile_wg_sz, sizeof(from->compile_wg_sz));
+  to->stack_size = from->stack_size;
+  if (to->sampler_sz)
+    memcpy(to->samplers, from->samplers, to->sampler_sz * sizeof(uint32_t));
+  if (to->image_sz) {
+    TRY_ALLOC_NO_ERR(to->images, cl_calloc(to->image_sz, sizeof(to->images[0])));
+    memcpy(to->images, from->images, to->image_sz * sizeof(to->images[0]));
+  } else
+    to->images = NULL;
+  TRY_ALLOC_NO_ERR(to->args, cl_calloc(to->arg_n, sizeof(cl_argument)));
+  if (to->curbe_sz) TRY_ALLOC_NO_ERR(to->curbe, cl_calloc(1, to->curbe_sz));
+
+  /* Retain the bos */
+  if (from->bo)       cl_buffer_reference(from->bo);
+
+  /* We retain the program destruction since this kernel (user allocated)
+   * depends on the program for some of its pointers
+   */
+  assert(from->program);
+  cl_program_add_ref(from->program);
+  to->ref_its_program = CL_TRUE;
+
+exit:
+  return to;
+error:
+  cl_kernel_delete(to);
+  to = NULL;
+  goto exit;
+}
+
+LOCAL cl_int
+cl_kernel_work_group_sz(cl_kernel ker,
+                        const size_t *local_wk_sz,
+                        uint32_t wk_dim,
+                        size_t *wk_grp_sz)
+{
+  cl_int err = CL_SUCCESS;
+  size_t sz = 0;
+  cl_uint i;
+
+  for (i = 0; i < wk_dim; ++i) {
+    const uint32_t required_sz = interp_kernel_get_required_work_group_size(ker->opaque, i);
+    if (required_sz != 0 && required_sz != local_wk_sz[i]) {
+      err = CL_INVALID_WORK_ITEM_SIZE;
+      goto error;
+    }
+  }
+  sz = local_wk_sz[0];
+  for (i = 1; i < wk_dim; ++i)
+    sz *= local_wk_sz[i];
+
+  if (sz > cl_get_kernel_max_wg_sz(ker)) {
+    err = CL_INVALID_WORK_ITEM_SIZE;
+    goto error;
+  }
+
+error:
+  if (wk_grp_sz) *wk_grp_sz = sz;
+  return err;
+}
+
+
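cl_kernel_set_arg() is the driver-side half of clSetKernelArg(); a minimal
sketch of how the argument kinds it distinguishes are set from application
code (kern, buf and smp stand for valid objects of a matching kernel):

  #include <CL/cl.h>

  static void set_args(cl_kernel kern, cl_mem buf, cl_sampler smp)
  {
    cl_int v = 42;
    clSetKernelArg(kern, 0, sizeof(cl_int), &v);        /* plain value        */
    clSetKernelArg(kern, 1, sizeof(cl_mem), &buf);      /* __global buffer    */
    clSetKernelArg(kern, 2, 1024, NULL);                /* __local: size only */
    clSetKernelArg(kern, 3, sizeof(cl_sampler), &smp);  /* sampler_t          */
  }
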
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
new file mode 100644
index 0000000..1ed90a5
--- /dev/null
+++ b/src/cl_kernel.h
@@ -0,0 +1,116 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_KERNEL_H__
+#define __CL_KERNEL_H__
+
+#include "cl_internals.h"
+#include "cl_driver.h"
+#include "cl_gbe_loader.h"
+#include "CL/cl.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/* This is the kernel as it is interfaced by the compiler */
+struct _gbe_kernel;
+
+/* We need to save buffer data for relocation and binding, and we must figure
+ * out whether all arguments are properly set
+ */
+typedef struct cl_argument {
+  cl_mem mem;           /* For image and regular buffers */
+  cl_sampler sampler;   /* For sampler. */
+  unsigned char bti;
+  uint32_t local_sz:31; /* For __local size specification */
+  uint32_t is_set:1;    /* All args must be set before NDRange */
+} cl_argument;
+
+/* One OCL function */
+struct _cl_kernel {
+  DEFINE_ICD(dispatch)
+  uint64_t magic;             /* To identify it as a kernel */
+  volatile int ref_n;         /* We reference count this object */
+  cl_buffer bo;               /* The code itself */
+  cl_program program;         /* Owns this structure (and pointers) */
+  gbe_kernel opaque;          /* (Opaque) compiler structure for the OCL kernel */
+  char *curbe;                /* One curbe per kernel */
+  size_t curbe_sz;            /* Size of it */
+  uint32_t samplers[GEN_MAX_SAMPLERS]; /* samplers defined in kernel & kernel args */
+  size_t sampler_sz;          /* sampler size defined in kernel & kernel args. */
+  struct ImageInfo *images;   /* images defined in kernel args */
+  size_t image_sz;            /* image count in kernel args */
+  cl_ulong local_mem_sz;      /* local memory size specified in kernel args. */
+  size_t compile_wg_sz[3];    /* Required workgroup size given by the
+                                 __attribute__((reqd_work_group_size(X, Y, Z))) qualifier. */
+  size_t global_work_sz[3];   /* Maximum global size that can be used to execute a kernel
+                                 (i.e. the global_work_size argument to clEnqueueNDRangeKernel). */
+  size_t stack_size;          /* stack size per work item. */
+  cl_argument *args;          /* To track argument setting */
+  uint32_t arg_n:31;          /* Number of arguments */
+  uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
+};
+
+/* Allocate an empty kernel */
+extern cl_kernel cl_kernel_new(cl_program);
+
+/* Destroy and deallocate an empty kernel */
+extern void cl_kernel_delete(cl_kernel);
+
+/* Setup the kernel with the given GBE Kernel */
+extern void cl_kernel_setup(cl_kernel k, gbe_kernel opaque);
+
+/* Get the kernel name */
+extern const char *cl_kernel_get_name(cl_kernel k);
+
+/* Get the kernel attributes*/
+extern const char *cl_kernel_get_attributes(cl_kernel k);
+
+/* Get the simd width as used in the code */
+extern uint32_t cl_kernel_get_simd_width(cl_kernel k);
+
+/* When a kernel is created from outside, we just duplicate the structure we
+ * have internally and give it back to the user
+ */
+extern cl_kernel cl_kernel_dup(cl_kernel);
+
+/* Add one more reference on the kernel object */
+extern void cl_kernel_add_ref(cl_kernel);
+
+/* Set the argument before kernel execution */
+extern int cl_kernel_set_arg(cl_kernel,
+                             uint32_t    arg_index,
+                             size_t      arg_size,
+                             const void *arg_value);
+
+/* Get the argument information */
+extern int cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index,
+                                  cl_kernel_arg_info param_name,
+                                  size_t param_value_size, void *param_value,
+                                  size_t *param_value_size_ret);
+
+/* Compute and check the work group size from the user provided local size */
+extern cl_int
+cl_kernel_work_group_sz(cl_kernel ker,
+                        const size_t *local_wk_sz,
+                        cl_uint wk_dim,
+                        size_t *wk_grp_sz);
+
+#endif /* __CL_KERNEL_H__ */
+
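cl_get_kernel_arg_info() declared above backs the OpenCL 1.2
clGetKernelArgInfo() entry point. A short usage sketch (kern stands for a
valid kernel; depending on the implementation the program may need to be
built with -cl-kernel-arg-info):

  #include <CL/cl.h>

  static void dump_arg0(cl_kernel kern)
  {
    cl_kernel_arg_address_qualifier addr;
    char type_name[64];
    clGetKernelArgInfo(kern, 0, CL_KERNEL_ARG_ADDRESS_QUALIFIER,
                       sizeof(addr), &addr, NULL);
    clGetKernelArgInfo(kern, 0, CL_KERNEL_ARG_TYPE_NAME,
                       sizeof(type_name), type_name, NULL);
  }
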
diff --git a/src/cl_khr_icd.c b/src/cl_khr_icd.c
new file mode 100644
index 0000000..50a0898
--- /dev/null
+++ b/src/cl_khr_icd.c
@@ -0,0 +1,174 @@
+/* 
+ * Copyright © 2013 Simon Richter
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <ocl_icd.h>
+
+#include "cl_platform_id.h"
+
+/* The interop functions are not implemented in Beignet */
+#define CL_GL_INTEROP(x) NULL
+/* OpenCL 1.2 is not implemented in Beignet */
+#define CL_1_2_NOTYET(x) NULL
+
+/** Return platform list through ICD interface
+ * This code is used only if a client is linked directly against the library
+ * instead of using the ICD loader. In this case, no other implementations
+ * should exist in the process address space, so the call is equivalent to
+ * clGetPlatformIDs().
+ *
+ * @param[in]   num_entries     Number of entries allocated in return buffer
+ * @param[out]  platforms       Platform identifiers supported by this implementation
+ * @param[out]  num_platforms   Number of platform identifiers returned
+ * @return      OpenCL error code
+ * @retval      CL_SUCCESS                      Successful execution
+ * @retval      CL_PLATFORM_NOT_FOUND_KHR       No platforms provided
+ * @retval      CL_INVALID_VALUE                Invalid parameters
+ */
+cl_int
+clIcdGetPlatformIDsKHR(cl_uint          num_entries,
+                 cl_platform_id * platforms,
+                 cl_uint *        num_platforms)
+{
+  return clGetPlatformIDs(num_entries, platforms, num_platforms);
+}
+
+struct _cl_icd_dispatch const cl_khr_icd_dispatch = {
+  clGetPlatformIDs,
+  clGetPlatformInfo,
+  clGetDeviceIDs,
+  clGetDeviceInfo,
+  clCreateContext,
+  clCreateContextFromType,
+  clRetainContext,
+  clReleaseContext,
+  clGetContextInfo,
+  clCreateCommandQueue,
+  clRetainCommandQueue,
+  clReleaseCommandQueue,
+  clGetCommandQueueInfo,
+  (void *) NULL, /* clSetCommandQueueProperty */
+  clCreateBuffer,
+  clCreateImage2D,
+  clCreateImage3D,
+  clRetainMemObject,
+  clReleaseMemObject,
+  clGetSupportedImageFormats,
+  clGetMemObjectInfo,
+  clGetImageInfo,
+  clCreateSampler,
+  clRetainSampler,
+  clReleaseSampler,
+  clGetSamplerInfo,
+  clCreateProgramWithSource,
+  clCreateProgramWithBinary,
+  clRetainProgram,
+  clReleaseProgram,
+  clBuildProgram,
+  clUnloadCompiler,
+  clGetProgramInfo,
+  clGetProgramBuildInfo,
+  clCreateKernel,
+  clCreateKernelsInProgram,
+  clRetainKernel,
+  clReleaseKernel,
+  clSetKernelArg,
+  clGetKernelInfo,
+  clGetKernelWorkGroupInfo,
+  clWaitForEvents,
+  clGetEventInfo,
+  clRetainEvent,
+  clReleaseEvent,
+  clGetEventProfilingInfo,
+  clFlush,
+  clFinish,
+  clEnqueueReadBuffer,
+  clEnqueueWriteBuffer,
+  clEnqueueCopyBuffer,
+  clEnqueueReadImage,
+  clEnqueueWriteImage,
+  clEnqueueCopyImage,
+  clEnqueueCopyImageToBuffer,
+  clEnqueueCopyBufferToImage,
+  clEnqueueMapBuffer,
+  clEnqueueMapImage,
+  clEnqueueUnmapMemObject,
+  clEnqueueNDRangeKernel,
+  clEnqueueTask,
+  clEnqueueNativeKernel,
+  clEnqueueMarker,
+  clEnqueueWaitForEvents,
+  clEnqueueBarrier,
+  clGetExtensionFunctionAddress,
+  CL_GL_INTEROP(clCreateFromGLBuffer),
+  CL_GL_INTEROP(clCreateFromGLTexture2D),
+  CL_GL_INTEROP(clCreateFromGLTexture3D),
+  CL_GL_INTEROP(clCreateFromGLRenderbuffer),
+  CL_GL_INTEROP(clGetGLObjectInfo),
+  CL_GL_INTEROP(clGetGLTextureInfo),
+  CL_GL_INTEROP(clEnqueueAcquireGLObjects),
+  CL_GL_INTEROP(clEnqueueReleaseGLObjects),
+  CL_GL_INTEROP(clGetGLContextInfoKHR),
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  clSetEventCallback,
+  clCreateSubBuffer,
+  clSetMemObjectDestructorCallback,
+  clCreateUserEvent,
+  clSetUserEventStatus,
+  clEnqueueReadBufferRect,
+  clEnqueueWriteBufferRect,
+  clEnqueueCopyBufferRect,
+  CL_1_2_NOTYET(clCreateSubDevicesEXT),
+  CL_1_2_NOTYET(clRetainDeviceEXT),
+  CL_1_2_NOTYET(clReleaseDeviceEXT),
+#ifdef CL_VERSION_1_2
+  (void *) NULL,
+  clCreateSubDevices,
+  clRetainDevice,
+  clReleaseDevice,
+  clCreateImage,
+  clCreateProgramWithBuiltInKernels,
+  clCompileProgram,
+  clLinkProgram,
+  clUnloadPlatformCompiler,
+  clGetKernelArgInfo,
+  clEnqueueFillBuffer,
+  clEnqueueFillImage,
+  clEnqueueMigrateMemObjects,
+  clEnqueueMarkerWithWaitList,
+  clEnqueueBarrierWithWaitList,
+  clGetExtensionFunctionAddressForPlatform,
+  CL_GL_INTEROP(clCreateFromGLTexture),
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL,
+  (void *) NULL
+#endif
+};
+
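clIcdGetPlatformIDsKHR() is the hook an ICD loader resolves through
clGetExtensionFunctionAddress() to enumerate this vendor's platforms. A rough
sketch of that handshake (error handling omitted):

  #include <CL/cl.h>

  typedef cl_int (*icd_get_platforms_fn)(cl_uint, cl_platform_id *, cl_uint *);

  static cl_uint count_vendor_platforms(void)
  {
    icd_get_platforms_fn fn = (icd_get_platforms_fn)
        clGetExtensionFunctionAddress("clIcdGetPlatformIDsKHR");
    cl_uint n = 0;
    if (fn)
      fn(0, NULL, &n);  /* query only the number of platforms */
    return n;
  }
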
diff --git a/src/cl_khr_icd.h b/src/cl_khr_icd.h
new file mode 100644
index 0000000..1e206b4
--- /dev/null
+++ b/src/cl_khr_icd.h
@@ -0,0 +1,34 @@
+/* 
+ * Copyright © 2013 Simon Richter
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __CL_KHR_ICD_H__
+#define __CL_KHR_ICD_H__
+
+#ifdef HAS_OCLIcd
+
+#define SET_ICD(dispatch) \
+  dispatch = &cl_khr_icd_dispatch;
+#define INIT_ICD(member)  .member = &cl_khr_icd_dispatch,
+#define DEFINE_ICD(member) struct _cl_icd_dispatch const *member;
+
+extern struct _cl_icd_dispatch const cl_khr_icd_dispatch;
+#else
+#define SET_ICD(dispatch)
+#define INIT_ICD(member)
+#define DEFINE_ICD(member)
+#endif
+
+#endif
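For context: the cl_khr_icd extension expects the dispatch-table pointer to be
the first member of every object, which is why DEFINE_ICD(dispatch) opens the
object structures in this tree. A rough sketch of what the macros amount to
when HAS_OCLIcd is defined (the struct name here is illustrative only):

  struct _cl_object_example {
    struct _cl_icd_dispatch const *dispatch;  /* must stay the first member */
    /* ... the rest of the object ... */
  };

  /* SET_ICD(obj->dispatch) points it at cl_khr_icd_dispatch, so an ICD
   * loader can reach the right implementation through any handle, e.g.
   * handle->dispatch->clReleaseKernel(handle). */
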
diff --git a/src/cl_mem.c b/src/cl_mem.c
new file mode 100644
index 0000000..81c4d64
--- /dev/null
+++ b/src/cl_mem.c
@@ -0,0 +1,1903 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_mem.h"
+#include "cl_image.h"
+#include "cl_context.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_device_id.h"
+#include "cl_driver.h"
+#include "cl_khr_icd.h"
+#include "cl_kernel.h"
+#include "cl_command_queue.h"
+
+#include "CL/cl.h"
+#include "CL/cl_intel.h"
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#define FIELD_SIZE(CASE,TYPE)               \
+  case JOIN(CL_,CASE):                      \
+    if(param_value_size_ret)                \
+      *param_value_size_ret = sizeof(TYPE); \
+    if(!param_value)                        \
+      return CL_SUCCESS;                    \
+    if(param_value_size < sizeof(TYPE))     \
+      return CL_INVALID_VALUE;              \
+    break;
+
+#define MAX_TILING_SIZE                             128 * MB
+
+static cl_mem_object_type
+cl_get_mem_object_type(cl_mem mem)
+{
+  switch (mem->type) {
+    case CL_MEM_BUFFER_TYPE:
+    case CL_MEM_SUBBUFFER_TYPE:
+      return CL_MEM_OBJECT_BUFFER;
+    case CL_MEM_IMAGE_TYPE:
+    case CL_MEM_GL_IMAGE_TYPE:
+    {
+      struct _cl_mem_image *image = cl_mem_image(mem);
+      return image->image_type;
+    }
+    default:
+      return CL_MEM_OBJECT_BUFFER;
+  }
+}
+
+LOCAL cl_int
+cl_get_mem_object_info(cl_mem mem,
+                cl_mem_info param_name,
+                size_t param_value_size,
+                void *param_value,
+                size_t *param_value_size_ret)
+{
+  switch(param_name)
+  {
+    FIELD_SIZE(MEM_TYPE, cl_mem_object_type);
+    FIELD_SIZE(MEM_FLAGS, cl_mem_flags);
+    FIELD_SIZE(MEM_SIZE, size_t);
+    FIELD_SIZE(MEM_HOST_PTR, void *);
+    FIELD_SIZE(MEM_MAP_COUNT, cl_uint);
+    FIELD_SIZE(MEM_REFERENCE_COUNT, cl_uint);
+    FIELD_SIZE(MEM_CONTEXT, cl_context);
+    FIELD_SIZE(MEM_ASSOCIATED_MEMOBJECT, cl_mem);
+    FIELD_SIZE(MEM_OFFSET, size_t);
+  default:
+    return CL_INVALID_VALUE;
+  }
+
+  switch(param_name)
+  {
+  case CL_MEM_TYPE:
+    *((cl_mem_object_type *)param_value) = cl_get_mem_object_type(mem);
+    break;
+  case CL_MEM_FLAGS:
+    *((cl_mem_flags *)param_value) = mem->flags;
+    break;
+  case CL_MEM_SIZE:
+    *((size_t *)param_value) = mem->size;
+    break;
+  case CL_MEM_HOST_PTR:
+    if(mem->type == CL_MEM_IMAGE_TYPE) {
+      *((size_t *)param_value) = (size_t)mem->host_ptr;
+    } else {
+      struct _cl_mem_buffer* buf = (struct _cl_mem_buffer*)mem;
+      *((size_t *)param_value) = (size_t)mem->host_ptr + buf->sub_offset;
+    }
+    break;
+  case CL_MEM_MAP_COUNT:
+    *((cl_uint *)param_value) = mem->map_ref;
+    break;
+  case CL_MEM_REFERENCE_COUNT:
+    *((cl_uint *)param_value) = mem->ref_n;
+    break;
+  case CL_MEM_CONTEXT:
+    *((cl_context *)param_value) = mem->ctx;
+    break;
+  case CL_MEM_ASSOCIATED_MEMOBJECT:
+    if(mem->type != CL_MEM_SUBBUFFER_TYPE) {
+      *((cl_mem *)param_value) = NULL;
+    } else {
+      struct _cl_mem_buffer* buf = (struct _cl_mem_buffer*)mem;
+      *((cl_mem *)param_value) = (cl_mem)(buf->parent);
+    }
+    break;
+  case CL_MEM_OFFSET:
+    if(mem->type != CL_MEM_SUBBUFFER_TYPE) {
+      *((size_t *)param_value) = 0;
+    } else {
+      struct _cl_mem_buffer* buf = (struct _cl_mem_buffer*)mem;
+      *((size_t *)param_value) = buf->sub_offset;
+    }
+    break;
+  }
+
+  return CL_SUCCESS;
+}
+
+#define IS_1D(image) (image->image_type == CL_MEM_OBJECT_IMAGE1D ||        \
+                      image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ||  \
+                      image->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+
+#define IS_2D(image) (image->image_type == CL_MEM_OBJECT_IMAGE2D ||        \
+                      image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+
+#define IS_3D(image) (image->image_type == CL_MEM_OBJECT_IMAGE3D)
+
+#define IS_ARRAY(image) (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || \
+                         image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+
+LOCAL cl_int
+cl_get_image_info(cl_mem mem,
+                  cl_image_info param_name,
+                  size_t param_value_size,
+                  void *param_value,
+                  size_t *param_value_size_ret)
+{
+  int err;
+  CHECK_IMAGE(mem, image);
+
+  switch(param_name)
+  {
+    FIELD_SIZE(IMAGE_FORMAT, cl_image_format);
+    FIELD_SIZE(IMAGE_ELEMENT_SIZE, size_t);
+    FIELD_SIZE(IMAGE_ROW_PITCH, size_t);
+    FIELD_SIZE(IMAGE_SLICE_PITCH, size_t);
+    FIELD_SIZE(IMAGE_WIDTH, size_t);
+    FIELD_SIZE(IMAGE_HEIGHT, size_t);
+    FIELD_SIZE(IMAGE_DEPTH, size_t);
+    FIELD_SIZE(IMAGE_ARRAY_SIZE, size_t);
+    FIELD_SIZE(IMAGE_BUFFER, cl_mem);
+    FIELD_SIZE(IMAGE_NUM_MIP_LEVELS, cl_uint);
+    FIELD_SIZE(IMAGE_NUM_SAMPLES, cl_uint);
+  default:
+    return CL_INVALID_VALUE;
+  }
+
+  switch(param_name)
+  {
+  case CL_IMAGE_FORMAT:
+    *(cl_image_format *)param_value = image->fmt;
+    break;
+  case CL_IMAGE_ELEMENT_SIZE:
+    *(size_t *)param_value = image->bpp;
+    break;
+  case CL_IMAGE_ROW_PITCH:
+    *(size_t *)param_value = image->row_pitch;
+    break;
+  case CL_IMAGE_SLICE_PITCH:
+    *(size_t *)param_value = image->slice_pitch;
+    break;
+  case CL_IMAGE_WIDTH:
+    *(size_t *)param_value = image->w;
+    break;
+  case CL_IMAGE_HEIGHT:
+    *(size_t *)param_value = IS_1D(image) ? 0 : image->h;
+    break;
+  case CL_IMAGE_DEPTH:
+    *(size_t *)param_value = IS_3D(image) ? image->depth : 0;
+    break;
+  case CL_IMAGE_ARRAY_SIZE:
+    *(size_t *)param_value = IS_ARRAY(image) ? image->depth : 0;
+    break;
+  case CL_IMAGE_BUFFER:
+    *(cl_mem *)param_value = image->buffer_1d;
+    break;
+  case CL_IMAGE_NUM_MIP_LEVELS:
+  case CL_IMAGE_NUM_SAMPLES:
+    *(cl_uint *)param_value = 0;
+    break;
+  }
+
+  return CL_SUCCESS;
+
+error:
+    return err;
+}
+
+#undef FIELD_SIZE
+
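The two switch blocks above implement the usual "validate the size, then write
the value" pattern of the info queries. A minimal sketch of the matching
application calls (mem and img stand for a valid buffer and image):

  #include <CL/cl.h>

  static void query_objects(cl_mem mem, cl_mem img)
  {
    size_t sz, row_pitch, w, h;
    clGetMemObjectInfo(mem, CL_MEM_SIZE, sizeof(sz), &sz, NULL);
    clGetImageInfo(img, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
    clGetImageInfo(img, CL_IMAGE_WIDTH, sizeof(w), &w, NULL);
    clGetImageInfo(img, CL_IMAGE_HEIGHT, sizeof(h), &h, NULL);
  }
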
+LOCAL cl_mem
+cl_mem_allocate(enum cl_mem_type type,
+                cl_context ctx,
+                cl_mem_flags flags,
+                size_t sz,
+                cl_int is_tiled,
+                cl_int *errcode)
+{
+  cl_buffer_mgr bufmgr = NULL;
+  cl_mem mem = NULL;
+  cl_int err = CL_SUCCESS;
+  size_t alignment = 64;
+
+  assert(ctx);
+
+  /* Allocate and initialize the structure itself */
+  if (type == CL_MEM_IMAGE_TYPE) {
+    struct _cl_mem_image *image = NULL;
+    TRY_ALLOC (image, CALLOC(struct _cl_mem_image));
+    mem = &image->base;
+  } else if (type == CL_MEM_GL_IMAGE_TYPE ) {
+    struct _cl_mem_gl_image *gl_image = NULL;
+    TRY_ALLOC (gl_image, CALLOC(struct _cl_mem_gl_image));
+    mem = &gl_image->base.base;
+  } else {
+    struct _cl_mem_buffer *buffer = NULL;
+    TRY_ALLOC (buffer, CALLOC(struct _cl_mem_buffer));
+    mem = &buffer->base;
+  }
+  mem->type = type;
+  SET_ICD(mem->dispatch)
+  mem->ref_n = 1;
+  mem->magic = CL_MAGIC_MEM_HEADER;
+  mem->flags = flags;
+
+  if (sz != 0) {
+    /* Pinning will require stricter alignment rules */
+    if ((flags & CL_MEM_PINNABLE) || is_tiled)
+      alignment = 4096;
+
+    /* Allocate space in memory */
+    bufmgr = cl_context_get_bufmgr(ctx);
+    assert(bufmgr);
+    mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment);
+    if (UNLIKELY(mem->bo == NULL)) {
+      err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+      goto error;
+    }
+    mem->size = sz;
+  }
+
+  cl_context_add_ref(ctx);
+  mem->ctx = ctx;
+  /* Append the buffer to the context buffer list */
+  pthread_mutex_lock(&ctx->buffer_lock);
+  mem->next = ctx->buffers;
+  if (ctx->buffers != NULL)
+    ctx->buffers->prev = mem;
+  ctx->buffers = mem;
+  pthread_mutex_unlock(&ctx->buffer_lock);
+
+exit:
+  if (errcode)
+    *errcode = err;
+  return mem;
+error:
+  cl_mem_delete(mem);
+  mem = NULL;
+  goto exit;
+
+}
+
+LOCAL cl_int
+is_valid_mem(cl_mem mem, cl_mem buffers)
+{
+  cl_mem tmp = buffers;
+  while(tmp){
+    if(mem == tmp){
+      if (UNLIKELY(mem->magic != CL_MAGIC_MEM_HEADER))
+        return CL_INVALID_MEM_OBJECT;
+      return CL_SUCCESS;
+    }
+    tmp = tmp->next;
+  }
+  return CL_INVALID_MEM_OBJECT;
+}
+
+LOCAL cl_mem
+cl_mem_new_buffer(cl_context ctx,
+                  cl_mem_flags flags,
+                  size_t sz,
+                  void *data,
+                  cl_int *errcode_ret)
+{
+  /* Possible mem type combination:
+       CL_MEM_ALLOC_HOST_PTR
+       CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR
+       CL_MEM_USE_HOST_PTR
+       CL_MEM_COPY_HOST_PTR   */
+
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = NULL;
+  cl_ulong max_mem_size;
+
+  if (UNLIKELY(sz == 0)) {
+    err = CL_INVALID_BUFFER_SIZE;
+    goto error;
+  }
+
+  if (UNLIKELY(((flags & CL_MEM_READ_WRITE)
+                  && (flags & (CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY)))
+		      || ((flags & CL_MEM_READ_ONLY) && (flags & (CL_MEM_WRITE_ONLY)))
+              || ((flags & CL_MEM_ALLOC_HOST_PTR) && (flags & CL_MEM_USE_HOST_PTR))
+              || ((flags & CL_MEM_COPY_HOST_PTR) && (flags & CL_MEM_USE_HOST_PTR))
+              || ((flags & CL_MEM_HOST_READ_ONLY) && (flags & CL_MEM_HOST_NO_ACCESS))
+              || ((flags & CL_MEM_HOST_READ_ONLY) && (flags & CL_MEM_HOST_WRITE_ONLY))
+              || ((flags & CL_MEM_HOST_WRITE_ONLY) && (flags & CL_MEM_HOST_NO_ACCESS))
+              || ((flags & (~(CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY
+                        | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR
+                        | CL_MEM_USE_HOST_PTR | CL_MEM_HOST_WRITE_ONLY
+                        | CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS))) != 0))) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  /* This flag is valid only if host_ptr is not NULL */
+  if (UNLIKELY((((flags & CL_MEM_COPY_HOST_PTR) ||
+                (flags & CL_MEM_USE_HOST_PTR)) &&
+                data == NULL))
+               || (!(flags & (CL_MEM_COPY_HOST_PTR
+                            |CL_MEM_USE_HOST_PTR))
+                    && (data != NULL))) {
+    err = CL_INVALID_HOST_PTR;
+    goto error;
+  }
+
+  /* CL_MEM_ALLOC_HOST_PTR and CL_MEM_USE_HOST_PTR
+     are mutually exclusive. */
+  if (UNLIKELY(flags & CL_MEM_ALLOC_HOST_PTR &&
+               flags & CL_MEM_USE_HOST_PTR)) {
+    err = CL_INVALID_HOST_PTR;
+    goto error;
+  }
+
+  /* CL_MEM_COPY_HOST_PTR and CL_MEM_USE_HOST_PTR
+     are mutually exclusive. */
+  if (UNLIKELY(flags & CL_MEM_COPY_HOST_PTR &&
+               flags & CL_MEM_USE_HOST_PTR)) {
+    err = CL_INVALID_HOST_PTR;
+    goto error;
+  }
+
+  if ((err = cl_get_device_info(ctx->device,
+                                CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                                sizeof(max_mem_size),
+                                &max_mem_size,
+                                NULL)) != CL_SUCCESS) {
+    goto error;
+  }
+
+  if (UNLIKELY(sz > max_mem_size)) {
+    err = CL_INVALID_BUFFER_SIZE;
+    goto error;
+  }
+
+  /* HSW: Byte scattered Read/Write has the limitation that
+     the buffer size must be a multiple of 4 bytes. */
+  sz = ALIGN(sz, 4);
+
+  /* Create the buffer in video memory */
+  mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, &err);
+  if (mem == NULL || err != CL_SUCCESS)
+    goto error;
+
+  /* Copy the data if required */
+  if (flags & CL_MEM_COPY_HOST_PTR || flags & CL_MEM_USE_HOST_PTR)
+    cl_buffer_subdata(mem->bo, 0, sz, data);
+
+  if (flags & CL_MEM_USE_HOST_PTR || flags & CL_MEM_COPY_HOST_PTR)
+    mem->host_ptr = data;
+
+exit:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem;
+error:
+  cl_mem_delete(mem);
+  mem = NULL;
+  goto exit;
+}
+
+LOCAL cl_mem
+cl_mem_new_sub_buffer(cl_mem buffer,
+                      cl_mem_flags flags,
+                      cl_buffer_create_type create_type,
+                      const void *create_info,
+                      cl_int *errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = NULL;
+  struct _cl_mem_buffer *sub_buf = NULL;
+
+  if (buffer->type != CL_MEM_BUFFER_TYPE) {
+    err = CL_INVALID_MEM_OBJECT;
+    goto error;
+  }
+
+  if (flags && (((buffer->flags & CL_MEM_WRITE_ONLY) && (flags & (CL_MEM_READ_WRITE|CL_MEM_READ_ONLY)))
+          || ((buffer->flags & CL_MEM_READ_ONLY) && (flags & (CL_MEM_READ_WRITE|CL_MEM_WRITE_ONLY)))
+          || (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR))
+          || ((flags & CL_MEM_HOST_READ_ONLY) && (flags & CL_MEM_HOST_NO_ACCESS))
+          || ((flags & CL_MEM_HOST_READ_ONLY) && (flags & CL_MEM_HOST_WRITE_ONLY))
+          || ((flags & CL_MEM_HOST_WRITE_ONLY) && (flags & CL_MEM_HOST_NO_ACCESS)))) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if((flags & (CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY | CL_MEM_READ_WRITE)) == 0) {
+    flags |= buffer->flags & (CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY | CL_MEM_READ_WRITE);
+  }
+  flags |= buffer->flags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR);
+  if((flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) == 0) {
+    flags |= buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS);
+  }
+
+  if (create_type != CL_BUFFER_CREATE_TYPE_REGION) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (!create_info) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  cl_buffer_region *info = (cl_buffer_region *)create_info;
+
+  if (!info->size) {
+    err = CL_INVALID_BUFFER_SIZE;
+    goto error;
+  }
+
+  if (info->origin > buffer->size || info->origin + info->size > buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (info->origin & (buffer->ctx->device->mem_base_addr_align / 8 - 1)) {
+    err = CL_MISALIGNED_SUB_BUFFER_OFFSET;
+    goto error;
+  }
+
+  /* Now create the sub buffer and link it to the buffer. */
+  TRY_ALLOC (sub_buf, CALLOC(struct _cl_mem_buffer));
+  mem = &sub_buf->base;
+  mem->type = CL_MEM_SUBBUFFER_TYPE;
+  SET_ICD(mem->dispatch)
+  mem->ref_n = 1;
+  mem->magic = CL_MAGIC_MEM_HEADER;
+  mem->flags = flags;
+  sub_buf->parent = (struct _cl_mem_buffer*)buffer;
+
+  cl_mem_add_ref(buffer);
+  /* Append the buffer to the parent buffer list */
+  pthread_mutex_lock(&((struct _cl_mem_buffer*)buffer)->sub_lock);
+  sub_buf->sub_next = ((struct _cl_mem_buffer*)buffer)->subs;
+  if (((struct _cl_mem_buffer*)buffer)->subs != NULL)
+    ((struct _cl_mem_buffer*)buffer)->subs->sub_prev = sub_buf;
+  ((struct _cl_mem_buffer*)buffer)->subs = sub_buf;
+  pthread_mutex_unlock(&((struct _cl_mem_buffer*)buffer)->sub_lock);
+
+  mem->bo = buffer->bo;
+  mem->size = info->size;
+  sub_buf->sub_offset = info->origin;
+  if (buffer->flags & CL_MEM_USE_HOST_PTR || buffer->flags & CL_MEM_COPY_HOST_PTR) {
+    mem->host_ptr = buffer->host_ptr;
+  }
+
+  cl_context_add_ref(buffer->ctx);
+  mem->ctx = buffer->ctx;
+  /* Append the buffer to the context buffer list */
+  pthread_mutex_lock(&buffer->ctx->buffer_lock);
+  mem->next = buffer->ctx->buffers;
+  if (buffer->ctx->buffers != NULL)
+    buffer->ctx->buffers->prev = mem;
+  buffer->ctx->buffers = mem;
+  pthread_mutex_unlock(&buffer->ctx->buffer_lock);
+
+exit:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem;
+error:
+  cl_mem_delete(mem);
+  mem = NULL;
+  goto exit;
+}
+
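cl_mem_new_sub_buffer() above backs clCreateSubBuffer(). A minimal sketch of a
valid call (buf stands for an existing buffer; the origin must be a multiple of
CL_DEVICE_MEM_BASE_ADDR_ALIGN / 8, otherwise CL_MISALIGNED_SUB_BUFFER_OFFSET is
returned, as checked above):

  #include <CL/cl.h>

  static cl_mem make_sub_buffer(cl_mem buf)
  {
    cl_buffer_region region = { .origin = 4096, .size = 1024 };
    cl_int err;
    return clCreateSubBuffer(buf, CL_MEM_READ_WRITE,
                             CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
  }
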
+void cl_mem_replace_buffer(cl_mem buffer, cl_buffer new_bo)
+{
+  cl_buffer_unreference(buffer->bo);
+  buffer->bo = new_bo;
+  cl_buffer_reference(new_bo);
+  if (buffer->type != CL_MEM_SUBBUFFER_TYPE)
+    return;
+
+  struct _cl_mem_buffer *it = ((struct _cl_mem_buffer*)buffer)->sub_next;
+  for( ; it != (struct _cl_mem_buffer*)buffer; it = it->sub_next)
+  {
+    cl_buffer_unreference(it->base.bo);
+    it->base.bo = new_bo;
+    cl_buffer_reference(new_bo);
+  }
+}
+
+void
+cl_mem_copy_image_region(const size_t *origin, const size_t *region,
+                         void *dst, size_t dst_row_pitch, size_t dst_slice_pitch,
+                         const void *src, size_t src_row_pitch, size_t src_slice_pitch,
+                         const struct _cl_mem_image *image, cl_bool offset_dst, cl_bool offset_src)
+{
+  if(offset_dst) {
+    size_t dst_offset = image->bpp * origin[0] + dst_row_pitch * origin[1] + dst_slice_pitch * origin[2];
+    dst = (char*)dst + dst_offset;
+  }
+  if(offset_src) {
+    size_t src_offset = image->bpp * origin[0] + src_row_pitch * origin[1] + src_slice_pitch * origin[2];
+    src = (char*)src + src_offset;
+  }
+  if (!origin[0] && region[0] == image->w && dst_row_pitch == src_row_pitch &&
+      (region[2] == 1 || (!origin[1] && region[1] == image->h && dst_slice_pitch == src_slice_pitch)))
+  {
+    memcpy(dst, src, region[2] == 1 ? src_row_pitch*region[1] : src_slice_pitch*region[2]);
+  }
+  else {
+    cl_uint y, z;
+    for (z = 0; z < region[2]; z++) {
+      const char* src_ptr = src;
+      char* dst_ptr = dst;
+      for (y = 0; y < region[1]; y++) {
+        memcpy(dst_ptr, src_ptr, image->bpp*region[0]);
+        src_ptr += src_row_pitch;
+        dst_ptr += dst_row_pitch;
+      }
+      src = (char*)src + src_slice_pitch;
+      dst = (char*)dst + dst_slice_pitch;
+    }
+  }
+}
+
+void
+cl_mem_copy_image_to_image(const size_t *dst_origin,const size_t *src_origin, const size_t *region,
+                           const struct _cl_mem_image *dst_image, const struct _cl_mem_image *src_image)
+{
+  char* dst= cl_mem_map_auto((cl_mem)dst_image);
+  char* src= cl_mem_map_auto((cl_mem)src_image);
+  size_t dst_offset = dst_image->bpp * dst_origin[0] + dst_image->row_pitch * dst_origin[1] + dst_image->slice_pitch * dst_origin[2];
+  size_t src_offset = src_image->bpp * src_origin[0] + src_image->row_pitch * src_origin[1] + src_image->slice_pitch * src_origin[2];
+  dst= (char*)dst+ dst_offset;
+  src= (char*)src+ src_offset;
+  cl_uint y, z;
+  for (z = 0; z < region[2]; z++) {
+    const char* src_ptr = src;
+    char* dst_ptr = dst;
+    for (y = 0; y < region[1]; y++) {
+      memcpy(dst_ptr, src_ptr, src_image->bpp*region[0]);
+      src_ptr += src_image->row_pitch;
+      dst_ptr += dst_image->row_pitch;
+    }
+    src = (char*)src + src_image->slice_pitch;
+    dst = (char*)dst + dst_image->slice_pitch;
+  }
+
+  cl_mem_unmap_auto((cl_mem)src_image);
+  cl_mem_unmap_auto((cl_mem)dst_image);
+
+}
+
+static void
+cl_mem_copy_image(struct _cl_mem_image *image,
+		  size_t row_pitch,
+		  size_t slice_pitch,
+		  void* host_ptr)
+{
+  char* dst_ptr = cl_mem_map_auto((cl_mem)image);
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {image->w, image->h, image->depth};
+
+  cl_mem_copy_image_region(origin, region, dst_ptr, image->row_pitch, image->slice_pitch,
+                           host_ptr, row_pitch, slice_pitch, image, CL_FALSE, CL_FALSE); //offset is 0
+  cl_mem_unmap_auto((cl_mem)image);
+}
+
+static const uint32_t tile_sz = 4096; /* 4KB per tile */
+static const uint32_t tilex_w = 512;  /* tileX width in bytes */
+static const uint32_t tilex_h = 8;    /* tileX height in number of rows */
+static const uint32_t tiley_w = 128;  /* tileY width in bytes */
+static const uint32_t tiley_h = 32;   /* tileY height in number of rows */
+static const uint32_t valign = 2;     /* vertical alignment is 2. */
+
+cl_image_tiling_t cl_get_default_tiling(void)
+{
+  static int initialized = 0;
+  static cl_image_tiling_t tiling = CL_TILE_X;
+  if (!initialized) {
+    char *tilingStr = getenv("OCL_TILING");
+    if (tilingStr != NULL) {
+      switch (tilingStr[0]) {
+        case '0': tiling = CL_NO_TILE; break;
+        case '1': tiling = CL_TILE_X; break;
+        case '2': tiling = CL_TILE_Y; break;
+        default:
+          break;
+      }
+    }
+    initialized = 1;
+  }
+
+  return tiling;
+}
+
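cl_get_default_tiling() reads the OCL_TILING environment variable once per
process, so an application or test harness can override the tiling mode before
the first image is created, e.g. early in main() (setenv() is POSIX, from
<stdlib.h>):

  setenv("OCL_TILING", "0", 1);  /* '0' = CL_NO_TILE, '1' = CL_TILE_X, '2' = CL_TILE_Y */
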
+static cl_mem
+_cl_mem_new_image(cl_context ctx,
+                  cl_mem_flags flags,
+                  const cl_image_format *fmt,
+                  const cl_mem_object_type orig_image_type,
+                  size_t w,
+                  size_t h,
+                  size_t depth,
+                  size_t pitch,
+                  size_t slice_pitch,
+                  void *data,
+                  cl_int *errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = NULL;
+  cl_mem_object_type image_type = orig_image_type;
+  uint32_t bpp = 0, intel_fmt = INTEL_UNSUPPORTED_FORMAT;
+  size_t sz = 0, aligned_pitch = 0, aligned_slice_pitch = 0, aligned_h = 0;
+  cl_image_tiling_t tiling = CL_NO_TILE;
+
+  /* Check flags consistency */
+  if (UNLIKELY((flags & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR)) && data == NULL)) {
+    err = CL_INVALID_HOST_PTR;
+    goto error;
+  }
+
+  /* Get the size of each pixel */
+  if (UNLIKELY((err = cl_image_byte_per_pixel(fmt, &bpp)) != CL_SUCCESS))
+    goto error;
+
+  /* Only a sub-set of the formats are supported */
+  intel_fmt = cl_image_get_intel_format(fmt);
+  if (UNLIKELY(intel_fmt == INTEL_UNSUPPORTED_FORMAT)) {
+    err = CL_IMAGE_FORMAT_NOT_SUPPORTED;
+    goto error;
+  }
+
+  /* See if the user parameters match */
+#define DO_IMAGE_ERROR            \
+  do {                            \
+    err = CL_INVALID_IMAGE_SIZE;  \
+    goto error;                   \
+  } while (0);
+
+  if (UNLIKELY(w == 0)) DO_IMAGE_ERROR;
+  if (UNLIKELY(h == 0 && (image_type != CL_MEM_OBJECT_IMAGE1D &&
+      image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY &&
+      image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER)))
+    DO_IMAGE_ERROR;
+
+  if (image_type == CL_MEM_OBJECT_IMAGE1D ||
+      image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
+    size_t min_pitch = bpp * w;
+    if (data && pitch == 0)
+      pitch = min_pitch;
+
+    h = 1;
+    depth = 1;
+    if (UNLIKELY(w > ctx->device->image2d_max_width)) DO_IMAGE_ERROR;
+    if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR;
+    if (UNLIKELY(data && (slice_pitch % pitch != 0))) DO_IMAGE_ERROR;
+    if (UNLIKELY(!data && pitch != 0)) DO_IMAGE_ERROR;
+    if (UNLIKELY(!data && slice_pitch != 0)) DO_IMAGE_ERROR;
+    tiling = CL_NO_TILE;
+  } else if (image_type == CL_MEM_OBJECT_IMAGE2D) {
+    size_t min_pitch = bpp * w;
+    if (data && pitch == 0)
+      pitch = min_pitch;
+    if (UNLIKELY(w > ctx->device->image2d_max_width)) DO_IMAGE_ERROR;
+    if (UNLIKELY(h > ctx->device->image2d_max_height)) DO_IMAGE_ERROR;
+    if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR;
+    if (UNLIKELY(!data && pitch != 0)) DO_IMAGE_ERROR;
+
+    /* Pick up tiling mode (we do only linear on SNB) */
+    if (cl_driver_get_ver(ctx->drv) != 6)
+      tiling = cl_get_default_tiling();
+
+    depth = 1;
+  } else if (image_type == CL_MEM_OBJECT_IMAGE3D ||
+             image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY ||
+             image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+    if (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+      h = 1;
+      tiling = CL_NO_TILE;
+    } else if (cl_driver_get_ver(ctx->drv) != 6)
+      tiling = cl_get_default_tiling();
+
+    size_t min_pitch = bpp * w;
+    if (data && pitch == 0)
+      pitch = min_pitch;
+    size_t min_slice_pitch = pitch * h;
+    if (data && slice_pitch == 0)
+      slice_pitch = min_slice_pitch;
+    if (UNLIKELY(w > ctx->device->image3d_max_width)) DO_IMAGE_ERROR;
+    if (UNLIKELY(h > ctx->device->image3d_max_height)) DO_IMAGE_ERROR;
+    if (image_type == CL_MEM_OBJECT_IMAGE3D &&
+       (UNLIKELY(depth > ctx->device->image3d_max_depth))) DO_IMAGE_ERROR
+    else if (UNLIKELY(depth > ctx->device->image_max_array_size)) DO_IMAGE_ERROR;
+    if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR;
+    if (UNLIKELY(data && min_slice_pitch > slice_pitch)) DO_IMAGE_ERROR;
+    if (UNLIKELY(!data && pitch != 0)) DO_IMAGE_ERROR;
+    if (UNLIKELY(!data && slice_pitch != 0)) DO_IMAGE_ERROR;
+
+  } else
+    assert(0);
+
+#undef DO_IMAGE_ERROR
+
+  /* Tiling requires to align both pitch and height */
+  if (tiling == CL_NO_TILE) {
+    aligned_pitch = w * bpp;
+    aligned_h  = ALIGN(h, valign);
+  } else if (tiling == CL_TILE_X) {
+    aligned_pitch = ALIGN(w * bpp, tilex_w);
+    aligned_h     = ALIGN(h, tilex_h);
+  } else if (tiling == CL_TILE_Y) {
+    aligned_pitch = ALIGN(w * bpp, tiley_w);
+    aligned_h     = ALIGN(h, tiley_h);
+  }
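+  /* Worked example (a sketch, assuming the usual 512-byte x 8-row X-tile and
+   * 128-byte x 32-row Y-tile geometry behind the tilex_*/tiley_* constants):
+   * a 100x60 CL_RGBA / CL_UNORM_INT8 image has bpp = 4, so w * bpp = 400.
+   * With Y-tiling this gives aligned_pitch = ALIGN(400, 128) = 512 and
+   * aligned_h = ALIGN(60, 32) = 64, i.e. sz = 512 * 64 * 1 = 32768 bytes below. */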
+
+  sz = aligned_pitch * aligned_h * depth;
+
+  /* If sz is larger than 128MB, mapping the gtt may fail on some systems.
+     Since there is no obvious performance drop, disable tiling in that case. */
+  if(tiling != CL_NO_TILE && sz > MAX_TILING_SIZE) {
+    tiling = CL_NO_TILE;
+    aligned_pitch = w * bpp;
+    aligned_h     = h;
+    sz = aligned_pitch * aligned_h * depth;
+  }
+
+  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, &err);
+  if (mem == NULL || err != CL_SUCCESS)
+    goto error;
+
+  cl_buffer_set_tiling(mem->bo, tiling, aligned_pitch);
+  if (image_type == CL_MEM_OBJECT_IMAGE1D ||
+      image_type == CL_MEM_OBJECT_IMAGE2D ||
+      image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+    aligned_slice_pitch = 0;
+  else
+    aligned_slice_pitch = aligned_pitch * ALIGN(h, 2);
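+  /* For arrays and 3D images each slice therefore starts on a 2-row-aligned
+   * boundary (ALIGN(h, 2)), matching the vertical alignment requirement noted
+   * in _cl_mem_new_image_from_buffer() below. */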
+
+  cl_mem_image_init(cl_mem_image(mem), w, h, image_type, depth, *fmt,
+                    intel_fmt, bpp, aligned_pitch, aligned_slice_pitch, tiling,
+                    0, 0, 0);
+
+  /* Copy the data if required */
+  if (flags & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR)) {
+    cl_mem_copy_image(cl_mem_image(mem), pitch, slice_pitch, data);
+    if (flags & CL_MEM_USE_HOST_PTR) {
+      mem->host_ptr = data;
+      cl_mem_image(mem)->host_row_pitch = pitch;
+      cl_mem_image(mem)->host_slice_pitch = slice_pitch;
+    }
+  }
+
+exit:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem;
+error:
+  cl_mem_delete(mem);
+  mem = NULL;
+  goto exit;
+}
+
+static cl_mem
+_cl_mem_new_image_from_buffer(cl_context ctx,
+                              cl_mem_flags flags,
+                              const cl_image_format* image_format,
+                              const cl_image_desc *image_desc,
+                              cl_int *errcode_ret)
+{
+  cl_mem image = NULL;
+  cl_mem buffer = image_desc->buffer;
+  cl_int err = CL_SUCCESS;
+  *errcode_ret = err;
+  cl_ulong max_size;
+  cl_mem_flags merged_flags;
+  uint32_t bpp;
+  uint32_t intel_fmt = INTEL_UNSUPPORTED_FORMAT;
+  size_t offset = 0;
+
+  /* Get the size of each pixel */
+  if (UNLIKELY((err = cl_image_byte_per_pixel(image_format, &bpp)) != CL_SUCCESS))
+    goto error;
+
+  /* Only a sub-set of the formats are supported */
+  intel_fmt = cl_image_get_intel_format(image_format);
+  if (UNLIKELY(intel_fmt == INTEL_UNSUPPORTED_FORMAT)) {
+    err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
+    goto error;
+  }
+
+  if (!buffer) {
+    err = CL_INVALID_IMAGE_DESCRIPTOR;
+    goto error;
+  }
+
+  if (flags & (CL_MEM_USE_HOST_PTR|CL_MEM_ALLOC_HOST_PTR|CL_MEM_COPY_HOST_PTR)) {
+    err = CL_INVALID_IMAGE_DESCRIPTOR;
+    goto error;
+  }
+
+  /* access check. */
+  if ((buffer->flags & CL_MEM_WRITE_ONLY) &&
+      (flags & (CL_MEM_READ_WRITE|CL_MEM_READ_ONLY))) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  if ((buffer->flags & CL_MEM_READ_ONLY) &&
+      (flags & (CL_MEM_READ_WRITE|CL_MEM_WRITE_ONLY))) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  if ((buffer->flags & CL_MEM_HOST_WRITE_ONLY) &&
+      (flags & CL_MEM_HOST_READ_ONLY)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  if ((buffer->flags & CL_MEM_HOST_READ_ONLY) &&
+      (flags & CL_MEM_HOST_WRITE_ONLY)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+  if ((buffer->flags & CL_MEM_HOST_NO_ACCESS) &&
+      (flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY))) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if ((err = cl_get_device_info(ctx->device,
+                                CL_DEVICE_IMAGE_MAX_BUFFER_SIZE,
+                                sizeof(max_size),
+                                &max_size,
+                                NULL)) != CL_SUCCESS) {
+    goto error;
+  }
+
+  if (image_desc->image_width > max_size) {
+    err = CL_INVALID_IMAGE_DESCRIPTOR;
+    goto error;
+  }
+
+  if (image_desc->image_width*bpp > buffer->size) {
+    err = CL_INVALID_IMAGE_DESCRIPTOR;
+    goto error;
+  }
+
+  merged_flags = buffer->flags;
+  if (flags & (CL_MEM_READ_WRITE|CL_MEM_READ_ONLY|CL_MEM_WRITE_ONLY)) {
+    merged_flags &= ~(CL_MEM_READ_WRITE|CL_MEM_READ_ONLY|CL_MEM_WRITE_ONLY);
+    merged_flags |= flags & (CL_MEM_READ_WRITE|CL_MEM_READ_ONLY|CL_MEM_WRITE_ONLY);
+  }
+  if (flags & (CL_MEM_HOST_WRITE_ONLY|CL_MEM_HOST_READ_ONLY|CL_MEM_HOST_NO_ACCESS)) {
+    merged_flags &= ~(CL_MEM_HOST_WRITE_ONLY|CL_MEM_HOST_READ_ONLY|CL_MEM_HOST_NO_ACCESS);
+    merged_flags |= flags & (CL_MEM_HOST_WRITE_ONLY|CL_MEM_HOST_READ_ONLY|CL_MEM_HOST_NO_ACCESS);
+  }
+  struct _cl_mem_buffer *mem_buffer = (struct _cl_mem_buffer*)buffer;
+  if (buffer->type == CL_MEM_SUBBUFFER_TYPE) {
+    offset = ((struct _cl_mem_buffer *)buffer)->sub_offset;
+    mem_buffer = mem_buffer->parent;
+  }
+  /* Get the size of each pixel */
+  if (UNLIKELY((err = cl_image_byte_per_pixel(image_format, &bpp)) != CL_SUCCESS))
+    goto error;
+
+  // Per the bspec, an image needs at least a 2-line vertical alignment, so we
+  // cannot simply attach the buffer to a 1D image surface of the same size.
+  // Instead we create a new image, copy the buffer data into it, and then
+  // redirect the buffer object's references to this image.
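+  // Note that the new image is created with width = buffer size / bpp so that
+  // it aliases the whole backing buffer; the user-visible width is set back to
+  // image_desc->image_width once the new bo has been swapped in below.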
+  image = _cl_mem_new_image(ctx, flags, image_format, image_desc->image_type,
+                    mem_buffer->base.size / bpp, 0, 0, 0, 0, NULL, errcode_ret);
+  if (image == NULL)
+    return NULL;
+  void *src = cl_mem_map(buffer);
+  void *dst = cl_mem_map(image);
+  //
+  // FIXME: we could use copy-buffer-to-image to do this on the GPU later;
+  // currently the copy-buffer-to-image path doesn't support 1D images.
+  //
+  // There is a potential risk that this buffer was mapped earlier and the
+  // caller still holds the pointer and wants to access it again. This scenario
+  // is not explicitly forbidden by the spec, although it should not be permitted.
+  memcpy(dst, src, mem_buffer->base.size);
+  cl_mem_unmap(buffer);
+  cl_mem_unmap(image);
+
+  if (err != 0)
+    goto error;
+ 
+  // Now replace the buffer's bo with this new bo, taking care of the
+  // sub-buffer case.
+  cl_mem_replace_buffer(buffer, image->bo);
+  /* Now point to the right offset if buffer is a SUB_BUFFER. */
+  if (buffer->flags & CL_MEM_USE_HOST_PTR)
+    image->host_ptr = buffer->host_ptr + offset;
+  cl_mem_image(image)->offset = offset;
+  cl_mem_image(image)->w = image_desc->image_width;
+  cl_mem_add_ref(buffer);
+  cl_mem_image(image)->buffer_1d = buffer;
+  return image;
+
+error:
+  if (image)
+    cl_mem_delete(image);
+  image = NULL;
+  *errcode_ret = err;
+  return image;
+}
+
+LOCAL cl_mem
+cl_mem_new_image(cl_context context,
+                 cl_mem_flags flags,
+                 const cl_image_format *image_format,
+                 const cl_image_desc *image_desc,
+                 void *host_ptr,
+                 cl_int *errcode_ret)
+{
+  switch (image_desc->image_type) {
+  case CL_MEM_OBJECT_IMAGE1D:
+  case CL_MEM_OBJECT_IMAGE2D:
+  case CL_MEM_OBJECT_IMAGE3D:
+    return _cl_mem_new_image(context, flags, image_format, image_desc->image_type,
+                             image_desc->image_width, image_desc->image_height, image_desc->image_depth,
+                             image_desc->image_row_pitch, image_desc->image_slice_pitch,
+                             host_ptr, errcode_ret);
+  case CL_MEM_OBJECT_IMAGE1D_ARRAY:
+  case CL_MEM_OBJECT_IMAGE2D_ARRAY:
+    return _cl_mem_new_image(context, flags, image_format, image_desc->image_type,
+                             image_desc->image_width, image_desc->image_height, image_desc->image_array_size,
+                             image_desc->image_row_pitch, image_desc->image_slice_pitch,
+                             host_ptr, errcode_ret);
+  case CL_MEM_OBJECT_IMAGE1D_BUFFER:
+    return _cl_mem_new_image_from_buffer(context, flags, image_format,
+                                         image_desc, errcode_ret);
+    break;
+  case CL_MEM_OBJECT_BUFFER:
+  default:
+    assert(0);
+  }
+  return NULL;
+}
+
+LOCAL void
+cl_mem_delete(cl_mem mem)
+{
+  cl_int i;
+  if (UNLIKELY(mem == NULL))
+    return;
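+  /* atomic_dec() is expected to return the value prior to the decrement, so a
+   * result greater than 1 means other references are still alive and the
+   * object must not be destroyed yet. */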
+  if (atomic_dec(&mem->ref_n) > 1)
+    return;
+#ifdef HAS_EGL
+  if (UNLIKELY(IS_GL_IMAGE(mem))) {
+     cl_mem_gl_delete(cl_mem_gl_image(mem));
+  }
+#endif
+
+  /* If this is an image created from a buffer, release that 1D buffer. */
+  if (IS_IMAGE(mem)) {
+    if (cl_mem_image(mem)->buffer_1d) {
+      assert(cl_mem_image(mem)->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER);
+      cl_mem_delete(cl_mem_image(mem)->buffer_1d);
+      cl_mem_image(mem)->buffer_1d = NULL;
+    }
+  }
+
+  /* Remove it from the list */
+  assert(mem->ctx);
+  pthread_mutex_lock(&mem->ctx->buffer_lock);
+    if (mem->prev)
+      mem->prev->next = mem->next;
+    if (mem->next)
+      mem->next->prev = mem->prev;
+    if (mem->ctx->buffers == mem)
+      mem->ctx->buffers = mem->next;
+  pthread_mutex_unlock(&mem->ctx->buffer_lock);
+  cl_context_delete(mem->ctx);
+
+  /* Someone still mapped, unmap */
+  if(mem->map_ref > 0) {
+    assert(mem->mapped_ptr);
+    for(i=0; i<mem->mapped_ptr_sz; i++) {
+      if(mem->mapped_ptr[i].ptr != NULL) {
+        mem->map_ref--;
+        cl_mem_unmap_auto(mem);
+      }
+    }
+    assert(mem->map_ref == 0);
+  }
+
+  if (mem->mapped_ptr)
+    free(mem->mapped_ptr);
+
+  if (mem->dstr_cb) {
+    cl_mem_dstr_cb *cb = mem->dstr_cb;
+    while (mem->dstr_cb) {
+      cb = mem->dstr_cb;
+      cb->pfn_notify(mem, cb->user_data);
+      mem->dstr_cb = cb->next;
+      free(cb);
+    }
+  }
+
+  /* If this is a sub-buffer, do not release the bo here; just drop the parent reference. */
+  if (mem->type == CL_MEM_SUBBUFFER_TYPE) {
+    struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+    /* Remove it from the parent's list */
+    assert(buffer->parent);
+    pthread_mutex_lock(&buffer->parent->sub_lock);
+    if (buffer->sub_prev)
+      buffer->sub_prev->sub_next = buffer->sub_next;
+    if (buffer->sub_next)
+      buffer->sub_next->sub_prev = buffer->sub_prev;
+    if (buffer->parent->subs == buffer)
+      buffer->parent->subs = buffer->sub_next;
+    pthread_mutex_unlock(&buffer->parent->sub_lock);
+    cl_mem_delete((cl_mem )(buffer->parent));
+  } else if (LIKELY(mem->bo != NULL)) {
+    cl_buffer_unreference(mem->bo);
+  }
+
+  cl_free(mem);
+}
+
+LOCAL void
+cl_mem_add_ref(cl_mem mem)
+{
+  assert(mem);
+  atomic_inc(&mem->ref_n);
+}
+
+#define LOCAL_SZ_0   16
+#define LOCAL_SZ_1   4
+#define LOCAL_SZ_2   4
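+/* Work-group sizes used by the internal copy/fill kernels below.  Global sizes
+   are rounded up with ((n + LOCAL_SZ - 1) / LOCAL_SZ) * LOCAL_SZ, e.g. n = 50
+   with LOCAL_SZ_0 = 16 launches 64 work items; the extra items are presumably
+   discarded by a bounds check inside the kernels (their source is not shown
+   here). */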
+
+LOCAL cl_int
+cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+            size_t src_offset, size_t dst_offset, size_t cb)
+{
+  cl_int ret = CL_SUCCESS;
+  cl_kernel ker = NULL;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {1,1,1};
+  const unsigned int masks[4] = {0xffffffff, 0x0ff, 0x0ffff, 0x0ffffff};
+  int aligned = 0;
+  int dw_src_offset = src_offset/4;
+  int dw_dst_offset = dst_offset/4;
+
+  if (!cb)
+    return ret;
+
+  /* We use one kernel to copy the data. The kernel is lazily created. */
+  assert(src_buf->ctx == dst_buf->ctx);
+
+  /* All 16 bytes aligned, fast and easy one. */
+  if((cb % 16 == 0) && (src_offset % 16 == 0) && (dst_offset % 16 == 0)) {
+    extern char cl_internal_copy_buf_align16_str[];
+    extern size_t cl_internal_copy_buf_align16_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN16,
+             cl_internal_copy_buf_align16_str, (size_t)cl_internal_copy_buf_align16_str_size, NULL);
+    cb = cb/16;
+    aligned = 1;
+  } else if ((cb % 4 == 0) && (src_offset % 4 == 0) && (dst_offset % 4 == 0)) { /* all Dword aligned.*/
+    extern char cl_internal_copy_buf_align4_str[];
+    extern size_t cl_internal_copy_buf_align4_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN4,
+             cl_internal_copy_buf_align4_str, (size_t)cl_internal_copy_buf_align4_str_size, NULL);
+    cb = cb/4;
+    aligned = 1;
+  }
+
+  if (aligned) {
+    if (!ker)
+      return CL_OUT_OF_RESOURCES;
+
+    if (cb < LOCAL_SZ_0) {
+      local_sz[0] = 1;
+    } else {
+      local_sz[0] = LOCAL_SZ_0;
+    }
+    global_sz[0] = ((cb + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
+    cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+    cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
+    cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+    cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
+    cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
+    ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+    return ret;
+  }
+
+  /* Now handle the unaligned cases. */
+  int dw_num = ((dst_offset % 4 + cb) + 3) / 4;
+  unsigned int first_mask = dst_offset % 4 == 0 ? 0x0 : masks[dst_offset % 4];
+  unsigned int last_mask = masks[(dst_offset + cb) % 4];
+  /* handle the very small range copy. */
+  if (cb < 4 && dw_num == 1) {
+    first_mask = first_mask | ~last_mask;
+  }
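+  /* Example (the interpretation of the masks is a best guess, since the kernel
+   * source is not shown here): masks[k] covers the low k bytes, so for
+   * dst_offset % 4 == 1 and cb == 2 we get dw_num = 1, first_mask = 0xff (head
+   * bytes to preserve) and last_mask = masks[3] = 0xffffff (bytes covered up to
+   * the end).  Since cb < 4 and dw_num == 1, first_mask |= ~last_mask gives
+   * 0xff0000ff, i.e. bytes 0 and 3 of the single destination dword are kept and
+   * bytes 1-2 are written. */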
+
+  if (cb < LOCAL_SZ_0) {
+    local_sz[0] = 1;
+  } else {
+    local_sz[0] = LOCAL_SZ_0;
+  }
+  global_sz[0] = ((dw_num + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
+
+  if (src_offset % 4 == dst_offset % 4) {
+    /* Src and dst have the same unaligned offset, so only the
+       head and tail dwords need special handling. */
+    extern char cl_internal_copy_buf_unalign_same_offset_str[];
+    extern size_t cl_internal_copy_buf_unalign_same_offset_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_UNALIGN_SAME_OFFSET,
+             cl_internal_copy_buf_unalign_same_offset_str,
+             (size_t)cl_internal_copy_buf_unalign_same_offset_str_size, NULL);
+
+    if (!ker)
+      return CL_OUT_OF_RESOURCES;
+
+    cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+    cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
+    cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+    cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
+    cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num);
+    cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
+    cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
+    ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+    return ret;
+  }
+
+  /* Dst's offset < src's offset, so one dst dword needs two sequential src dwords to fill it. */
+  if (dst_offset % 4 < src_offset % 4) {
+    extern char cl_internal_copy_buf_unalign_dst_offset_str[];
+    extern size_t cl_internal_copy_buf_unalign_dst_offset_str_size;
+
+    int align_diff = src_offset % 4 - dst_offset % 4;
+    unsigned int dw_mask = masks[align_diff];
+    int shift = align_diff * 8;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET,
+             cl_internal_copy_buf_unalign_dst_offset_str,
+             (size_t)cl_internal_copy_buf_unalign_dst_offset_str_size, NULL);
+
+    if (!ker)
+      return CL_OUT_OF_RESOURCES;
+
+    cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+    cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
+    cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+    cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
+    cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num);
+    cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
+    cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
+    cl_kernel_set_arg(ker, 7, sizeof(int), &shift);
+    cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask);
+    ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+    return ret;
+  }
+
+  /* Dst's offset > src's offset, so one dst dword needs bytes from two sequential src dwords to fill it. */
+  if (dst_offset % 4 > src_offset % 4) {
+    extern char cl_internal_copy_buf_unalign_src_offset_str[];
+    extern size_t cl_internal_copy_buf_unalign_src_offset_str_size;
+
+    int align_diff = dst_offset % 4 - src_offset % 4;
+    unsigned int dw_mask = masks[4 - align_diff];
+    int shift = align_diff * 8;
+    int src_less = !(src_offset % 4) && !((src_offset + cb) % 4);
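+    /* Sketch of the intent (the kernel source is not shown here): each
+     * destination dword is assembled from two neighbouring source dwords
+     * shifted by align_diff bytes, with dw_mask selecting the bytes taken from
+     * each; src_less appears to flag the case where the fully dword-aligned
+     * source spans one dword fewer than the destination. */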
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET,
+             cl_internal_copy_buf_unalign_src_offset_str,
+             (size_t)cl_internal_copy_buf_unalign_src_offset_str_size, NULL);
+
+    if (!ker)
+      return CL_OUT_OF_RESOURCES;
+
+    cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+    cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
+    cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
+    cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
+    cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num);
+    cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
+    cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
+    cl_kernel_set_arg(ker, 7, sizeof(int), &shift);
+    cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask);
+    cl_kernel_set_arg(ker, 9, sizeof(int), &src_less);
+    ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+    return ret;
+  }
+
+  /* No remaining case should be reachable here. */
+  assert(0);
+
+  return ret;
+}
+
+LOCAL cl_int
+cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image* src_image,
+           const size_t * origin, const size_t * region)
+{
+  cl_int ret = CL_SUCCESS;
+  cl_kernel ker = NULL;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
+
+  if(region[1] == 1) local_sz[1] = 1;
+  if(region[2] == 1) local_sz[2] = 1;
+  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+  if(src_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
+    extern char cl_internal_fill_image_1d_str[];
+    extern size_t cl_internal_fill_image_1d_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_1D,
+        cl_internal_fill_image_1d_str, (size_t)cl_internal_fill_image_1d_str_size, NULL);
+  }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+    extern char cl_internal_fill_image_1d_array_str[];
+    extern size_t cl_internal_fill_image_1d_array_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_1D_ARRAY,
+        cl_internal_fill_image_1d_array_str, (size_t)cl_internal_fill_image_1d_array_str_size, NULL);
+  }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+    extern char cl_internal_fill_image_2d_str[];
+    extern size_t cl_internal_fill_image_2d_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_2D,
+        cl_internal_fill_image_2d_str, (size_t)cl_internal_fill_image_2d_str_size, NULL);
+  }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+    extern char cl_internal_fill_image_2d_array_str[];
+    extern size_t cl_internal_fill_image_2d_array_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_2D_ARRAY,
+        cl_internal_fill_image_2d_array_str, (size_t)cl_internal_fill_image_2d_array_str_size, NULL);
+  }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+    extern char cl_internal_fill_image_3d_str[];
+    extern size_t cl_internal_fill_image_3d_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_3D,
+        cl_internal_fill_image_3d_str, (size_t)cl_internal_fill_image_3d_str_size, NULL);
+  }else{
+    return CL_IMAGE_FORMAT_NOT_SUPPORTED;
+  }
+
+  if (!ker)
+    return CL_OUT_OF_RESOURCES;
+
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image);
+  cl_kernel_set_arg(ker, 1, sizeof(float)*4, pattern);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
+  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin[0]);
+  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &origin[1]);
+  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &origin[2]);
+
+  ret = cl_command_queue_ND_range(queue, ker, 3, global_off, global_sz, local_sz);
+  return ret;
+}
+
+LOCAL cl_int
+cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
+            cl_mem buffer, size_t offset, size_t size)
+{
+  cl_int ret = CL_SUCCESS;
+  cl_kernel ker = NULL;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {1,1,1};
+  char pattern_comb[4];
+  int is_128 = 0;
+  const void * pattern1 = NULL;
+
+  assert(offset % pattern_size == 0);
+  assert(size % pattern_size == 0);
+
+  if (!size)
+    return ret;
+
+  if (pattern_size == 128) {
+    /* A 128-byte pattern corresponds to a double16, but double does not work
+       well on some platforms, so we use two float16 halves instead. */
+    extern char cl_internal_fill_buf_align128_str[];
+    extern size_t cl_internal_fill_buf_align128_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN128,
+               cl_internal_fill_buf_align128_str, (size_t)cl_internal_fill_buf_align128_str_size, NULL);
+    is_128 = 1;
+    pattern_size = pattern_size / 2;
+    pattern1 = pattern + pattern_size;
+    size = size / 2;
+  } else if (pattern_size % 8 == 0) { /* Handle the 8 16 32 64 cases here. */
+    extern char cl_internal_fill_buf_align8_str[];
+    extern size_t cl_internal_fill_buf_align8_str_size;
+    int order = ffs(pattern_size / 8) - 1;
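+    /* ffs() maps pattern_size 8/16/32/64 to order 0/1/2/3, which then selects
+     * the matching fill specialisation, assuming the CL_ENQUEUE_FILL_BUFFER_ALIGN8_8
+     * enum values for the four sizes are laid out consecutively. */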
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 + order,
+               cl_internal_fill_buf_align8_str, (size_t)cl_internal_fill_buf_align8_str_size, NULL);
+  } else if (pattern_size == 4) {
+    extern char cl_internal_fill_buf_align4_str[];
+    extern size_t cl_internal_fill_buf_align4_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN4,
+               cl_internal_fill_buf_align4_str, (size_t)cl_internal_fill_buf_align4_str_size, NULL);
+  } else if (size >= 4 && size % 4 == 0 && offset % 4 == 0) {
+    /* Unaligned pattern sizes (1 or 2 bytes). If the fill size and offset are
+       4-byte aligned, we can fake a 4-byte pattern by duplicating the original
+       one and reuse the aligned-4 fill kernel. */
+    assert(pattern_size == 1 || pattern_size == 2);
+    extern char cl_internal_fill_buf_align4_str[];
+    extern size_t cl_internal_fill_buf_align4_str_size;
+
+    if (pattern_size == 2) {
+      memcpy(pattern_comb, pattern, sizeof(char)*2);
+      memcpy(pattern_comb + 2, pattern, sizeof(char)*2);
+    } else {
+      pattern_comb[0] = pattern_comb[1] = pattern_comb[2]
+        = pattern_comb[3] = *(char *)pattern;
+    }
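+    /* e.g. a 1-byte pattern 0x5A becomes {0x5A, 0x5A, 0x5A, 0x5A} and a 2-byte
+     * pattern {0x12, 0x34} becomes {0x12, 0x34, 0x12, 0x34}; the fill then runs
+     * as a plain 4-byte-aligned fill with size and offset rescaled below. */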
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN4,
+               cl_internal_fill_buf_align4_str, (size_t)cl_internal_fill_buf_align4_str_size, NULL);
+    pattern_size = 4;
+    pattern = pattern_comb;
+  }
+  // TODO: for the remaining unaligned cases we may want to optimize as in
+  // cl_mem_copy, using masks in the kernels. This depends on usage; for now we
+  // just use the aligned 1- and 2-byte kernels.
+  else if (pattern_size == 2) {
+    extern char cl_internal_fill_buf_align2_str[];
+    extern size_t cl_internal_fill_buf_align2_str_size;
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN2,
+               cl_internal_fill_buf_align2_str, (size_t)cl_internal_fill_buf_align2_str_size, NULL);
+  } else if (pattern_size == 1) {
+    extern char cl_internal_fill_buf_unalign_str[];
+    extern size_t cl_internal_fill_buf_unalign_str_size;
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_UNALIGN,
+               cl_internal_fill_buf_unalign_str, (size_t)cl_internal_fill_buf_unalign_str_size, NULL);
+  } else
+    assert(0);
+
+  if (!ker)
+    return CL_OUT_OF_RESOURCES;
+
+  size = size / pattern_size;
+  offset = offset / pattern_size;
+
+  if (size < LOCAL_SZ_0) {
+    local_sz[0] = 1;
+  } else {
+    local_sz[0] = LOCAL_SZ_0;
+  }
+  global_sz[0] = ((size + LOCAL_SZ_0 - 1) / LOCAL_SZ_0) * LOCAL_SZ_0;
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &buffer);
+  cl_kernel_set_arg(ker, 1, pattern_size, pattern);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_uint), &offset);
+  cl_kernel_set_arg(ker, 3, sizeof(cl_uint), &size);
+  if (is_128)
+    cl_kernel_set_arg(ker, 4, pattern_size, pattern1);
+
+  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+  return ret;
+}
+
+LOCAL cl_int
+cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+                       const size_t *src_origin, const size_t *dst_origin, const size_t *region,
+                       size_t src_row_pitch, size_t src_slice_pitch,
+                       size_t dst_row_pitch, size_t dst_slice_pitch) {
+  cl_int ret;
+  cl_kernel ker;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_1};
+  if(region[1] == 1) local_sz[1] = 1;
+  if(region[2] == 1) local_sz[2] = 1;
+  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+  cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
+  cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
+
+  /* We use one kernel to copy the data. The kernel is lazily created. */
+  assert(src_buf->ctx == dst_buf->ctx);
+
+  /* setup the kernel and run. */
+  extern char cl_internal_copy_buf_rect_str[];
+  extern size_t cl_internal_copy_buf_rect_str_size;
+
+  ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT,
+      cl_internal_copy_buf_rect_str, (size_t)cl_internal_copy_buf_rect_str_size, NULL);
+
+  if (!ker)
+    return CL_OUT_OF_RESOURCES;
+
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
+  cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_buf);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
+  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_offset);
+  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_offset);
+  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_row_pitch);
+  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_slice_pitch);
+  cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_row_pitch);
+  cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_slice_pitch);
+
+  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+
+  return ret;
+}
+
+LOCAL cl_int
+cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image, struct _cl_mem_image* dst_image,
+                         const size_t *src_origin, const size_t *dst_origin, const size_t *region) {
+  cl_int ret;
+  cl_kernel ker = NULL;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
+  uint32_t fixupDataType;
+  uint32_t savedIntelFmt;
+
+  if(region[1] == 1) local_sz[1] = 1;
+  if(region[2] == 1) local_sz[2] = 1;
+  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+  switch (src_image->fmt.image_channel_data_type) {
+    case CL_SNORM_INT8:
+    case CL_UNORM_INT8:  fixupDataType = CL_UNSIGNED_INT8; break;
+    case CL_HALF_FLOAT:
+    case CL_SNORM_INT16:
+    case CL_UNORM_INT16: fixupDataType = CL_UNSIGNED_INT16; break;
+    case CL_FLOAT:       fixupDataType = CL_UNSIGNED_INT32; break;
+    default:
+      fixupDataType = 0;
+  }
+
+  if (fixupDataType) {
+    cl_image_format fmt;
+    if (src_image->fmt.image_channel_order != CL_BGRA)
+      fmt.image_channel_order = src_image->fmt.image_channel_order;
+    else
+      fmt.image_channel_order = CL_RGBA;
+    fmt.image_channel_data_type = fixupDataType;
+    savedIntelFmt = src_image->intel_fmt;
+    src_image->intel_fmt = cl_image_get_intel_format(&fmt);
+    dst_image->intel_fmt = src_image->intel_fmt;
+  }
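+  /* The fixup above views both images through an unsigned-integer format of the
+   * same channel size, so the copy kernel moves raw bits without any
+   * normalisation or conversion; CL_BGRA is remapped to CL_RGBA, presumably
+   * because no integer surface format exists for BGRA, and for a raw copy the
+   * channel order is irrelevant as long as source and destination use the same
+   * view. */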
+
+  /* We use one kernel to copy the data. The kernel is lazily created. */
+  assert(src_image->base.ctx == dst_image->base.ctx);
+
+  /* setup the kernel and run. */
+  if(src_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
+    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
+      extern char cl_internal_copy_image_1d_to_1d_str[];
+      extern size_t cl_internal_copy_image_1d_to_1d_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_1D_TO_1D,
+          cl_internal_copy_image_1d_to_1d_str, (size_t)cl_internal_copy_image_1d_to_1d_str_size, NULL);
+    }
+  } else if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+      extern char cl_internal_copy_image_2d_to_2d_str[];
+      extern size_t cl_internal_copy_image_2d_to_2d_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_2D,
+          cl_internal_copy_image_2d_to_2d_str, (size_t)cl_internal_copy_image_2d_to_2d_str_size, NULL);
+    } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+      extern char cl_internal_copy_image_2d_to_3d_str[];
+      extern size_t cl_internal_copy_image_2d_to_3d_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_3D,
+          cl_internal_copy_image_2d_to_3d_str, (size_t)cl_internal_copy_image_2d_to_3d_str_size, NULL);
+    } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+
+      cl_mem_copy_image_to_image(dst_origin, src_origin, region, dst_image, src_image);
+      return CL_SUCCESS;
+    }
+  } else if(src_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
+
+      cl_mem_copy_image_to_image(dst_origin, src_origin, region, dst_image, src_image);
+      return CL_SUCCESS;
+    }
+  } else if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+
+      cl_mem_copy_image_to_image(dst_origin, src_origin, region, dst_image, src_image);
+      return CL_SUCCESS;
+    } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+      cl_mem_copy_image_to_image(dst_origin, src_origin, region, dst_image, src_image);
+      return CL_SUCCESS;
+    } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+      cl_mem_copy_image_to_image(dst_origin, src_origin, region, dst_image, src_image);
+      return CL_SUCCESS;
+    }
+  } else if(src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+      extern char cl_internal_copy_image_3d_to_2d_str[];
+      extern size_t cl_internal_copy_image_3d_to_2d_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_2D,
+          cl_internal_copy_image_3d_to_2d_str, (size_t)cl_internal_copy_image_3d_to_2d_str_size, NULL);
+    } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+      extern char cl_internal_copy_image_3d_to_3d_str[];
+      extern size_t cl_internal_copy_image_3d_to_3d_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_3D,
+          cl_internal_copy_image_3d_to_3d_str, (size_t)cl_internal_copy_image_3d_to_3d_str_size, NULL);
+    } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
+      cl_mem_copy_image_to_image(dst_origin, src_origin, region, dst_image, src_image);
+      return CL_SUCCESS;
+    }
+  }
+
+  if (!ker) {
+    ret = CL_OUT_OF_RESOURCES;
+    goto fail;
+  }
+
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image);
+  cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_image);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
+  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_origin[0]);
+  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]);
+  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
+  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &dst_origin[0]);
+  cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_origin[1]);
+  cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_origin[2]);
+
+  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+
+fail:
+  if (fixupDataType) {
+    src_image->intel_fmt = savedIntelFmt;
+    dst_image->intel_fmt = savedIntelFmt;
+  }
+  return ret;
+}
+
+LOCAL cl_int
+cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image, cl_mem buffer,
+                         const size_t *src_origin, const size_t dst_offset, const size_t *region) {
+  cl_int ret;
+  cl_kernel ker = NULL;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
+  uint32_t intel_fmt, bpp;
+  cl_image_format fmt;
+  size_t origin0, region0;
+
+  if(region[1] == 1) local_sz[1] = 1;
+  if(region[2] == 1) local_sz[2] = 1;
+  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+  /* We use one kernel to copy the data. The kernel is lazily created. */
+  assert(image->base.ctx == buffer->ctx);
+
+  fmt.image_channel_order = CL_R;
+  fmt.image_channel_data_type = CL_UNSIGNED_INT8;
+  intel_fmt = image->intel_fmt;
+  bpp = image->bpp;
+  image->intel_fmt = cl_image_get_intel_format(&fmt);
+  image->w = image->w * image->bpp;
+  image->bpp = 1;
+  region0 = region[0] * bpp;
+  origin0 = src_origin[0] * bpp;
+  global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
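+  /* The image is temporarily viewed as a CL_R / CL_UNSIGNED_INT8 image whose
+   * width is scaled by bpp, so the kernel copies raw bytes into the buffer
+   * regardless of the original format; the original intel_fmt, bpp and width
+   * are restored after the launch. */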
+
+  /* setup the kernel and run. */
+  if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+      extern char cl_internal_copy_image_2d_to_buffer_str[];
+      extern size_t cl_internal_copy_image_2d_to_buffer_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER,
+          cl_internal_copy_image_2d_to_buffer_str, (size_t)cl_internal_copy_image_2d_to_buffer_str_size, NULL);
+  }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+    extern char cl_internal_copy_image_3d_to_buffer_str[];
+    extern size_t cl_internal_copy_image_3d_to_buffer_str_size;
+
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,
+          cl_internal_copy_image_3d_to_buffer_str, (size_t)cl_internal_copy_image_3d_to_buffer_str_size, NULL);
+  }
+
+  if (!ker) {
+    ret = CL_OUT_OF_RESOURCES;
+    goto fail;
+  }
+
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &image);
+  cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &buffer);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region0);
+  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
+  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]);
+  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
+  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &dst_offset);
+
+  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+
+fail:
+
+  image->intel_fmt = intel_fmt;
+  image->bpp = bpp;
+  image->w = image->w / bpp;
+
+  return ret;
+}
+
+
+LOCAL cl_int
+cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_mem_image* image,
+                         const size_t src_offset, const size_t *dst_origin, const size_t *region) {
+  cl_int ret;
+  cl_kernel ker = NULL;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
+  uint32_t intel_fmt, bpp;
+  cl_image_format fmt;
+  size_t origin0, region0;
+
+  if(region[1] == 1) local_sz[1] = 1;
+  if(region[2] == 1) local_sz[2] = 1;
+  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
+  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
+
+  /* We use one kernel to copy the data. The kernel is lazily created. */
+  assert(image->base.ctx == buffer->ctx);
+
+  fmt.image_channel_order = CL_R;
+  fmt.image_channel_data_type = CL_UNSIGNED_INT8;
+  intel_fmt = image->intel_fmt;
+  bpp = image->bpp;
+  image->intel_fmt = cl_image_get_intel_format(&fmt);
+  image->w = image->w * image->bpp;
+  image->bpp = 1;
+  region0 = region[0] * bpp;
+  origin0 = dst_origin[0] * bpp;
+  global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
+
+  /* setup the kernel and run. */
+  if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
+      extern char cl_internal_copy_buffer_to_image_2d_str[];
+      extern size_t cl_internal_copy_buffer_to_image_2d_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,
+          cl_internal_copy_buffer_to_image_2d_str, (size_t)cl_internal_copy_buffer_to_image_2d_str_size, NULL);
+  }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
+      extern char cl_internal_copy_buffer_to_image_3d_str[];
+      extern size_t cl_internal_copy_buffer_to_image_3d_str_size;
+
+      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,
+          cl_internal_copy_buffer_to_image_3d_str, (size_t)cl_internal_copy_buffer_to_image_3d_str_size, NULL);
+  }
+  if (!ker)
+    return CL_OUT_OF_RESOURCES;
+
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &image);
+  cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &buffer);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region0);
+  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
+  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
+  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
+  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_origin[1]);
+  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]);
+  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_offset);
+
+  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+
+  image->intel_fmt = intel_fmt;
+  image->bpp = bpp;
+  image->w = image->w / bpp;
+
+  return ret;
+}
+
+
+LOCAL void*
+cl_mem_map(cl_mem mem)
+{
+  cl_buffer_map(mem->bo, 1);
+  assert(cl_buffer_get_virtual(mem->bo));
+  return cl_buffer_get_virtual(mem->bo);
+}
+
+LOCAL cl_int
+cl_mem_unmap(cl_mem mem)
+{
+  cl_buffer_unmap(mem->bo);
+  return CL_SUCCESS;
+}
+
+LOCAL void*
+cl_mem_map_gtt(cl_mem mem)
+{
+  cl_buffer_map_gtt(mem->bo);
+  assert(cl_buffer_get_virtual(mem->bo));
+  mem->mapped_gtt = 1;
+  return cl_buffer_get_virtual(mem->bo);
+}
+
+LOCAL void *
+cl_mem_map_gtt_unsync(cl_mem mem)
+{
+  cl_buffer_map_gtt_unsync(mem->bo);
+  assert(cl_buffer_get_virtual(mem->bo));
+  return cl_buffer_get_virtual(mem->bo);
+}
+
+LOCAL cl_int
+cl_mem_unmap_gtt(cl_mem mem)
+{
+  cl_buffer_unmap_gtt(mem->bo);
+  return CL_SUCCESS;
+}
+
+LOCAL void*
+cl_mem_map_auto(cl_mem mem)
+{
+  if (IS_IMAGE(mem) && cl_mem_image(mem)->tiling != CL_NO_TILE)
+    return cl_mem_map_gtt(mem);
+  else
+    return cl_mem_map(mem);
+}
+
+LOCAL cl_int
+cl_mem_unmap_auto(cl_mem mem)
+{
+  if (mem->mapped_gtt == 1) {
+    cl_buffer_unmap_gtt(mem->bo);
+    mem->mapped_gtt = 0;
+  }
+  else
+    cl_buffer_unmap(mem->bo);
+  return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_mem_pin(cl_mem mem)
+{
+  assert(mem);
+  if (UNLIKELY((mem->flags & CL_MEM_PINNABLE) == 0))
+    return CL_INVALID_MEM_OBJECT;
+  cl_buffer_pin(mem->bo, 4096);
+  return CL_SUCCESS;
+}
+
+LOCAL cl_int
+cl_mem_unpin(cl_mem mem)
+{
+  assert(mem);
+  if (UNLIKELY((mem->flags & CL_MEM_PINNABLE) == 0))
+    return CL_INVALID_MEM_OBJECT;
+  cl_buffer_unpin(mem->bo);
+  return CL_SUCCESS;
+}
+
+LOCAL cl_mem cl_mem_new_libva_buffer(cl_context ctx,
+                                     unsigned int bo_name,
+                                     cl_int* errcode)
+{
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = NULL;
+
+  mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, 0, 0, CL_FALSE, &err);
+  if (mem == NULL || err != CL_SUCCESS)
+    goto error;
+
+  size_t sz = 0;
+  mem->bo = cl_buffer_get_buffer_from_libva(ctx, bo_name, &sz);
+  mem->size = sz;
+
+exit:
+  if (errcode)
+    *errcode = err;
+  return mem;
+
+error:
+  cl_mem_delete(mem);
+  mem = NULL;
+  goto exit;
+}
+
+LOCAL cl_mem cl_mem_new_libva_image(cl_context ctx,
+                                    unsigned int bo_name, size_t offset,
+                                    size_t width, size_t height,
+                                    cl_image_format fmt,
+                                    size_t row_pitch,
+                                    cl_int *errcode)
+{
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = NULL;
+  struct _cl_mem_image *image = NULL;
+  uint32_t intel_fmt, bpp;
+
+  /* Get the size of each pixel */
+  if (UNLIKELY((err = cl_image_byte_per_pixel(&fmt, &bpp)) != CL_SUCCESS))
+    goto error;
+
+  intel_fmt = cl_image_get_intel_format(&fmt);
+  if (intel_fmt == INTEL_UNSUPPORTED_FORMAT) {
+    err = CL_IMAGE_FORMAT_NOT_SUPPORTED;
+    goto error;
+  }
+
+  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, 0, 0, 0, &err);
+  if (mem == NULL || err != CL_SUCCESS) {
+    err = CL_OUT_OF_HOST_MEMORY;
+    goto error;
+  }
+
+  image = cl_mem_image(mem);
+
+  mem->bo = cl_buffer_get_image_from_libva(ctx, bo_name, image, offset);
+
+  image->w = width;
+  image->h = height;
+  image->image_type = CL_MEM_OBJECT_IMAGE2D;
+  image->depth = 2;
+  image->fmt = fmt;
+  image->intel_fmt = intel_fmt;
+  image->bpp = bpp;
+  image->row_pitch = row_pitch;
+  image->slice_pitch = 0;
+  // NOTE: tiling of image is set in cl_buffer_get_image_from_libva().
+  image->tile_x = 0;
+  image->tile_y = 0;
+  image->offset = offset;
+
+exit:
+  if (errcode)
+    *errcode = err;
+  return mem;
+
+error:
+  cl_mem_delete(mem);
+  mem = NULL;
+  goto exit;
+}
+
+LOCAL cl_int
+cl_mem_get_fd(cl_mem mem,
+              int* fd)
+{
+  cl_int err = CL_SUCCESS;
+  if (cl_buffer_get_fd(mem->bo, fd))
+    err = CL_INVALID_OPERATION;
+  return err;
+}
diff --git a/src/cl_mem.h b/src/cl_mem.h
new file mode 100644
index 0000000..3174c5c
--- /dev/null
+++ b/src/cl_mem.h
@@ -0,0 +1,290 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_MEM_H__
+#define __CL_MEM_H__
+
+#include "cl_internals.h"
+#include "cl_driver_type.h"
+#include "CL/cl.h"
+#include "cl_khr_icd.h"
+#include <assert.h>
+
+#ifndef CL_VERSION_1_2
+#define CL_MEM_OBJECT_IMAGE1D                       0x10F4
+#define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
+#define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
+#define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3
+typedef struct _cl_image_desc {
+    cl_mem_object_type      image_type;
+    size_t                  image_width;
+    size_t                  image_height;
+    size_t                  image_depth;
+    size_t                  image_array_size;
+    size_t                  image_row_pitch;
+    size_t                  image_slice_pitch;
+    cl_uint                 num_mip_levels;
+    cl_uint                 num_samples;
+    cl_mem                  buffer;
+} cl_image_desc;
+#endif
+
+typedef enum cl_image_tiling {
+  CL_NO_TILE = 0,
+  CL_TILE_X  = 1,
+  CL_TILE_Y  = 2
+} cl_image_tiling_t;
+
+typedef struct _cl_mapped_ptr {
+  void * ptr;
+  void * v_ptr;
+  size_t size;
+  size_t origin[3];  /* mapped origin */
+  size_t region[3];  /* mapped region */
+}cl_mapped_ptr;
+
+typedef struct _cl_mem_dstr_cb {
+  struct _cl_mem_dstr_cb * next;
+  void (CL_CALLBACK *pfn_notify)(cl_mem memobj, void *user_data);
+  void *user_data;
+}cl_mem_dstr_cb;
+
+/* Used for buffers and images */
+enum cl_mem_type {
+  CL_MEM_BUFFER_TYPE,
+  CL_MEM_SUBBUFFER_TYPE,
+  CL_MEM_IMAGE_TYPE,
+  CL_MEM_GL_IMAGE_TYPE,
+};
+#define IS_IMAGE(mem) (mem->type >= CL_MEM_IMAGE_TYPE)
+#define IS_GL_IMAGE(mem) (mem->type == CL_MEM_GL_IMAGE_TYPE)
+
+typedef  struct _cl_mem {
+  DEFINE_ICD(dispatch)
+  uint64_t magic;           /* To identify it as a memory object */
+  cl_mem prev, next;        /* We chain the memory buffers together */
+  enum cl_mem_type type;
+  volatile int ref_n;       /* This object is reference counted */
+  cl_buffer bo;             /* Data in GPU memory */
+  size_t size;              /* original requested size (not the aligned size); used for the constant buffer */
+  cl_context ctx;           /* Context it belongs to */
+  cl_mem_flags flags;       /* Flags specified at the creation time */
+  void * host_ptr;          /* Pointer to the host memory given with CL_MEM_USE_HOST_PTR/CL_MEM_ALLOC_HOST_PTR */
+  cl_mapped_ptr* mapped_ptr;/* Stores the addresses and sizes mapped by the caller. */
+  int mapped_ptr_sz;        /* The array size of mapped_ptr. */
+  int map_ref;              /* The mapped count. */
+  uint8_t mapped_gtt;       /* Set when the object is mapped through the GTT, so unmap uses the right path. */
+  cl_mem_dstr_cb *dstr_cb;  /* The destructor callback list. */
+} _cl_mem;
+
+struct _cl_mem_image {
+  _cl_mem base;
+  cl_image_format fmt;            /* only for images */
+  uint32_t intel_fmt;             /* format to provide in the surface state */
+  uint32_t bpp;                   /* number of bytes per pixel */
+  cl_mem_object_type image_type;  /* only for images 1D/2D...*/
+  size_t w, h, depth;             /* only for images (depth is only for 3D images) */
+  size_t row_pitch, slice_pitch;
+  size_t host_row_pitch, host_slice_pitch;
+  cl_image_tiling_t tiling;       /* only IVB+ supports TILE_[X,Y] (image only) */
+  size_t tile_x, tile_y;          /* tile offset, used for mipmap images.  */
+  size_t offset;                  /* offset into the dri_bo, used when it is relocated. */
+  cl_mem buffer_1d;               /* if the image was created from a buffer, this points to that buffer. */
+};
+
+struct _cl_mem_gl_image {
+  struct _cl_mem_image base;
+  uint32_t target;
+  int      miplevel;
+  uint32_t texture;
+};
+
+inline static void
+cl_mem_image_init(struct _cl_mem_image *image, size_t w, size_t h,
+                  cl_mem_object_type image_type,
+                  size_t depth, cl_image_format fmt,
+                  uint32_t intel_fmt, uint32_t bpp,
+                  size_t row_pitch, size_t slice_pitch,
+                  cl_image_tiling_t tiling,
+                  size_t tile_x, size_t tile_y,
+                  size_t offset)
+{
+  image->w = w;
+  image->h = h;
+  image->image_type = image_type;
+  image->depth = depth;
+  image->fmt = fmt;
+  image->intel_fmt = intel_fmt;
+  image->bpp = bpp;
+  image->row_pitch = row_pitch;
+  image->slice_pitch = slice_pitch;
+  image->tiling = tiling;
+  image->tile_x = tile_x;
+  image->tile_y = tile_y;
+  image->offset = offset;
+}
+
+struct _cl_mem_buffer {
+  _cl_mem base;
+  struct _cl_mem_buffer* subs;         /* List of sub-buffer objects. */
+  size_t sub_offset;                   /* Start offset of this sub-buffer. */
+  struct _cl_mem_buffer* sub_prev, *sub_next;/* We chain the sub-buffers together */
+  pthread_mutex_t sub_lock;            /* Lock protecting the sub-buffer list */
+  struct _cl_mem_buffer* parent;       /* Points to the parent buffer if this is a sub-buffer */
+};
+
+inline static struct _cl_mem_image *
+cl_mem_image(cl_mem mem)
+{
+  assert(IS_IMAGE(mem));
+  return (struct _cl_mem_image *)mem;
+}
+
+inline static struct _cl_mem_gl_image *
+cl_mem_gl_image(cl_mem mem)
+{
+  assert(IS_GL_IMAGE(mem));
+  return (struct _cl_mem_gl_image*)mem;
+}
+
+inline static struct _cl_mem_buffer *
+cl_mem_buffer(cl_mem mem)
+{
+  assert(!IS_IMAGE(mem));
+  return (struct _cl_mem_buffer *)mem;
+}
+
+/* Query information about a memory object */
+extern cl_int cl_get_mem_object_info(cl_mem, cl_mem_info, size_t, void *, size_t *);
+
+/* Query information about an image */
+extern cl_int cl_get_image_info(cl_mem, cl_image_info, size_t, void *, size_t *);
+
+/* Query whether mem is in buffers */
+extern cl_int is_valid_mem(cl_mem mem, cl_mem buffers);
+
+/* Create a new memory object and initialize it with possible user data */
+extern cl_mem cl_mem_new_buffer(cl_context, cl_mem_flags, size_t, void*, cl_int*);
+
+/* Create a new sub memory object */
+extern cl_mem cl_mem_new_sub_buffer(cl_mem, cl_mem_flags, cl_buffer_create_type, const void *, cl_int *);
+
+/* Idem but this is an image */
+extern cl_mem
+cl_mem_new_image(cl_context context,
+                 cl_mem_flags flags,
+                 const cl_image_format *image_format,
+                 const cl_image_desc *image_desc,
+                 void *host_ptr,
+                 cl_int *errcode_ret);
+
+/* Unref the object and delete it if no more reference */
+extern void cl_mem_delete(cl_mem);
+
+/* Destroy egl image. */
+extern void cl_mem_gl_delete(struct _cl_mem_gl_image *);
+
+/* Add one more reference to this object */
+extern void cl_mem_add_ref(cl_mem);
+
+/* api clEnqueueCopyBuffer helper function */
+extern cl_int cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
+              size_t src_offset, size_t dst_offset, size_t cb);
+
+extern cl_int cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
+              cl_mem buffer, size_t offset, size_t size);
+
+extern cl_int cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image*,
+                                    const size_t *, const size_t *);
+
+/* api clEnqueueCopyBufferRect helper function */
+extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_mem, cl_mem,
+                                     const size_t *, const size_t *, const size_t *,
+                                     size_t, size_t, size_t, size_t);
+
+/* api clEnqueueCopyImage helper function */
+extern cl_int cl_mem_kernel_copy_image(cl_command_queue, struct _cl_mem_image*, struct _cl_mem_image*,
+                                       const size_t *, const size_t *, const size_t *);
+
+/* api clEnqueueCopyImageToBuffer helper function */
+extern cl_int cl_mem_copy_image_to_buffer(cl_command_queue, struct _cl_mem_image*, cl_mem,
+                                          const size_t *, const size_t, const size_t *);
+
+/* api clEnqueueCopyBufferToImage helper function */
+extern cl_int cl_mem_copy_buffer_to_image(cl_command_queue, cl_mem, struct _cl_mem_image*,
+                                          const size_t, const size_t *, const size_t *);
+
+/* Directly map a memory object */
+extern void *cl_mem_map(cl_mem);
+
+/* Unmap a memory object */
+extern cl_int cl_mem_unmap(cl_mem);
+
+/* Directly map a memory object in GTT mode */
+extern void *cl_mem_map_gtt(cl_mem);
+
+/* Directly map a memory object in GTT mode, without waiting for the GPU to go idle */
+extern void *cl_mem_map_gtt_unsync(cl_mem);
+
+/* Unmap a memory object in GTT mode */
+extern cl_int cl_mem_unmap_gtt(cl_mem);
+
+/* Directly map a memory object - tiled images are mapped in GTT mode */
+extern void *cl_mem_map_auto(cl_mem);
+
+/* Unmap a memory object - tiled images are unmapped in GTT mode */
+extern cl_int cl_mem_unmap_auto(cl_mem);
+
+/* Pin/unpin the buffer in memory (you must be root) */
+extern cl_int cl_mem_pin(cl_mem);
+extern cl_int cl_mem_unpin(cl_mem);
+
+extern cl_mem
+cl_mem_allocate(enum cl_mem_type type,
+                cl_context ctx,
+                cl_mem_flags flags,
+                size_t sz,
+                cl_int is_tiled,
+                cl_int *errcode);
+
+void
+cl_mem_copy_image_region(const size_t *origin, const size_t *region,
+                         void *dst, size_t dst_row_pitch, size_t dst_slice_pitch,
+                         const void *src, size_t src_row_pitch, size_t src_slice_pitch,
+                         const struct _cl_mem_image *image, cl_bool offset_dst, cl_bool offset_src);
+
+void
+cl_mem_copy_image_to_image(const size_t *dst_origin,const size_t *src_origin, const size_t *region,
+                           const struct _cl_mem_image *dst_image, const struct _cl_mem_image *src_image);
+
+extern cl_mem cl_mem_new_libva_buffer(cl_context ctx,
+                                      unsigned int bo_name,
+                                      cl_int *errcode);
+
+extern cl_mem cl_mem_new_libva_image(cl_context ctx,
+                                     unsigned int bo_name, size_t offset,
+                                     size_t width, size_t height,
+                                     cl_image_format fmt,
+                                     size_t row_pitch,
+                                     cl_int *errcode);
+extern cl_int cl_mem_get_fd(cl_mem mem, int* fd);
+
+
+#endif /* __CL_MEM_H__ */
+
diff --git a/src/cl_mem_gl.c b/src/cl_mem_gl.c
new file mode 100644
index 0000000..28d2ac6
--- /dev/null
+++ b/src/cl_mem_gl.c
@@ -0,0 +1,97 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Zhigang Gong <zhigang.gong at intel.com>
+ */
+#include <GL/gl.h>
+#include <GL/glext.h>
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#include <assert.h>
+#include <stdio.h>
+
+#include "cl_mem.h"
+#include "cl_image.h"
+#include "cl_context.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_device_id.h"
+#include "cl_driver.h"
+#include "cl_platform_id.h"
+#include "cl_mem_gl.h"
+
+#include "CL/cl.h"
+#include "CL/cl_intel.h"
+#include "CL/cl_gl.h"
+
+
+LOCAL cl_mem
+cl_mem_new_gl_buffer(cl_context ctx,
+                     cl_mem_flags flags,
+                     GLuint buf_obj,
+                     cl_int *errcode_ret)
+{
+  NOT_IMPLEMENTED;
+}
+
+LOCAL cl_mem
+cl_mem_new_gl_texture(cl_context ctx,
+                      cl_mem_flags flags,
+                      GLenum texture_target,
+                      GLint miplevel,
+                      GLuint texture,
+                      cl_int *errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = NULL;
+  /* Check flags consistency */
+  if (UNLIKELY(flags & CL_MEM_COPY_HOST_PTR)) {
+    err = CL_INVALID_ARG_VALUE;
+    goto error;
+  }
+
+  mem = cl_mem_allocate(CL_MEM_GL_IMAGE_TYPE, ctx, flags, 0, 0, &err);
+  if (mem == NULL || err != CL_SUCCESS)
+    goto error;
+
+  mem->bo = cl_buffer_alloc_from_texture(ctx, texture_target, miplevel,
+                                         texture, cl_mem_image(mem));
+  if (UNLIKELY(mem->bo == NULL)) {
+    err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+    goto error;
+  }
+
+  cl_mem_gl_image(mem)->target = texture_target;
+  cl_mem_gl_image(mem)->miplevel = miplevel;
+  cl_mem_gl_image(mem)->texture = texture;
+
+exit:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return mem;
+error:
+  cl_mem_delete(mem);
+  mem = NULL;
+  goto exit;
+
+}
+
+LOCAL void cl_mem_gl_delete(struct _cl_mem_gl_image *gl_image)
+{
+  if (gl_image->base.base.bo != NULL)
+    cl_buffer_release_from_texture(gl_image->base.base.ctx, gl_image->target,
+                                   gl_image->miplevel, gl_image->texture);
+}
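+
+/* A minimal usage sketch for cl_mem_new_gl_texture above, assuming a context
+ * created with GL sharing enabled and an existing GL texture id `tex`
+ * (illustration only):
+ *
+ *   cl_int err = CL_SUCCESS;
+ *   cl_mem image = cl_mem_new_gl_texture(ctx, CL_MEM_READ_WRITE,
+ *                                        GL_TEXTURE_2D, 0, tex, &err);
+ *   if (image != NULL && err == CL_SUCCESS) {
+ *     // ... use the image ...
+ *     cl_mem_delete(image);   // drop the reference when done
+ *   }
+ */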
diff --git a/src/cl_mem_gl.h b/src/cl_mem_gl.h
new file mode 100644
index 0000000..717ccfb
--- /dev/null
+++ b/src/cl_mem_gl.h
@@ -0,0 +1,17 @@
+#ifndef __CL_MEM_GL_H__
+#define __CL_MEM_GL_H__
+#include "cl_mem.h"
+
+cl_mem cl_mem_new_gl_buffer(cl_context ctx,
+                            cl_mem_flags flags,
+                            GLuint buf_obj,
+                            cl_int *errcode_ret);
+
+cl_mem cl_mem_new_gl_texture(cl_context ctx,
+                             cl_mem_flags flags,
+                             GLenum texture_target,
+                             GLint miplevel,
+                             GLuint texture,
+                             cl_int *errcode_ret);
+
+#endif
diff --git a/src/cl_platform_id.c b/src/cl_platform_id.c
new file mode 100644
index 0000000..e7c8d6a
--- /dev/null
+++ b/src/cl_platform_id.c
@@ -0,0 +1,112 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_platform_id.h"
+#include "cl_internals.h"
+#include "cl_utils.h"
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#define DECL_INFO_STRING(FIELD, STRING) \
+    .FIELD = STRING,                    \
+    .JOIN(FIELD,_sz) = sizeof(STRING),
+
+static struct _cl_platform_id intel_platform_data = {
+  INIT_ICD(dispatch)
+  DECL_INFO_STRING(profile, "FULL_PROFILE")
+  DECL_INFO_STRING(version, LIBCL_VERSION_STRING)
+  DECL_INFO_STRING(name, "Intel Gen OCL Driver")
+  DECL_INFO_STRING(vendor, "Intel")
+  DECL_INFO_STRING(icd_suffix_khr, "Intel")
+};
+
+#undef DECL_INFO_STRING
+
+/* Intel platform (only GPU now) */
+cl_platform_id const intel_platform = &intel_platform_data;
+
+LOCAL cl_int
+cl_get_platform_ids(cl_uint          num_entries,
+                    cl_platform_id * platforms,
+                    cl_uint *        num_platforms)
+{
+  if (num_platforms != NULL)
+    *num_platforms = 1;
+
+  cl_intel_platform_extension_init(intel_platform);
+  /* Easy right now, only one platform is supported */
+  if(platforms)
+    *platforms = intel_platform;
+  intel_platform->extensions_sz = strlen(intel_platform->extensions) + 1;
+  return CL_SUCCESS;
+}
+
+#define DECL_FIELD(CASE,FIELD)                                  \
+  case JOIN(CL_,CASE):                                          \
+    if (param_value_size < intel_platform->JOIN(FIELD,_sz))     \
+      return CL_INVALID_VALUE;                                  \
+    if (param_value_size_ret != NULL)                           \
+      *param_value_size_ret = intel_platform->JOIN(FIELD,_sz);  \
+    memcpy(param_value,                                         \
+           intel_platform->FIELD,                               \
+           intel_platform->JOIN(FIELD,_sz));                    \
+      return CL_SUCCESS;
+
+#define GET_FIELD_SZ(CASE,FIELD)                                \
+  case JOIN(CL_,CASE):                                          \
+    if (param_value_size_ret != NULL)                           \
+      *param_value_size_ret = intel_platform->JOIN(FIELD,_sz);  \
+    return CL_SUCCESS;
+
+LOCAL cl_int
+cl_get_platform_info(cl_platform_id    platform,
+                     cl_platform_info  param_name,
+                     size_t            param_value_size,
+                     void *            param_value,
+                     size_t *          param_value_size_ret)
+{
+  if (param_value == NULL) {
+    switch (param_name) {
+      GET_FIELD_SZ (PLATFORM_PROFILE,    profile);
+      GET_FIELD_SZ (PLATFORM_VERSION,    version);
+      GET_FIELD_SZ (PLATFORM_NAME,       name);
+      GET_FIELD_SZ (PLATFORM_VENDOR,     vendor);
+      GET_FIELD_SZ (PLATFORM_EXTENSIONS, extensions);
+      GET_FIELD_SZ (PLATFORM_ICD_SUFFIX_KHR, icd_suffix_khr);
+      default: return CL_INVALID_VALUE;
+    }
+  }
+
+  /* Fetch the requested platform info */
+  switch (param_name) {
+    DECL_FIELD (PLATFORM_PROFILE,    profile);
+    DECL_FIELD (PLATFORM_VERSION,    version);
+    DECL_FIELD (PLATFORM_NAME,       name);
+    DECL_FIELD (PLATFORM_VENDOR,     vendor);
+    DECL_FIELD (PLATFORM_EXTENSIONS, extensions);
+    DECL_FIELD (PLATFORM_ICD_SUFFIX_KHR, icd_suffix_khr);
+    default: return CL_INVALID_VALUE;
+  }
+}
+
+#undef DECL_FIELD
+
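+/* A minimal usage sketch of the two query functions above, following the
+ * usual OpenCL two-step pattern (ask for the size, then fetch the value);
+ * illustration only:
+ *
+ *   cl_platform_id platform = NULL;
+ *   cl_uint n = 0;
+ *   cl_get_platform_ids(1, &platform, &n);
+ *
+ *   size_t sz = 0;
+ *   cl_get_platform_info(platform, CL_PLATFORM_NAME, 0, NULL, &sz);
+ *   char *name = malloc(sz);
+ *   if (name != NULL) {
+ *     cl_get_platform_info(platform, CL_PLATFORM_NAME, sz, name, NULL);
+ *     free(name);
+ *   }
+ */
+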
diff --git a/src/cl_platform_id.h b/src/cl_platform_id.h
new file mode 100644
index 0000000..c7c716e
--- /dev/null
+++ b/src/cl_platform_id.h
@@ -0,0 +1,72 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_PLATFORM_ID_H__
+#define __CL_PLATFORM_ID_H__
+
+#include "cl_internals.h"
+#include "cl_extensions.h"
+#include "cl_khr_icd.h"
+#include "CL/cl.h"
+
+#include "src/OCLConfig.h"
+
+struct _cl_platform_id {
+  DEFINE_ICD(dispatch)
+  const char *profile;
+  const char *version;
+  const char *name;
+  const char *vendor;
+  char *extensions;
+  const char *icd_suffix_khr;
+  size_t profile_sz;
+  size_t version_sz;
+  size_t name_sz;
+  size_t vendor_sz;
+  size_t extensions_sz;
+  size_t icd_suffix_khr_sz;
+  struct cl_extensions *internal_extensions;
+};
+
+/* Platform implemented by this run-time */
+extern cl_platform_id const intel_platform;
+
+/* Return the valid platform */
+extern cl_int cl_get_platform_ids(cl_uint          num_entries,
+                                  cl_platform_id * platforms,
+                                  cl_uint *        num_platforms);
+
+/* Return information for the current platform */
+extern cl_int cl_get_platform_info(cl_platform_id    platform,
+                                   cl_platform_info  param_name,
+                                   size_t            param_value_size,
+                                   void *            param_value,
+                                   size_t *          param_value_size_ret);
+
+#define _STR(x) #x
+#define _JOINT(x, y) _STR(x) "." _STR(y)
+#define _JOINT3(x, y, z) _STR(x) "." _STR(y) "." _STR(z)
+
+
+#define LIBCL_DRIVER_VERSION_STRING _JOINT3(LIBCL_DRIVER_VERSION_MAJOR, LIBCL_DRIVER_VERSION_MINOR, LIBCL_DRIVER_VERSION_PATCH)
+#define LIBCL_VERSION_STRING "OpenCL " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR) " beignet " LIBCL_DRIVER_VERSION_STRING
+#define LIBCL_C_VERSION_STRING "OpenCL C " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR) " beignet " LIBCL_DRIVER_VERSION_STRING
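+
+/* For illustration, with hypothetical values LIBCL_C_VERSION_MAJOR=1,
+ * LIBCL_C_VERSION_MINOR=2 and driver version 0.9.3, the macros above expand to:
+ *   LIBCL_VERSION_STRING   -> "OpenCL 1.2 beignet 0.9.3"
+ *   LIBCL_C_VERSION_STRING -> "OpenCL C 1.2 beignet 0.9.3"
+ */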
+
+#endif /* __CL_PLATFORM_ID_H__ */
+
diff --git a/src/cl_program.c b/src/cl_program.c
new file mode 100644
index 0000000..79dff34
--- /dev/null
+++ b/src/cl_program.c
@@ -0,0 +1,851 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_kernel.h"
+#include "cl_program.h"
+#include "cl_device_id.h"
+#include "cl_context.h"
+#include "cl_alloc.h"
+#include "cl_utils.h"
+#include "cl_khr_icd.h"
+#include "cl_gbe_loader.h"
+#include "CL/cl.h"
+#include "CL/cl_intel.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <libgen.h>
+
+static void
+cl_program_release_sources(cl_program p)
+{
+  if (p->source) {
+    cl_free(p->source);
+    p->source = NULL;
+  }
+}
+
+static void
+cl_program_release_binary(cl_program p)
+{
+  if (p->binary) {
+    cl_free(p->binary);
+    p->binary = NULL;
+  }
+}
+
+LOCAL void
+cl_program_delete(cl_program p)
+{
+  uint32_t ref, i;
+
+  if (p == NULL)
+    return;
+
+  /* We are not done with it yet */
+  if ((ref = atomic_dec(&p->ref_n)) > 1) return;
+
+  /* Destroy the sources and binary if still allocated */
+  cl_program_release_sources(p);
+  cl_program_release_binary(p);
+
+  /* Release the build options. */
+  if (p->build_opts) {
+    cl_free(p->build_opts);
+    p->build_opts = NULL;
+  }
+
+  if (p->build_log) {
+    free(p->build_log);
+    p->build_log = NULL;
+  }
+
+  /* Remove it from the list */
+  assert(p->ctx);
+  pthread_mutex_lock(&p->ctx->program_lock);
+    if (p->prev)
+      p->prev->next = p->next;
+    if (p->next)
+      p->next->prev = p->prev;
+    if (p->ctx->programs == p)
+      p->ctx->programs = p->next;
+  pthread_mutex_unlock(&p->ctx->program_lock);
+
+  cl_free(p->bin);               /* Free the blob */
+  for (i = 0; i < p->ker_n; ++i) /* Free the kernels */
+    cl_kernel_delete(p->ker[i]);
+  cl_free(p->ker);
+
+  /* Programs belong to their parent context */
+  cl_context_delete(p->ctx);
+
+  /* Free the program as allocated by the compiler */
+  if (p->opaque) {
+    if (CompilerSupported())
+      compiler_program_clean_llvm_resource(p->opaque);
+    interp_program_delete(p->opaque);
+  }
+
+  p->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
+  cl_free(p);
+}
+
+LOCAL cl_program
+cl_program_new(cl_context ctx)
+{
+  cl_program p = NULL;
+
+  /* Allocate the structure */
+  TRY_ALLOC_NO_ERR (p, CALLOC(struct _cl_program));
+  SET_ICD(p->dispatch)
+  p->build_status = CL_BUILD_NONE;
+  p->ref_n = 1;
+  p->magic = CL_MAGIC_PROGRAM_HEADER;
+  p->ctx = ctx;
+  p->build_log = calloc(1000, sizeof(char));
+  if (p->build_log)
+    p->build_log_max_sz = 1000;
+  /* The program also belongs to its context */
+  cl_context_add_ref(ctx);
+
+exit:
+  return p;
+error:
+  cl_program_delete(p);
+  goto exit;
+}
+
+LOCAL void
+cl_program_add_ref(cl_program p)
+{
+  assert(p);
+  atomic_inc(&p->ref_n);
+}
+
+static cl_int
+cl_program_load_gen_program(cl_program p)
+{
+  cl_int err = CL_SUCCESS;
+  uint32_t i;
+
+  assert(p->opaque != NULL);
+  p->ker_n = interp_program_get_kernel_num(p->opaque);
+
+  /* Allocate the kernel array */
+  TRY_ALLOC (p->ker, CALLOC_ARRAY(cl_kernel, p->ker_n));
+
+  for (i = 0; i < p->ker_n; ++i) {
+    const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
+    assert(opaque != NULL);
+    TRY_ALLOC (p->ker[i], cl_kernel_new(p));
+    cl_kernel_setup(p->ker[i], opaque);
+  }
+
+error:
+  return err;
+}
+
+inline cl_bool isBitcodeWrapper(const unsigned char *BufPtr, const unsigned char *BufEnd)
+{
+  // See if you can find the hidden message in the magic bytes :-).
+  // (Hint: it's a little-endian encoding.)
+  return BufPtr != BufEnd &&
+    BufPtr[0] == 0xDE &&
+    BufPtr[1] == 0xC0 &&
+    BufPtr[2] == 0x17 &&
+    BufPtr[3] == 0x0B;
+}
+
+inline cl_bool isRawBitcode(const unsigned char *BufPtr, const unsigned char *BufEnd)
+{
+  // These bytes sort of have a hidden message, but it's not in
+  // little-endian this time, and it's a little redundant.
+  return BufPtr != BufEnd &&
+    BufPtr[0] == 'B' &&
+    BufPtr[1] == 'C' &&
+    BufPtr[2] == 0xc0 &&
+    BufPtr[3] == 0xde;
+}
+
+#define isBitcode(BufPtr,BufEnd)  (isBitcodeWrapper(BufPtr, BufEnd) || isRawBitcode(BufPtr, BufEnd))
+
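+/* A minimal check using the helpers above on a raw LLVM bitcode header
+ * (illustrative stack buffer):
+ *
+ *   const unsigned char buf[] = { 'B', 'C', 0xc0, 0xde };
+ *   assert(isBitcode(buf, buf + sizeof(buf)));
+ */
+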
+LOCAL cl_program
+cl_program_create_from_binary(cl_context             ctx,
+                              cl_uint                num_devices,
+                              const cl_device_id *   devices,
+                              const size_t *         lengths,
+                              const unsigned char ** binaries,
+                              cl_int *               binary_status,
+                              cl_int *               errcode_ret)
+{
+  cl_program program = NULL;
+  cl_int err = CL_SUCCESS;
+
+  assert(ctx);
+  INVALID_DEVICE_IF (num_devices != 1);
+  INVALID_DEVICE_IF (devices == NULL);
+  INVALID_DEVICE_IF (devices[0] != ctx->device);
+  INVALID_VALUE_IF (binaries == NULL);
+  INVALID_VALUE_IF (lengths == NULL);
+
+  if (binaries[0] == NULL) {
+    err = CL_INVALID_VALUE;
+    if (binary_status)
+      binary_status[0] = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (lengths[0] == 0) {
+    err = CL_INVALID_VALUE;
+    if (binary_status)
+      binary_status[0] = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  program = cl_program_new(ctx);
+
+  // TODO:  Need to check the binary format here to return CL_INVALID_BINARY.
+  TRY_ALLOC(program->binary, cl_calloc(lengths[0], sizeof(char)));
+  memcpy(program->binary, binaries[0], lengths[0]);
+  program->binary_sz = lengths[0];
+  program->source_type = FROM_BINARY;
+
+  if(isBitcode((unsigned char*)program->binary+1, (unsigned char*)program->binary+program->binary_sz)) {
+    if(*program->binary == 1){
+      program->binary_type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
+    }else if(*program->binary == 2){
+      program->binary_type = CL_PROGRAM_BINARY_TYPE_LIBRARY;
+    }else{
+      err= CL_INVALID_BINARY;
+      goto error;
+    }
+    program->opaque = compiler_program_new_from_llvm_binary(program->ctx->device->vendor_id, program->binary, program->binary_sz);
+
+    if (UNLIKELY(program->opaque == NULL)) {
+      err = CL_INVALID_PROGRAM;
+      goto error;
+    }
+    program->source_type = FROM_LLVM;
+  }
+  else if (*program->binary == 0) {
+    program->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+  }
+
+  if (binary_status)
+    binary_status[0] = CL_SUCCESS;
+
+exit:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return program;
+error:
+  cl_program_delete(program);
+  program = NULL;
+  goto exit;
+}
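+
+/* As implemented above, a binary passed to cl_program_create_from_binary is
+ * tagged by its first byte: 1 marks an LLVM compiled object, 2 an LLVM
+ * library, and 0 a final GEN executable; in the LLVM cases the bitcode itself
+ * starts at offset 1. */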
+
+LOCAL cl_program
+cl_program_create_with_built_in_kernles(cl_context     ctx,
+                                  cl_uint              num_devices,
+                                  const cl_device_id * devices,
+                                  const char *         kernel_names,
+                                  cl_int *             errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+
+  assert(ctx);
+  INVALID_DEVICE_IF (num_devices != 1);
+  INVALID_DEVICE_IF (devices == NULL);
+  INVALID_DEVICE_IF (devices[0] != ctx->device);
+
+  cl_int binary_status = CL_SUCCESS;
+  extern char cl_internal_built_in_kernel_str[];
+  extern size_t cl_internal_built_in_kernel_str_size;
+  char* p_built_in_kernel_str =cl_internal_built_in_kernel_str;
+
+  ctx->built_in_prgs = cl_program_create_from_binary(ctx, 1,
+                                                          &ctx->device,
+                                                          (size_t*)&cl_internal_built_in_kernel_str_size,
+                                                          (const unsigned char **)&p_built_in_kernel_str,
+                                                          &binary_status, &err);
+  if (!ctx->built_in_prgs)
+    return NULL;
+
+  err = cl_program_build(ctx->built_in_prgs, NULL);
+  if (err != CL_SUCCESS)
+    return NULL;
+
+  ctx->built_in_prgs->is_built = 1;
+
+  char delims[] = ";";
+  char* saveptr = NULL;
+  char* local_kernel_names;
+  char* kernel = NULL;
+  char* matched_kernel;
+  int i = 0;
+
+  // Copy the content into local_kernel_names so that kernel_names is not modified.
+  TRY_ALLOC(local_kernel_names, cl_calloc(strlen(kernel_names)+1, sizeof(char) ) );
+  memcpy(local_kernel_names, kernel_names, strlen(kernel_names)+1);
+
+  kernel = strtok_r( local_kernel_names, delims , &saveptr);
+  while( kernel != NULL ) {
+    matched_kernel = strstr(ctx->device->built_in_kernels, kernel);
+    if(matched_kernel){
+      for (i = 0; i < ctx->built_in_prgs->ker_n; ++i) {
+        assert(ctx->built_in_prgs->ker[i]);
+        const char *ker_name = cl_kernel_get_name(ctx->built_in_prgs->ker[i]);
+        if (strcmp(ker_name, kernel) == 0) {
+          break;
+        }
+      }
+
+      ctx->built_in_kernels[i] = cl_program_create_kernel(ctx->built_in_prgs, kernel, NULL);
+    }
+    kernel = strtok_r((char*)saveptr , delims, &saveptr );
+  }
+
+  cl_free(local_kernel_names);
+
+exit:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return ctx->built_in_prgs;
+error:
+  goto exit;
+}
+
+LOCAL cl_program
+cl_program_create_from_llvm(cl_context ctx,
+                            cl_uint num_devices,
+                            const cl_device_id *devices,
+                            const char *file_name,
+                            cl_int *errcode_ret)
+{
+  cl_program program = NULL;
+  cl_int err = CL_SUCCESS;
+
+  assert(ctx);
+  INVALID_DEVICE_IF (num_devices != 1);
+  INVALID_DEVICE_IF (devices == NULL);
+  INVALID_DEVICE_IF (devices[0] != ctx->device);
+  INVALID_VALUE_IF (file_name == NULL);
+
+  program = cl_program_new(ctx);
+  program->opaque = compiler_program_new_from_llvm(ctx->device->vendor_id, file_name, NULL, NULL, program->build_log_max_sz, program->build_log, &program->build_log_sz, 1);
+  if (UNLIKELY(program->opaque == NULL)) {
+    err = CL_INVALID_PROGRAM;
+    goto error;
+  }
+
+  /* Create all the kernels */
+  TRY (cl_program_load_gen_program, program);
+  program->source_type = FROM_LLVM;
+
+exit:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return program;
+error:
+  cl_program_delete(program);
+  program = NULL;
+  goto exit;
+}
+
+LOCAL cl_program
+cl_program_create_from_source(cl_context ctx,
+                              cl_uint count,
+                              const char **strings,
+                              const size_t *lengths,
+                              cl_int *errcode_ret)
+
+{
+  cl_program program = NULL;
+  cl_int err = CL_SUCCESS;
+  cl_uint i;
+  int32_t * lens = NULL;
+  int32_t len_total = 0;
+  assert(ctx);
+  char * p = NULL;
+  // The real compilation step is deferred to build time, since the
+  // compilation options are not known yet.
+  program = cl_program_new(ctx);
+  TRY_ALLOC (lens, cl_calloc(count, sizeof(int32_t)));
+  for (i = 0; i < (int) count; ++i) {
+    size_t len;
+    if (lengths == NULL || lengths[i] == 0)
+      len = strlen(strings[i]);
+    else
+      len = lengths[i];
+    lens[i] = len;
+    len_total += len;
+  }
+  TRY_ALLOC(program->source, cl_calloc(len_total+1, sizeof(char)));
+  p = program->source;
+  for (i = 0; i < (int) count; ++i) {
+    memcpy(p, strings[i], lens[i]);
+    p += lens[i];
+  }
+  *p = '\0';
+
+  program->source_type = FROM_SOURCE;
+  program->binary_type = CL_PROGRAM_BINARY_TYPE_NONE;
+
+exit:
+  cl_free(lens);
+  lens = NULL;
+  if (errcode_ret)
+    *errcode_ret = err;
+  return program;
+error:
+  cl_program_delete(program);
+  program = NULL;
+  goto exit;
+}
+
+/* Before we do the real work, check whether the platform's OpenCL C version
+   can satisfy a -cl-std= build option. */
+static int check_cl_version_option(cl_program p, const char* options) {
+  const char* s = NULL;
+  int ver1 = 0;
+  int ver2 = 0;
+  char version_str[64];
+
+  if (options && (s = strstr(options, "-cl-std="))) {
+
+    if (s + strlen("-cl-std=CLX.X") > options + strlen(options)) {
+      return 0;
+    }
+
+    if (s[8] != 'C' || s[9] != 'L' || s[10] > '9' || s[10] < '0' || s[11] != '.'
+        || s[12] > '9' || s[12] < '0') {
+      return 0;
+    }
+
+    ver1 = (s[10] - '0') * 10 + (s[12] - '0');
+
+    if (cl_get_device_info(p->ctx->device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version_str),
+                                  version_str, NULL) != CL_SUCCESS)
+      return 0;
+
+    assert(strstr(version_str, "OpenCL") && version_str[0] == 'O');
+    ver2 = (version_str[9] - '0') * 10 + (version_str[11] - '0');
+
+    if (ver2 < ver1)
+      return 0;
+
+    return 1;
+  }
+
+  return 1;
+}
+
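+/* For example, with the build option "-cl-std=CL1.2" the check above accepts
+ * the program only if the device reports an OpenCL C version of 1.2 or
+ * higher. */
+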
+LOCAL cl_int
+cl_program_build(cl_program p, const char *options)
+{
+  cl_int err = CL_SUCCESS;
+  int i = 0;
+  int copyed = 0;
+
+  if (p->ref_n > 1) {
+    err = CL_INVALID_OPERATION;
+    goto error;
+  }
+
+  if (!check_cl_version_option(p, options)) {
+    err = CL_BUILD_PROGRAM_FAILURE;
+    goto error;
+  }
+  if (options) {
+    if(p->build_opts == NULL || strcmp(options, p->build_opts) != 0) {
+      if(p->build_opts) {
+        cl_free(p->build_opts);
+        p->build_opts = NULL;
+      }
+      TRY_ALLOC (p->build_opts, cl_calloc(strlen(options) + 1, sizeof(char)));
+      memcpy(p->build_opts, options, strlen(options));
+
+      p->source_type = p->source ? FROM_SOURCE : p->binary ? FROM_BINARY : FROM_LLVM;
+    }
+  }
+
+  if (options == NULL && p->build_opts) {
+    p->source_type = p->source ? FROM_SOURCE : p->binary ? FROM_BINARY : FROM_LLVM;
+
+    cl_free(p->build_opts);
+    p->build_opts = NULL;
+  }
+
+  if (p->source_type == FROM_SOURCE) {
+    if (!CompilerSupported()) {
+      err = CL_COMPILER_NOT_AVAILABLE;
+      goto error;
+    }
+
+    p->opaque = compiler_program_new_from_source(p->ctx->device->vendor_id, p->source, p->build_log_max_sz, options, p->build_log, &p->build_log_sz);
+    if (UNLIKELY(p->opaque == NULL)) {
+      if (p->build_log_sz > 0 && strstr(p->build_log, "error: error reading 'options'"))
+        err = CL_INVALID_BUILD_OPTIONS;
+      else
+        err = CL_BUILD_PROGRAM_FAILURE;
+      goto error;
+    }
+
+    /* Create all the kernels */
+    TRY (cl_program_load_gen_program, p);
+  } else if (p->source_type == FROM_LLVM) {
+    if (!CompilerSupported()) {
+      err = CL_COMPILER_NOT_AVAILABLE;
+      goto error;
+    }
+
+    compiler_program_build_from_llvm(p->opaque, p->build_log_max_sz, p->build_log, &p->build_log_sz, options);
+    if (UNLIKELY(p->opaque == NULL)) {
+      if (p->build_log_sz > 0 && strstr(p->build_log, "error: error reading 'options'"))
+        err = CL_INVALID_BUILD_OPTIONS;
+      else
+        err = CL_BUILD_PROGRAM_FAILURE;
+      goto error;
+    }
+    /* Create all the kernels */
+    TRY (cl_program_load_gen_program, p);
+  } else if (p->source_type == FROM_BINARY) {
+    p->opaque = interp_program_new_from_binary(p->ctx->device->vendor_id, p->binary, p->binary_sz);
+    if (UNLIKELY(p->opaque == NULL)) {
+      err = CL_BUILD_PROGRAM_FAILURE;
+      goto error;
+    }
+
+    /* Create all the kernels */
+    TRY (cl_program_load_gen_program, p);
+  }
+  p->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+
+  for (i = 0; i < p->ker_n; i ++) {
+    const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
+    p->bin_sz += interp_kernel_get_code_size(opaque);
+  }
+
+  TRY_ALLOC (p->bin, cl_calloc(p->bin_sz, sizeof(char)));
+  for (i = 0; i < p->ker_n; i ++) {
+    const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
+    size_t sz = interp_kernel_get_code_size(opaque);
+
+    memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
+    copyed += sz;
+  }
+  p->is_built = 1;
+  p->build_status = CL_BUILD_SUCCESS;
+  return CL_SUCCESS;
+
+error:
+  p->build_status = CL_BUILD_ERROR;
+  return err;
+}
+
+cl_program
+cl_program_link(cl_context            context,
+                cl_uint               num_input_programs,
+                const cl_program *    input_programs,
+                const char *          options,
+                cl_int*               errcode_ret)
+{
+  cl_program p = NULL;
+  cl_int err = CL_SUCCESS;
+  cl_int i = 0;
+  int copyed = 0;
+  p = cl_program_new(context);
+
+  if (!check_cl_version_option(p, options)) {
+    err = CL_BUILD_PROGRAM_FAILURE;
+    goto error;
+  }
+
+  p->opaque = compiler_program_new_gen_program(context->device->vendor_id, NULL, NULL);
+
+  for(i = 0; i < num_input_programs; i++) {
+    // If the program was created from an LLVM binary, it must be deserialized first to get the module.
+    if(input_programs[i])
+      compiler_program_link_program(p->opaque, input_programs[i]->opaque,
+        p->build_log_max_sz, p->build_log, &p->build_log_sz);
+    if (UNLIKELY(p->opaque == NULL)) {
+      err = CL_LINK_PROGRAM_FAILURE;
+      goto error;
+    }
+  }
+
+  if(options && strstr(options, "-create-library")){
+    p->binary_type = CL_PROGRAM_BINARY_TYPE_LIBRARY;
+    goto done;
+  }else{
+    p->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+  }
+
+  compiler_program_build_from_llvm(p->opaque, p->build_log_max_sz, p->build_log, &p->build_log_sz, options);
+
+  /* Create all the kernels */
+  TRY (cl_program_load_gen_program, p);
+
+  for (i = 0; i < p->ker_n; i ++) {
+    const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
+    p->bin_sz += interp_kernel_get_code_size(opaque);
+  }
+
+  TRY_ALLOC (p->bin, cl_calloc(p->bin_sz, sizeof(char)));
+  for (i = 0; i < p->ker_n; i ++) {
+    const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
+    size_t sz = interp_kernel_get_code_size(opaque);
+
+    memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
+    copyed += sz;
+  }
+done:
+  p->is_built = 1;
+  p->build_status = CL_BUILD_SUCCESS;
+  if (errcode_ret)
+    *errcode_ret = err;
+  return p;
+
+error:
+  p->build_status = CL_BUILD_ERROR;
+  if (errcode_ret)
+    *errcode_ret = err;
+  return p;
+}
+
+LOCAL cl_int
+cl_program_compile(cl_program            p,
+                   cl_uint               num_input_headers,
+                   const cl_program *    input_headers,
+                   const char **         header_include_names,
+                   const char*           options)
+{
+  cl_int err = CL_SUCCESS;
+  int i = 0;
+
+  if (p->ref_n > 1) {
+    err = CL_INVALID_OPERATION;
+    goto error;
+  }
+
+  if (!check_cl_version_option(p, options)) {
+    err = CL_BUILD_PROGRAM_FAILURE;
+    goto error;
+  }
+
+  if (options) {
+    if(p->build_opts == NULL || strcmp(options, p->build_opts) != 0) {
+      if(p->build_opts) {
+        cl_free(p->build_opts);
+        p->build_opts = NULL;
+      }
+      TRY_ALLOC (p->build_opts, cl_calloc(strlen(options) + 1, sizeof(char)));
+      memcpy(p->build_opts, options, strlen(options));
+
+      p->source_type = p->source ? FROM_SOURCE : p->binary ? FROM_BINARY : FROM_LLVM;
+    }
+  }
+
+  if (options == NULL && p->build_opts) {
+    p->source_type = p->source ? FROM_SOURCE : p->binary ? FROM_BINARY : FROM_LLVM;
+
+    cl_free(p->build_opts);
+    p->build_opts = NULL;
+  }
+
+  char temp_header_template[]= "/tmp/beignet.XXXXXX";
+  char* temp_header_path = mkdtemp(temp_header_template);
+
+  if (p->source_type == FROM_SOURCE) {
+
+    if (!CompilerSupported()) {
+      err = CL_COMPILER_NOT_AVAILABLE;
+      goto error;
+    }
+
+    // Write the headers into the temporary /tmp/beignet.XXXXXX directory so they can be #included.
+    for (i = 0; i < num_input_headers; i++) {
+      if(header_include_names[i] == NULL || input_headers[i] == NULL)
+        continue;
+
+      char temp_path[255]="";
+      strncpy(temp_path, temp_header_path, strlen(temp_header_path));
+      strncat(temp_path, "/", 1);
+      strncat(temp_path, header_include_names[i], strlen(header_include_names[i]));
+      char* dirc = strdup(temp_path);
+      char* dir = dirname(dirc);
+      mkdir(dir, 0755);
+      if(access(dir, R_OK|W_OK) != 0){
+        err = CL_COMPILE_PROGRAM_FAILURE;
+        goto error;
+      }
+      free(dirc);
+
+      FILE* pfile = fopen(temp_path, "wb");
+      if(pfile){
+        fwrite(input_headers[i]->source, strlen(input_headers[i]->source), 1, pfile);
+        fclose(pfile);
+      }else{
+        err = CL_COMPILE_PROGRAM_FAILURE;
+        goto error;
+      }
+    }
+
+    p->opaque = compiler_program_compile_from_source(p->ctx->device->vendor_id, p->source, temp_header_path,
+        p->build_log_max_sz, options, p->build_log, &p->build_log_sz);
+
+    char rm_path[255]="rm ";
+    strncat(rm_path, temp_header_path, strlen(temp_header_path));
+    strncat(rm_path, " -rf", 4);
+    int temp = system(rm_path);
+
+    if(temp){
+      assert(0);
+    }
+
+    if (UNLIKELY(p->opaque == NULL)) {
+      if (p->build_log_sz > 0 && strstr(p->build_log, "error: error reading 'options'"))
+        err = CL_INVALID_BUILD_OPTIONS;
+      else
+        err = CL_BUILD_PROGRAM_FAILURE;
+      goto error;
+    }
+
+    /* Create all the kernels */
+    p->source_type = FROM_LLVM;
+    p->binary_type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
+  }
+  p->is_built = 1;
+  p->build_status = CL_BUILD_SUCCESS;
+  return CL_SUCCESS;
+
+error:
+  p->build_status = CL_BUILD_ERROR;
+  cl_program_delete(p);
+  p = NULL;
+  return err;
+}
+
+LOCAL cl_kernel
+cl_program_create_kernel(cl_program p, const char *name, cl_int *errcode_ret)
+{
+  cl_kernel from = NULL, to = NULL;
+  cl_int err = CL_SUCCESS;
+  uint32_t i = 0;
+
+  /* Find the kernel first */
+  for (i = 0; i < p->ker_n; ++i) {
+    assert(p->ker[i]);
+    const char *ker_name = cl_kernel_get_name(p->ker[i]);
+    if (strcmp(ker_name, name) == 0) {
+      from = p->ker[i];
+      break;
+    }
+  }
+
+  /* We were not able to find this named kernel */
+  if (UNLIKELY(from == NULL)) {
+    err = CL_INVALID_KERNEL_NAME;
+    goto error;
+  }
+
+  TRY_ALLOC(to, cl_kernel_dup(from));
+
+exit:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return to;
+error:
+  cl_kernel_delete(to);
+  to = NULL;
+  goto exit;
+}
+
+LOCAL cl_int
+cl_program_create_kernels_in_program(cl_program p, cl_kernel* ker)
+{
+  int i = 0;
+
+  if(ker == NULL)
+    return CL_SUCCESS;
+
+  for (i = 0; i < p->ker_n; ++i) {
+    TRY_ALLOC_NO_ERR(ker[i], cl_kernel_dup(p->ker[i]));
+  }
+
+  return CL_SUCCESS;
+
+error:
+  do {
+    cl_kernel_delete(ker[i]);
+    ker[i--] = NULL;
+  } while(i > 0);
+
+  return CL_OUT_OF_HOST_MEMORY;
+}
+
+LOCAL void
+cl_program_get_kernel_names(cl_program p, size_t size, char *names, size_t *size_ret)
+{
+  int i = 0;
+  const char *ker_name = NULL;
+  size_t len = 0;
+  if(size_ret) *size_ret = 0;
+
+  if(p->ker == NULL) {
+    return;
+  }
+
+  ker_name = cl_kernel_get_name(p->ker[i]);
+  len = strlen(ker_name);
+  if(names) {
+    strncpy(names, cl_kernel_get_name(p->ker[0]), size - 1);
+    if(size < len - 1) {
+      if(size_ret) *size_ret = size;
+      return;
+    }
+    size = size - len - 1;  //sub \0
+  }
+  if(size_ret) *size_ret = strlen(ker_name) + 1;  //add NULL
+
+  for (i = 1; i < p->ker_n; ++i) {
+    ker_name = cl_kernel_get_name(p->ker[i]);
+    len = strlen(ker_name);
+    if(names) {
+      strncat(names, ";", size);
+      if(size >= 1)
+        strncat(names, ker_name, size - 1);
+      if(size < len + 1) {
+        if(size_ret) *size_ret = size;
+        break;
+      }
+      size = size - len - 1;
+    }
+    if(size_ret) *size_ret += len + 1; //add ';'
+  }
+}
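+
+/* A minimal end-to-end sketch of the source path implemented above, assuming
+ * a valid cl_context `ctx` and a kernel source string `src` with a kernel
+ * named "my_kernel" (illustration only, error handling shortened):
+ *
+ *   cl_int err = CL_SUCCESS;
+ *   const char *strings[] = { src };
+ *   cl_program p = cl_program_create_from_source(ctx, 1, strings, NULL, &err);
+ *   if (p != NULL && cl_program_build(p, "-cl-std=CL1.2") == CL_SUCCESS) {
+ *     size_t sz = 0;                                // two-step name query
+ *     cl_program_get_kernel_names(p, 0, NULL, &sz);
+ *     char *names = malloc(sz);
+ *     if (names != NULL) {
+ *       cl_program_get_kernel_names(p, sz, names, NULL);
+ *       free(names);
+ *     }
+ *     cl_kernel k = cl_program_create_kernel(p, "my_kernel", &err);
+ *     if (k != NULL)
+ *       cl_kernel_delete(k);
+ *   }
+ *   cl_program_delete(p);
+ */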
diff --git a/src/cl_program.h b/src/cl_program.h
new file mode 100644
index 0000000..6dea29a
--- /dev/null
+++ b/src/cl_program.h
@@ -0,0 +1,136 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_PROGRAM_H__
+#define __CL_PROGRAM_H__
+
+#include "cl_internals.h"
+#include "cl_gbe_loader.h"
+#include "CL/cl.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+// This is the structure output by the compiler
+struct _gbe_program;
+
+enum {
+  FROM_SOURCE = 0,
+  FROM_LLVM = 1,
+  FROM_BINARY = 2
+};
+
+/* This maps an OCL file containing some kernels */
+struct _cl_program {
+  DEFINE_ICD(dispatch)
+  uint64_t magic;         /* To identify it as a program */
+  volatile int ref_n;     /* We reference count this object */
+  gbe_program opaque;     /* (Opaque) program as output by the compiler */
+  cl_kernel *ker;         /* All kernels included by the OCL file */
+  cl_program prev, next;  /* We chain the programs together */
+  cl_context ctx;         /* Its parent context */
+  char *bin;              /* The program copied verbatim */
+  size_t bin_sz;          /* Its size in memory */
+  char *source;           /* Program sources */
+  char *binary;           /* Program binary. */
+  size_t binary_sz;       /* The binary size. */
+  uint32_t binary_type;   /* binary type: COMPILED_OBJECT(LLVM IR), LIBRARY(LLVM IR with option "-create-library"), or EXECUTABLE(GEN binary). */
+  uint32_t ker_n;         /* Number of declared kernels */
+  uint32_t source_type:2; /* Built from binary, source or LLVM */
+  uint32_t is_built:1;    /* Did we call clBuildProgram on it? */
+  int32_t build_status;   /* build status. */
+  char *build_opts;       /* The build options for this program */
+  size_t build_log_max_sz; /* Build log maximum size in bytes. */
+  char *build_log;         /* The build log for this program. */
+  size_t build_log_sz;    /* The actual build log size.*/
+};
+
+/* Create an empty program */
+extern cl_program cl_program_new(cl_context);
+
+/* Destroy and deallocate the program */
+extern void cl_program_delete(cl_program);
+
+/* Add one more reference to the object (to defer its deletion) */
+extern void cl_program_add_ref(cl_program);
+
+/* Create a kernel for the OCL user */
+extern cl_kernel cl_program_create_kernel(cl_program, const char*, cl_int*);
+
+/* Create kernel objects for all kernel functions in the program. */
+extern cl_int cl_program_create_kernels_in_program(cl_program, cl_kernel*);
+
+/* Create a program from OCL source */
+extern cl_program
+cl_program_create_from_source(cl_context ctx,
+                              cl_uint count,
+                              const char **strings,
+                              const size_t *lengths,
+                              cl_int *errcode_ret);
+
+/* Directly create a program from a blob */
+extern cl_program
+cl_program_create_from_binary(cl_context             context,
+                              cl_uint                num_devices,
+                              const cl_device_id *   devices,
+                              const size_t *         lengths,
+                              const unsigned char ** binaries,
+                              cl_int *               binary_status,
+                              cl_int *               errcode_ret);
+
+/* Create a program with built-in kernels */
+extern cl_program
+cl_program_create_with_built_in_kernles(cl_context     context,
+                                  cl_uint              num_devices,
+                                  const cl_device_id * device_list,
+                                  const char *         kernel_names,
+                                  cl_int *             errcode_ret);
+/* Directly create a program from a LLVM source file */
+extern cl_program
+cl_program_create_from_llvm(cl_context             context,
+                            cl_uint                num_devices,
+                            const cl_device_id *   devices,
+                            const char *           fileName,
+                            cl_int *               errcode_ret);
+
+/* Build the program as specified by OCL */
+extern cl_int
+cl_program_build(cl_program p, const char* options);
+/* Compile the program as specified by OCL */
+extern cl_int
+cl_program_compile(cl_program            p,
+                   cl_uint               num_input_headers,
+                   const cl_program *    input_headers,
+                   const char **         header_include_names,
+                   const char*           options);
+/* Link the program as specified by OCL */
+extern cl_program
+cl_program_link(cl_context            context,
+                cl_uint               num_input_programs,
+                const cl_program *    input_programs,
+                const char *          options,
+                cl_int*               errcode_ret);
+/* Get the kernel names in program */
+extern void
+cl_program_get_kernel_names(cl_program p,
+                            size_t size,
+                            char *names,
+                            size_t *size_ret);
+#endif /* __CL_PROGRAM_H__ */
+
diff --git a/src/cl_sampler.c b/src/cl_sampler.c
new file mode 100644
index 0000000..d718256
--- /dev/null
+++ b/src/cl_sampler.c
@@ -0,0 +1,142 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "cl_context.h"
+#include "cl_sampler.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_khr_icd.h"
+#include "cl_kernel.h"
+
+#include <assert.h>
+
+uint32_t cl_to_clk(cl_bool normalized_coords,
+                   cl_addressing_mode address,
+                   cl_filter_mode filter)
+{
+  int clk_address = CLK_ADDRESS_NONE;
+  int clk_filter = CLK_FILTER_NEAREST;
+  switch (address) {
+  case CL_ADDRESS_NONE: clk_address = CLK_ADDRESS_NONE; break;
+  case CL_ADDRESS_CLAMP: clk_address = CLK_ADDRESS_CLAMP; break;
+  case CL_ADDRESS_CLAMP_TO_EDGE: clk_address = CLK_ADDRESS_CLAMP_TO_EDGE; break;
+  case CL_ADDRESS_REPEAT: clk_address = CLK_ADDRESS_REPEAT; break;
+  case CL_ADDRESS_MIRRORED_REPEAT: clk_address = CLK_ADDRESS_MIRRORED_REPEAT; break;
+  default:
+    assert(0);
+  }
+  switch(filter) {
+  case CL_FILTER_NEAREST: clk_filter = CLK_FILTER_NEAREST; break;
+  case CL_FILTER_LINEAR: clk_filter = CLK_FILTER_LINEAR; break;
+  default:
+    assert(0);
+  }
+  return (clk_address << __CLK_ADDRESS_BASE)
+         | (normalized_coords << __CLK_NORMALIZED_BASE)
+         | (clk_filter);
+}
+
+#define IS_SAMPLER_ARG(v) (v & __CLK_SAMPLER_ARG_KEY_BIT)
+#define SAMPLER_ARG_ID(v) ((v & __CLK_SAMPLER_ARG_MASK) >> __CLK_SAMPLER_ARG_BASE)
+int cl_set_sampler_arg_slot(cl_kernel k, int index, cl_sampler sampler)
+{
+  int slot_id;
+  for(slot_id = 0; slot_id < k->sampler_sz; slot_id++)
+  {
+    if (IS_SAMPLER_ARG(k->samplers[slot_id])) {
+     if (SAMPLER_ARG_ID(k->samplers[slot_id]) == index) {
+       k->samplers[slot_id] = (k->samplers[slot_id] & (~__CLK_SAMPLER_MASK))
+                              | sampler->clkSamplerValue;
+       return slot_id;
+     }
+    }
+  }
+  return -1;
+}
+
+LOCAL cl_sampler
+cl_sampler_new(cl_context ctx,
+               cl_bool normalized_coords,
+               cl_addressing_mode address,
+               cl_filter_mode filter,
+               cl_int *errcode_ret)
+{
+  cl_sampler sampler = NULL;
+  cl_int err = CL_SUCCESS;
+
+  /* Allocate and initialize the structure itself */
+  TRY_ALLOC (sampler, CALLOC(struct _cl_sampler));
+  SET_ICD(sampler->dispatch)
+  sampler->ref_n = 1;
+  sampler->magic = CL_MAGIC_SAMPLER_HEADER;
+  sampler->normalized_coords = normalized_coords;
+  sampler->address = address;
+  sampler->filter = filter;
+
+  /* Append the sampler in the context sampler list */
+  pthread_mutex_lock(&ctx->sampler_lock);
+    sampler->next = ctx->samplers;
+    if (ctx->samplers != NULL)
+      ctx->samplers->prev = sampler;
+    ctx->samplers = sampler;
+  pthread_mutex_unlock(&ctx->sampler_lock);
+  sampler->ctx = ctx;
+  cl_context_add_ref(ctx);
+
+  sampler->clkSamplerValue = cl_to_clk(normalized_coords, address, filter);
+
+exit:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return sampler;
+error:
+  cl_sampler_delete(sampler);
+  sampler = NULL;
+  goto exit;
+}
+
+LOCAL void
+cl_sampler_delete(cl_sampler sampler)
+{
+  if (UNLIKELY(sampler == NULL))
+    return;
+  if (atomic_dec(&sampler->ref_n) > 1)
+    return;
+
+  assert(sampler->ctx);
+  pthread_mutex_lock(&sampler->ctx->sampler_lock);
+    if (sampler->prev)
+      sampler->prev->next = sampler->next;
+    if (sampler->next)
+      sampler->next->prev = sampler->prev;
+    if (sampler->ctx->samplers == sampler)
+      sampler->ctx->samplers = sampler->next;
+  pthread_mutex_unlock(&sampler->ctx->sampler_lock);
+  cl_context_delete(sampler->ctx);
+
+  cl_free(sampler);
+}
+
+LOCAL void
+cl_sampler_add_ref(cl_sampler sampler)
+{
+  assert(sampler);
+  atomic_inc(&sampler->ref_n);
+}
+
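+/* A minimal usage sketch for the sampler API above, assuming a valid
+ * cl_context `ctx` (illustration only):
+ *
+ *   cl_int err = CL_SUCCESS;
+ *   cl_sampler s = cl_sampler_new(ctx, CL_TRUE, CL_ADDRESS_CLAMP_TO_EDGE,
+ *                                 CL_FILTER_LINEAR, &err);
+ *   if (s != NULL && err == CL_SUCCESS) {
+ *     // s->clkSamplerValue now holds the packed CLK_* encoding produced
+ *     // by cl_to_clk() above.
+ *     cl_sampler_delete(s);
+ *   }
+ */
+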
diff --git a/src/cl_sampler.h b/src/cl_sampler.h
new file mode 100644
index 0000000..4785928
--- /dev/null
+++ b/src/cl_sampler.h
@@ -0,0 +1,57 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_SAMPLER_H__
+#define __CL_SAMPLER_H__
+
+#include "CL/cl.h"
+#include "../backend/src/ocl_common_defines.h"
+#include <stdint.h>
+
+/* How to access images */
+struct _cl_sampler {
+  DEFINE_ICD(dispatch)
+  uint64_t magic;            /* To identify it as a sampler object */
+  volatile int ref_n;        /* This object is reference counted */
+  cl_sampler prev, next;     /* We chain the samplers in the allocator */
+  cl_context ctx;            /* Context it belongs to */
+  cl_bool normalized_coords; /* Are coordinates normalized? */
+  cl_addressing_mode address;/* CLAMP / REPEAT and so on... */
+  cl_filter_mode filter;     /* LINEAR / NEAREST mostly */
+  uint32_t clkSamplerValue;
+};
+
+/* Create a new sampler object */
+extern cl_sampler cl_sampler_new(cl_context,
+                                 cl_bool,
+                                 cl_addressing_mode,
+                                 cl_filter_mode,
+                                 cl_int *err);
+
+/* Unref the object and delete it if no more reference on it */
+extern void cl_sampler_delete(cl_sampler);
+
+/* Add one more reference to this object */
+extern void cl_sampler_add_ref(cl_sampler);
+
+/* set a sampler kernel argument */
+int cl_set_sampler_arg_slot(cl_kernel k, int index, cl_sampler sampler);
+
+#endif /* __CL_SAMPLER_H__ */
+
diff --git a/src/cl_thread.c b/src/cl_thread.c
new file mode 100644
index 0000000..5713d70
--- /dev/null
+++ b/src/cl_thread.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include <string.h>
+#include <stdio.h>
+
+#include "cl_thread.h"
+#include "cl_alloc.h"
+#include "cl_utils.h"
+
+/* Because a cl_command_queue can be used by several threads simultaneously
+   without adding a reference to it, we handle it like this:
+   Keep one thread slot array; every time a thread requests a gpgpu or batch
+   buffer and does not yet have a slot, assign one.
+   The resources are kept in the queue's private data, which is resized when
+   needed.
+   When a thread exits, its slot is marked invalid.
+   When the queue is released, all of its resources are released. If the user
+   still enqueues to, flushes or finishes the queue after it has been released,
+   the behavior is undefined.
+   TODO: Need to shrink the slot map.
+   */
+
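+/* A minimal lifecycle sketch of the per-queue, per-thread data managed below,
+   assuming `queue` is a valid cl_command_queue whose thread_data field is set
+   up at queue-creation time (illustration only; batch_buf is a hypothetical
+   batch buffer pointer):
+
+     queue->thread_data = cl_thread_data_create();  // at queue creation
+
+     cl_gpgpu gpgpu = cl_get_thread_gpgpu(queue);   // per thread, on demand
+     cl_set_thread_batch_buf(queue, batch_buf);
+
+     cl_thread_data_destroy(queue);                 // at queue release
+   */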
+static int thread_array_num = 1;
+static int *thread_slot_map = NULL;
+static int thread_magic_num = 1;
+static pthread_mutex_t thread_queue_map_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_key_t destroy_key;
+
+static __thread int thread_id = -1;
+static __thread int thread_magic = -1;
+
+typedef struct _thread_spec_data {
+  cl_gpgpu gpgpu ;
+  int valid;
+  void* thread_batch_buf;
+  int thread_magic;
+} thread_spec_data;
+
+typedef struct _queue_thread_private {
+  thread_spec_data**  threads_data;
+  int threads_data_num;
+  pthread_mutex_t thread_data_lock;
+} queue_thread_private;
+
+static void thread_data_destructor(void *dummy) {
+  pthread_mutex_lock(&thread_queue_map_lock);
+  thread_slot_map[thread_id] = 0;
+  pthread_mutex_unlock(&thread_queue_map_lock);
+  free(dummy);
+}
+
+static thread_spec_data * __create_thread_spec_data(cl_command_queue queue, int create)
+{
+  queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
+  thread_spec_data* spec = NULL;
+  int i = 0;
+
+  if (thread_id == -1) {
+    void * dummy = malloc(sizeof(int));
+
+    pthread_mutex_lock(&thread_queue_map_lock);
+    for (i = 0; i < thread_array_num; i++) {
+      if (thread_slot_map[i] == 0) {
+        thread_id = i;
+        break;
+      }
+    }
+
+    if (i == thread_array_num) {
+      thread_array_num *= 2;
+      thread_slot_map = realloc(thread_slot_map, sizeof(int) * thread_array_num);
+      memset(thread_slot_map + thread_array_num/2, 0, sizeof(int) * (thread_array_num/2));
+      thread_id = thread_array_num/2;
+    }
+
+    thread_slot_map[thread_id] = 1;
+
+    thread_magic = thread_magic_num++;
+    pthread_mutex_unlock(&thread_queue_map_lock);
+
+    pthread_setspecific(destroy_key, dummy);
+  }
+
+  pthread_mutex_lock(&thread_private->thread_data_lock);
+  if (thread_array_num > thread_private->threads_data_num) {// just enlarge
+    int old_num = thread_private->threads_data_num;
+    thread_private->threads_data_num = thread_array_num;
+    thread_private->threads_data = realloc(thread_private->threads_data,
+                thread_private->threads_data_num * sizeof(void *));
+    memset(thread_private->threads_data + old_num, 0,
+           sizeof(void*) * (thread_private->threads_data_num - old_num));
+  }
+
+  assert(thread_id != -1 && thread_id < thread_array_num);
+  spec = thread_private->threads_data[thread_id];
+  if (!spec && create) {
+       spec = CALLOC(thread_spec_data);
+       spec->thread_magic = thread_magic;
+       thread_private->threads_data[thread_id] = spec;
+  }
+
+  pthread_mutex_unlock(&thread_private->thread_data_lock);
+
+  return spec;
+}
+
+void* cl_thread_data_create(void)
+{
+  queue_thread_private* thread_private = CALLOC(queue_thread_private);
+
+  if (thread_private == NULL)
+    return NULL;
+
+  if (thread_slot_map == NULL) {
+    pthread_mutex_lock(&thread_queue_map_lock);
+    thread_slot_map = calloc(thread_array_num, sizeof(int));
+    pthread_mutex_unlock(&thread_queue_map_lock);
+
+    pthread_key_create(&destroy_key, thread_data_destructor);
+  }
+
+  pthread_mutex_init(&thread_private->thread_data_lock, NULL);
+
+  pthread_mutex_lock(&thread_private->thread_data_lock);
+  thread_private->threads_data = malloc(thread_array_num * sizeof(void *));
+  memset(thread_private->threads_data, 0, sizeof(void*) * thread_array_num);
+  thread_private->threads_data_num = thread_array_num;
+  pthread_mutex_unlock(&thread_private->thread_data_lock);
+
+  return thread_private;
+}
+
+cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue)
+{
+  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
+
+  if (spec->thread_magic != thread_magic) {
+    // We may have inherited this slot from a thread that exited, so drop its stale state.
+    spec->thread_magic = thread_magic;
+    spec->valid = 0;
+  }
+
+  if (!spec->valid) {
+    if (spec->thread_batch_buf) {
+      cl_gpgpu_unref_batch_buf(spec->thread_batch_buf);
+      spec->thread_batch_buf = NULL;
+    }
+    if (spec->gpgpu) {
+      cl_gpgpu_delete(spec->gpgpu);
+      spec->gpgpu = NULL;
+    }
+    TRY_ALLOC_NO_ERR(spec->gpgpu, cl_gpgpu_new(queue->ctx->drv));
+    spec->valid = 1;
+  }
+
+ error:
+  return spec->gpgpu;
+}
+
+void cl_set_thread_batch_buf(cl_command_queue queue, void* buf)
+{
+  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
+
+  assert(spec && spec->thread_magic == thread_magic);
+
+  if (spec->thread_batch_buf) {
+    cl_gpgpu_unref_batch_buf(spec->thread_batch_buf);
+  }
+  spec->thread_batch_buf = buf;
+}
+
+void* cl_get_thread_batch_buf(cl_command_queue queue) {
+  thread_spec_data* spec = __create_thread_spec_data(queue, 1);
+
+  assert(spec && spec->thread_magic == thread_magic);
+
+  return spec->thread_batch_buf;
+}
+
+void cl_invalid_thread_gpgpu(cl_command_queue queue)
+{
+  queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
+  thread_spec_data* spec = NULL;
+
+  pthread_mutex_lock(&thread_private->thread_data_lock);
+  spec = thread_private->threads_data[thread_id];
+  assert(spec);
+  pthread_mutex_unlock(&thread_private->thread_data_lock);
+
+  if (!spec->valid) {
+    return;
+  }
+
+  assert(spec->gpgpu);
+  cl_gpgpu_delete(spec->gpgpu);
+  spec->gpgpu = NULL;
+  spec->valid = 0;
+}
+
+cl_gpgpu cl_thread_gpgpu_take(cl_command_queue queue)
+{
+  queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
+  thread_spec_data* spec = NULL;
+
+  pthread_mutex_lock(&thread_private->thread_data_lock);
+  spec = thread_private->threads_data[thread_id];
+  assert(spec);
+  pthread_mutex_unlock(&thread_private->thread_data_lock);
+
+  if (!spec->valid)
+    return NULL;
+
+  assert(spec->gpgpu);
+  cl_gpgpu gpgpu = spec->gpgpu;
+  spec->gpgpu = NULL;
+  spec->valid = 0;
+  return gpgpu;
+}
+
+/* The destructor that cleans up the thread-specific data. */
+void cl_thread_data_destroy(cl_command_queue queue)
+{
+  int i = 0;
+  queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
+  int threads_data_num;
+  thread_spec_data** threads_data;
+
+  pthread_mutex_lock(&thread_private->thread_data_lock);
+  assert(thread_private->threads_data_num == thread_array_num);
+  threads_data_num = thread_private->threads_data_num;
+  threads_data = thread_private->threads_data;
+  thread_private->threads_data_num = 0;
+  thread_private->threads_data = NULL;
+  pthread_mutex_unlock(&thread_private->thread_data_lock);
+  cl_free(thread_private);
+  queue->thread_data = NULL;
+
+  for (i = 0; i < threads_data_num; i++) {
+    if (threads_data[i] != NULL && threads_data[i]->thread_batch_buf) {
+      cl_gpgpu_unref_batch_buf(threads_data[i]->thread_batch_buf);
+      threads_data[i]->thread_batch_buf = NULL;
+    }
+
+    if (threads_data[i] != NULL && threads_data[i]->valid) {
+      cl_gpgpu_delete(threads_data[i]->gpgpu);
+      threads_data[i]->gpgpu = NULL;
+      threads_data[i]->valid = 0;
+    }
+    cl_free(threads_data[i]);
+  }
+
+  cl_free(threads_data);
+}
diff --git a/src/cl_thread.h b/src/cl_thread.h
new file mode 100644
index 0000000..ecc99ad
--- /dev/null
+++ b/src/cl_thread.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef __CL_THREAD_H__
+#define __CL_THREAD_H__
+
+#include <pthread.h>
+#include "cl_internals.h"
+#include "cl_command_queue.h"
+
+/* Create the thread specific data. */
+void* cl_thread_data_create(void);
+
+/* The destructor that cleans up the thread-specific data. */
+void cl_thread_data_destroy(cl_command_queue queue);
+
+/* Used to get the gpgpu struct of each thread. */
+cl_gpgpu cl_get_thread_gpgpu(cl_command_queue queue);
+
+/* Used to release the gpgpu struct of each thread. */
+void cl_invalid_thread_gpgpu(cl_command_queue queue);
+
+/* Used to set the batch buffer of each thread. */
+void cl_set_thread_batch_buf(cl_command_queue queue, void* buf);
+
+/* Used to get the batch buffer of each thread. */
+void* cl_get_thread_batch_buf(cl_command_queue queue);
+
+/* take current gpgpu from the thread gpgpu pool. */
+cl_gpgpu cl_thread_gpgpu_take(cl_command_queue queue);
+
+#endif /* __CL_THREAD_H__ */
diff --git a/src/cl_utils.h b/src/cl_utils.h
new file mode 100644
index 0000000..26cf329
--- /dev/null
+++ b/src/cl_utils.h
@@ -0,0 +1,316 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __CL_UTILS_H__
+#define __CL_UTILS_H__
+
+/* INLINE is forceinline */
+#define INLINE __attribute__((always_inline)) inline
+
+/* Branch hint */
+#define LIKELY(x)       __builtin_expect((x),1)
+#define UNLIKELY(x)     __builtin_expect((x),0)
+
+/* Stringify macros */
+#define JOIN(X, Y) _DO_JOIN(X, Y)
+#define _DO_JOIN(X, Y) _DO_JOIN2(X, Y)
+#define _DO_JOIN2(X, Y) X##Y
+
+/* Check compile time errors */
+#define STATIC_ASSERT(value)                                        \
+struct JOIN(__,JOIN(__,__LINE__)) {                                 \
+  int x[(value) ? 1 : -1];                                          \
+}
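+
+/* The struct member is an array of size -1 whenever `value` is false,
+ * which no compiler accepts, so the check fires at compile time.
+ * Illustrative use: STATIC_ASSERT(sizeof(uint32_t) == 4); compiles,
+ * while STATIC_ASSERT(sizeof(uint32_t) == 8); does not. */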
+
+/* Throw errors */
+#ifdef NDEBUG
+  #define ERR(ERROR, ...)                                             \
+  do {                                                                \
+    err = ERROR;                                                      \
+    goto error;                                                       \
+  } while (0)
+#else
+  #define ERR(ERROR, ...)                                             \
+  do {                                                                \
+    fprintf(stderr, "error in %s line %i\n", __FILE__, __LINE__);     \
+    fprintf(stderr, __VA_ARGS__);                                     \
+    fprintf(stderr, "\n");                                            \
+    err = ERROR;                                                      \
+    goto error;                                                       \
+  } while (0)
+#endif
+
+#define DO_ALLOC_ERR                                                \
+do {                                                                \
+  ERR(CL_OUT_OF_HOST_MEMORY, "Out of memory");                      \
+} while (0)
+
+#define ERR_IF(COND, ERROR, ...)                                    \
+do {                                                                \
+  if (UNLIKELY(COND)) ERR (ERROR, __VA_ARGS__);                     \
+} while (0)
+
+#define INVALID_VALUE_IF(COND)                                      \
+do {                                                                \
+  ERR_IF(COND, CL_INVALID_VALUE, "Invalid value");                  \
+} while (0)
+
+#define INVALID_DEVICE_IF(COND)                                     \
+do {                                                                \
+  ERR_IF(COND, CL_INVALID_DEVICE, "Invalid device");                \
+} while (0)
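+
+/* The ERR* macros above assume the enclosing function declares a
+ * `cl_int err` variable and provides an `error:` label to jump to.
+ * An illustrative sketch (the argument names are made up):
+ *
+ *   cl_int err = CL_SUCCESS;
+ *   INVALID_VALUE_IF(count == 0);
+ *   ERR_IF(ptr == NULL, CL_INVALID_VALUE, "NULL pointer");
+ * error:
+ *   return err;
+ */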
+
+#define MAX(x0, x1) ((x0) > (x1) ? (x0) : (x1))
+#define MIN(x0, x1) ((x0) < (x1) ? (x0) : (x1))
+#define ALIGN(A, B) (((A) % (B)) ? (A) + (B) - ((A) % (B)) : (A))
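+/* For example, ALIGN(13, 8) == 16 while ALIGN(16, 8) == 16: A is rounded
+ * up to the next multiple of B only when it is not already aligned. */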
+
+#define DO_ALLOC_ERROR                                      \
+do {                                                        \
+  err = CL_OUT_OF_HOST_MEMORY;                              \
+  goto error;                                               \
+} while (0)
+
+#define FATAL(...)                                          \
+do {                                                        \
+  fprintf(stderr, "error: ");                               \
+  fprintf(stderr, __VA_ARGS__);                             \
+  fprintf(stderr, "\n");                                    \
+  assert(0);                                                \
+  exit(-1);                                                 \
+} while (0)
+
+#define FATAL_IF(COND, ...)                                 \
+do {                                                        \
+  if (UNLIKELY(COND)) FATAL(__VA_ARGS__);                   \
+} while (0)
+
+#define NOT_IMPLEMENTED FATAL ("Not implemented")
+
+#define CHECK_CONTEXT(CTX)                                  \
+do {                                                        \
+  if (UNLIKELY(CTX == NULL)) {                              \
+    err = CL_INVALID_CONTEXT;                               \
+    goto error;                                             \
+  }                                                         \
+  if (UNLIKELY(CTX->magic != CL_MAGIC_CONTEXT_HEADER)) {    \
+    err = CL_INVALID_CONTEXT;                               \
+    goto error;                                             \
+  }                                                         \
+} while (0)
+
+#define CHECK_QUEUE(QUEUE)                                  \
+do {                                                        \
+  if (UNLIKELY(QUEUE == NULL)) {                            \
+    err = CL_INVALID_COMMAND_QUEUE;                         \
+    goto error;                                             \
+  }                                                         \
+  if (UNLIKELY(QUEUE->magic != CL_MAGIC_QUEUE_HEADER)) {    \
+    err = CL_INVALID_COMMAND_QUEUE;                         \
+    goto error;                                             \
+  }                                                         \
+} while (0)
+
+#define CHECK_MEM(MEM)                                      \
+do {                                                        \
+  if (UNLIKELY(MEM == NULL)) {                              \
+    err = CL_INVALID_MEM_OBJECT;                            \
+    goto error;                                             \
+  }                                                         \
+  if (UNLIKELY(MEM->magic != CL_MAGIC_MEM_HEADER)) {        \
+    err = CL_INVALID_MEM_OBJECT;                            \
+    goto error;                                             \
+  }                                                         \
+} while (0)
+
+#define CHECK_IMAGE(MEM, IMAGE)                             \
+CHECK_MEM(MEM);                                             \
+do {                                                        \
+  if (UNLIKELY(!IS_IMAGE(MEM))) {                           \
+    err = CL_INVALID_MEM_OBJECT;                            \
+    goto error;                                             \
+  }                                                         \
+} while (0);                                                \
+struct _cl_mem_image *IMAGE;                                \
+IMAGE = cl_mem_image(MEM);                                  \
+
+#define FIXUP_IMAGE_REGION(IMAGE, PREGION, REGION)          \
+const size_t *REGION;                                       \
+size_t REGION ##_REC[3];                                    \
+do {                                                        \
+  if (IMAGE->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {   \
+    REGION ##_REC[0] = PREGION[0];                          \
+    REGION ##_REC[1] = 1;                                   \
+    REGION ##_REC[2] = PREGION[1];                          \
+    REGION = REGION ##_REC;                                 \
+  } else {                                                  \
+    REGION = PREGION;                                       \
+  }                                                         \
+} while(0)
+
+#define FIXUP_IMAGE_ORIGIN(IMAGE, PREGION, REGION)          \
+const size_t *REGION;                                       \
+size_t REGION ##_REC[3];                                    \
+do {                                                        \
+  if (IMAGE->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {   \
+    REGION ##_REC[0] = PREGION[0];                          \
+    REGION ##_REC[1] = 0;                                   \
+    REGION ##_REC[2] = PREGION[1];                          \
+    REGION = REGION ##_REC;                                 \
+  } else {                                                  \
+    REGION = PREGION;                                       \
+  }                                                         \
+} while(0)
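+
+/* For a CL_MEM_OBJECT_IMAGE1D_ARRAY the caller supplies (width, array_size)
+ * in the first two elements, and these macros rewrite that into the internal
+ * three-element layout: the array index moves to element 2, while element 1
+ * becomes 1 for a region and 0 for an origin.  Every other image type keeps
+ * the caller's pointer unchanged. */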
+
+
+#define CHECK_EVENT(EVENT)                                    \
+  do {                                                        \
+    if (UNLIKELY(EVENT == NULL)) {                            \
+      err = CL_INVALID_EVENT;                                 \
+      goto error;                                             \
+    }                                                         \
+    if (UNLIKELY(EVENT->magic != CL_MAGIC_EVENT_HEADER)) {    \
+      err = CL_INVALID_EVENT;                                 \
+      goto error;                                             \
+    }                                                         \
+  } while (0)
+
+#define CHECK_SAMPLER(SAMPLER)                              \
+do {                                                        \
+  if (UNLIKELY(SAMPLER == NULL)) {                          \
+    err = CL_INVALID_SAMPLER;                               \
+    goto error;                                             \
+  }                                                         \
+  if (UNLIKELY(SAMPLER->magic != CL_MAGIC_SAMPLER_HEADER)) {\
+    err = CL_INVALID_SAMPLER;                               \
+    goto error;                                             \
+  }                                                         \
+} while (0)
+
+#define CHECK_KERNEL(KERNEL)                                \
+do {                                                        \
+  if (UNLIKELY(KERNEL == NULL)) {                           \
+    err = CL_INVALID_KERNEL;                                \
+    goto error;                                             \
+  }                                                         \
+  if (UNLIKELY(KERNEL->magic != CL_MAGIC_KERNEL_HEADER)) {  \
+    err = CL_INVALID_KERNEL;                                \
+    goto error;                                             \
+  }                                                         \
+} while (0)
+
+#define CHECK_PROGRAM(PROGRAM)                              \
+do {                                                        \
+  if (UNLIKELY(PROGRAM == NULL)) {                          \
+    err = CL_INVALID_PROGRAM;                               \
+    goto error;                                             \
+  }                                                         \
+  if (UNLIKELY(PROGRAM->magic != CL_MAGIC_PROGRAM_HEADER)) {\
+    err = CL_INVALID_PROGRAM;                               \
+    goto error;                                             \
+  }                                                         \
+} while (0)
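+
+/* Each CHECK_* macro above validates a handle by testing its `magic` field
+ * against the matching CL_MAGIC_*_HEADER constant and reuses the `err` /
+ * `error:` convention of ERR_IF.  A typical entry point simply stacks them,
+ * e.g. (illustrative):
+ *
+ *   CHECK_QUEUE(queue);
+ *   CHECK_KERNEL(kernel);
+ */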
+
+#define ELEMENTS(x) (sizeof(x)/sizeof(*(x)))
+#define CALLOC_STRUCT(T) (struct T*) cl_calloc(1, sizeof(struct T))
+#define CALLOC(T) (T*) cl_calloc(1, sizeof(T))
+#define CALLOC_ARRAY(T, N) (T*) cl_calloc(N, sizeof(T))
+#define MEMZERO(x) do { memset((x),0,sizeof(*(x))); } while (0)
+
+/* Run some code and catch errors */
+#define TRY(fn,...)                                     \
+do {                                                    \
+  if (UNLIKELY((err = fn(__VA_ARGS__)) != CL_SUCCESS))  \
+    goto error;                                         \
+} while (0)
+
+#define TRY_NO_ERR(fn,...)                              \
+do {                                                    \
+  if (UNLIKELY(fn(__VA_ARGS__) != CL_SUCCESS))          \
+    goto error;                                         \
+} while (0)
+
+#define TRY_ALLOC(dst, EXPR)                            \
+do {                                                    \
+  if (UNLIKELY((dst = EXPR) == NULL))                   \
+    DO_ALLOC_ERROR;                                     \
+} while (0)
+
+#define TRY_ALLOC_NO_ERR(dst, EXPR)                     \
+do {                                                    \
+  if (UNLIKELY((dst = EXPR) == NULL))                   \
+    goto error;                                         \
+} while (0)
+
+#define TRY_ALLOC_NO_RET(EXPR)                          \
+do {                                                    \
+  if (UNLIKELY((EXPR) == NULL))                         \
+    DO_ALLOC_ERROR;                                     \
+} while (0)
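+
+/* The TRY* helpers also jump to the enclosing `error:` label on failure;
+ * TRY, TRY_ALLOC and TRY_ALLOC_NO_RET additionally set `err`.  An
+ * illustrative sketch (the callee name is made up):
+ *
+ *   TRY(cl_do_step, ctx, arg);                    // cl_int-returning call
+ *   TRY_ALLOC(buf, cl_calloc(n, sizeof(*buf)));   // NULL means out of memory
+ */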
+
+/* Break Point Definitions */
+#if !defined(NDEBUG)
+
+#define BREAK                                           \
+do {                                                    \
+  __asm__("int3");                                      \
+} while(0)
+
+#define BREAK_IF(value)                                 \
+do {                                                    \
+  if (UNLIKELY(!(value))) BREAKPOINT();                 \
+} while(0)
+
+#else
+#define BREAKPOINT() do { } while(0)
+#define ASSERT(value) do { } while(0)
+#endif
+
+/* For all internal functions */
+#define LOCAL __attribute__ ((visibility ("internal")))
+
+/* Align a structure or a variable */
+#define ALIGNED(X) __attribute__ ((aligned (X)))
+
+/* Number of DWORDS */
+#define SIZEOF32(X) (sizeof(X) / sizeof(uint32_t))
+
+/* Memory quantity */
+#define KB 1024
+#define MB (KB*KB)
+
+/* To help bitfield definitions */
+#define BITFIELD_BIT(X) 1
+#define BITFIELD_RANGE(X,Y) ((Y) - (X) + 1)
+
+/* 32 bits atomic variable */
+typedef volatile int atomic_t;
+
+static INLINE int atomic_add(atomic_t *v, const int c) {
+  register int i = c;
+  __asm__ __volatile__("lock ; xaddl %0, %1;"
+      : "+r"(i), "+m"(*v)
+      : "m"(*v), "r"(i));
+  return i;
+}
+
+static INLINE int atomic_inc(atomic_t *v) { return atomic_add(v, 1); }
+static INLINE int atomic_dec(atomic_t *v) { return atomic_add(v, -1); }
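+
+/* atomic_add is built on the x86 `lock xadd` instruction, so it returns the
+ * value *v held before the addition: with atomic_t n = 0, atomic_inc(&n)
+ * returns 0 and leaves n at 1. */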
+
+#endif /* __CL_UTILS_H__ */
+
diff --git a/src/intel/intel_batchbuffer.c b/src/intel/intel_batchbuffer.c
new file mode 100644
index 0000000..d3da3cc
--- /dev/null
+++ b/src/intel/intel_batchbuffer.c
@@ -0,0 +1,191 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "intel/intel_batchbuffer.h"
+#include "intel/intel_driver.h"
+#include "cl_alloc.h"
+#include "cl_utils.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+LOCAL int
+intel_batchbuffer_reset(intel_batchbuffer_t *batch, size_t sz)
+{
+  if (batch->buffer != NULL) {
+    dri_bo_unreference(batch->buffer);
+    batch->buffer = NULL;
+    batch->last_bo = NULL;
+  }
+
+  batch->buffer = dri_bo_alloc(batch->intel->bufmgr,
+                               "batch buffer",
+                               sz,
+                               64);
+  if (!batch->buffer || (dri_bo_map(batch->buffer, 1) != 0)) {
+    if (batch->buffer)
+      dri_bo_unreference(batch->buffer);
+    batch->buffer = NULL;
+    return -1;
+  }
+  batch->map = (uint8_t*) batch->buffer->virtual;
+  batch->size = sz;
+  batch->ptr = batch->map;
+  batch->atomic = 0;
+  batch->last_bo = batch->buffer;
+  batch->enable_slm = 0;
+  return 0;
+}
+
+LOCAL void
+intel_batchbuffer_init(intel_batchbuffer_t *batch, intel_driver_t *intel)
+{
+  assert(intel);
+  batch->intel = intel;
+}
+
+LOCAL void
+intel_batchbuffer_terminate(intel_batchbuffer_t *batch)
+{
+  assert(batch->buffer);
+
+  if (batch->map) {
+    dri_bo_unmap(batch->buffer);
+    batch->map = NULL;
+  }
+
+  dri_bo_unreference(batch->buffer);
+  batch->buffer = NULL;
+}
+
+LOCAL void
+intel_batchbuffer_flush(intel_batchbuffer_t *batch)
+{
+  uint32_t used = batch->ptr - batch->map;
+  int is_locked = batch->intel->locked;
+
+  if (used == 0)
+    return;
+
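+  /* Pad with a zero dword (a no-op) when needed so that the batch still ends
+   * on a qword boundary once MI_BATCH_BUFFER_END is appended below. */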
+  if ((used & 4) == 0) {
+    *(uint32_t*) batch->ptr = 0;
+    batch->ptr += 4;
+  }
+
+  *(uint32_t*)batch->ptr = MI_BATCH_BUFFER_END;
+  batch->ptr += 4;
+  dri_bo_unmap(batch->buffer);
+  used = batch->ptr - batch->map;
+
+  if (!is_locked)
+    intel_driver_lock_hardware(batch->intel);
+
+  int flag = I915_EXEC_RENDER;
+  if(batch->enable_slm) {
+    /* Use the hard-coded value temporarily; switch to
+     * I915_EXEC_ENABLE_SLM once drm accepts the patch. */
+    flag |= (1<<13);
+  }
+  drm_intel_gem_bo_context_exec(batch->buffer, batch->intel->ctx, used, flag);
+
+  if (!is_locked)
+    intel_driver_unlock_hardware(batch->intel);
+
+  // Release the buffer
+  intel_batchbuffer_terminate(batch);
+}
+
+LOCAL void 
+intel_batchbuffer_emit_reloc(intel_batchbuffer_t *batch,
+                             dri_bo *bo, 
+                             uint32_t read_domains,
+                             uint32_t write_domains, 
+                             uint32_t delta)
+{
+  assert(batch->ptr - batch->map < batch->size);
+  dri_bo_emit_reloc(batch->buffer,
+                    read_domains,
+                    write_domains,
+                    delta,
+                    batch->ptr - batch->map,
+                    bo);
+  intel_batchbuffer_emit_dword(batch, bo->offset + delta);
+}
+
+LOCAL void
+intel_batchbuffer_emit_mi_flush(intel_batchbuffer_t *batch)
+{
+  intel_batchbuffer_require_space(batch, 4);
+  intel_batchbuffer_emit_dword(batch, MI_FLUSH | STATE_INSTRUCTION_CACHE_INVALIDATE);
+}
+
+LOCAL intel_batchbuffer_t*
+intel_batchbuffer_new(intel_driver_t *intel)
+{
+  intel_batchbuffer_t *batch = NULL;
+  assert(intel);
+  TRY_ALLOC_NO_ERR (batch, CALLOC(intel_batchbuffer_t));
+  intel_batchbuffer_init(batch, intel);
+
+exit:
+  return batch;
+error:
+  intel_batchbuffer_delete(batch);
+  batch = NULL;
+  goto exit;
+}
+
+LOCAL void
+intel_batchbuffer_delete(intel_batchbuffer_t *batch)
+{
+  if (batch == NULL)
+    return;
+  if(batch->buffer)
+    intel_batchbuffer_terminate(batch);
+
+  cl_free(batch);
+}
diff --git a/src/intel/intel_batchbuffer.h b/src/intel/intel_batchbuffer.h
new file mode 100644
index 0000000..4c28a7c
--- /dev/null
+++ b/src/intel/intel_batchbuffer.h
@@ -0,0 +1,152 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+#ifndef _INTEL_BATCHBUFFER_H_
+#define _INTEL_BATCHBUFFER_H_
+
+#include "intel_defines.h"
+#include "cl_utils.h"
+
+#include <xf86drm.h>
+#include <drm.h>
+#include <i915_drm.h>
+#include <intel_bufmgr.h>
+#include <stdint.h>
+#include <memory.h>
+#include <assert.h>
+
+#define BEGIN_BATCH(b, n) do {                                            \
+  intel_batchbuffer_require_space(b, (n) * 4);                            \
+} while (0)
+
+#define OUT_BATCH(b, d) do {                                              \
+  intel_batchbuffer_emit_dword(b, d);                                     \
+} while (0)
+
+#define OUT_RELOC(b, bo, read_domains, write_domain, delta) do {          \
+  assert((delta) >= 0);                                                   \
+  intel_batchbuffer_emit_reloc(b, bo, read_domains, write_domain, delta); \
+} while (0)
+
+#define ADVANCE_BATCH(b) do { } while (0)
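+
+/* An illustrative emission sequence built from these macros (the command
+ * value is a placeholder, not a real packet encoding):
+ *
+ *   BEGIN_BATCH(batch, 2);           // make sure 2 dwords fit
+ *   OUT_BATCH(batch, some_command);  // dword 0
+ *   OUT_BATCH(batch, 0);             // dword 1
+ *   ADVANCE_BATCH(batch);
+ */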
+
+struct intel_driver;
+
+typedef struct intel_batchbuffer
+{
+  struct intel_driver *intel;
+  drm_intel_bo *buffer;
+  /** Last bo submitted to the hardware. Used for clFinish. */
+  drm_intel_bo *last_bo;
+  uint32_t size;
+  uint8_t *map;
+  uint8_t *ptr;
+  /** HSW: LRI cannot be set in the batch buffer, so set the
+   *  I915_EXEC_ENABLE_SLM flag when calling exec. */
+  uint8_t enable_slm;
+  int atomic;
+} intel_batchbuffer_t;
+
+extern intel_batchbuffer_t* intel_batchbuffer_new(struct intel_driver*);
+extern void intel_batchbuffer_delete(intel_batchbuffer_t*);
+extern void intel_batchbuffer_emit_reloc(intel_batchbuffer_t*,
+                                         drm_intel_bo*,
+                                         uint32_t read_domains,
+                                         uint32_t write_domains,
+                                         uint32_t delta);
+extern void intel_batchbuffer_emit_mi_flush(intel_batchbuffer_t*);
+extern void intel_batchbuffer_init(intel_batchbuffer_t*, struct intel_driver*);
+extern void intel_batchbuffer_terminate(intel_batchbuffer_t*);
+extern void intel_batchbuffer_flush(intel_batchbuffer_t*);
+extern int intel_batchbuffer_reset(intel_batchbuffer_t*, size_t sz);
+
+static INLINE uint32_t
+intel_batchbuffer_space(const intel_batchbuffer_t *batch)
+{
+  assert(batch->ptr);
+  return batch->size - (batch->ptr - batch->map);
+}
+
+static INLINE void
+intel_batchbuffer_emit_dword(intel_batchbuffer_t *batch, uint32_t x)
+{
+  assert(intel_batchbuffer_space(batch) >= 4);
+  *(uint32_t*)batch->ptr = x;
+  batch->ptr += 4;
+}
+
+static INLINE void
+intel_batchbuffer_require_space(intel_batchbuffer_t *batch, uint32_t size) {
+  assert(size < batch->size - 8);
+  if (intel_batchbuffer_space(batch) < size)
+    intel_batchbuffer_space(batch);
+}
+
+static INLINE uint8_t*
+intel_batchbuffer_alloc_space(intel_batchbuffer_t *batch, uint32_t size)
+{
+  assert(intel_batchbuffer_space(batch) >= size);
+  uint8_t *space_ptr = batch->ptr;
+  batch->ptr += size;
+  return space_ptr;
+}
+
+static INLINE void
+intel_batchbuffer_start_atomic(intel_batchbuffer_t *batch, uint32_t size)
+{
+  assert(!batch->atomic);
+  intel_batchbuffer_require_space(batch, size);
+  batch->atomic = 1;
+}
+
+static INLINE void
+intel_batchbuffer_end_atomic(intel_batchbuffer_t *batch)
+{
+  assert(batch->atomic);
+  batch->atomic = 0;
+}
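+
+/* start_atomic reserves the requested space up front and sets the `atomic`
+ * flag; end_atomic clears it again, and the asserts catch nested or
+ * unbalanced use.  Illustrative bracketing of a fixed-size packet:
+ *
+ *   intel_batchbuffer_start_atomic(batch, 4 * sizeof(uint32_t));
+ *   // ... emit exactly four dwords ...
+ *   intel_batchbuffer_end_atomic(batch);
+ */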
+
+#endif /* _INTEL_BATCHBUFFER_H_ */
+
diff --git a/src/intel/intel_defines.h b/src/intel/intel_defines.h
new file mode 100644
index 0000000..02ffde4
--- /dev/null
+++ b/src/intel/intel_defines.h
@@ -0,0 +1,339 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+ 
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+ 
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+ 
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith at tungstengraphics.com>
+  */
+#ifndef __GENX_DEFINES_H__
+#define __GENX_DEFINES_H__
+
+#define CMD(PIPELINE,OP,SUB_OP) ((3 << 29) |          \
+                                ((PIPELINE) << 27) |  \
+                                ((OP) << 24) |        \
+                                ((SUB_OP) << 16))
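+
+/* CMD packs a command header from the shifts above: a fixed type of 3 in the
+ * top bits (bit 29), the pipeline at bit 27, the opcode at bit 24 and the
+ * sub-opcode at bit 16; e.g. CMD_GPGPU_WALKER below is CMD(2, 1, 5). */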
+
+#define CMD_URB_FENCE                           CMD(0, 0, 0)
+#define CMD_CS_URB_STATE                        CMD(0, 0, 1)
+#define CMD_CONSTANT_BUFFER                     CMD(0, 0, 2)
+#define CMD_STATE_PREFETCH                      CMD(0, 0, 3)
+#define CMD_MEDIA_GATEWAY_STATE                 CMD(2, 0, 3)
+#define CMD_MEDIA_STATE_FLUSH                   CMD(2, 0, 4)
+#define CMD_GPGPU_WALKER                        CMD(2, 1, 5)
+#define CMD_PIPE_CONTROL                        CMD(3, 2, 0)
+
+#define CMD_LOAD_REGISTER_IMM                   (0x22 << 23)
+
+#define CMD_STATE_BASE_ADDRESS                  CMD(0, 1, 1)
+#define CMD_STATE_SIP                           CMD(0, 1, 2)
+#define CMD_PIPELINE_SELECT                     CMD(1, 1, 4)
+#define CMD_SAMPLER_PALETTE_LOAD                CMD(3, 1, 2)
+
+#define CMD_MEDIA_STATE_POINTERS                CMD(2, 0, 0)
+#define CMD_MEDIA                               CMD(2, 1, 0)
+#define CMD_MEDIA_EX                            CMD(2, 1, 1)
+
+#define CMD_PIPELINED_POINTERS                  CMD(3, 0, 0)
+#define CMD_BINDING_TABLE_POINTERS              CMD(3, 0, 1)
+#define CMD_VERTEX_BUFFERS                      CMD(3, 0, 8)
+#define CMD_VERTEX_ELEMENTS                     CMD(3, 0, 9)
+#define CMD_DRAWING_RECTANGLE                   CMD(3, 1, 0)
+#define CMD_CONSTANT_COLOR                      CMD(3, 1, 1)
+#define CMD_3DPRIMITIVE                         CMD(3, 3, 0)
+
+#define BASE_ADDRESS_MODIFY             (1 << 0)
+
+#define PIPELINE_SELECT_3D              0
+#define PIPELINE_SELECT_MEDIA           1
+
+#define UF0_CS_REALLOC                  (1 << 13)
+#define UF0_VFE_REALLOC                 (1 << 12)
+#define UF0_SF_REALLOC                  (1 << 11)
+#define UF0_CLIP_REALLOC                (1 << 10)
+#define UF0_GS_REALLOC                  (1 << 9)
+#define UF0_VS_REALLOC                  (1 << 8)
+#define UF1_CLIP_FENCE_SHIFT            20
+#define UF1_GS_FENCE_SHIFT              10
+#define UF1_VS_FENCE_SHIFT              0
+#define UF2_CS_FENCE_SHIFT              20
+#define UF2_VFE_FENCE_SHIFT             10
+#define UF2_SF_FENCE_SHIFT              0
+
+#define FLOATING_POINT_IEEE_754        0
+#define FLOATING_POINT_NON_IEEE_754    1
+
+#define I965_SURFACE_1D      0
+#define I965_SURFACE_2D      1
+#define I965_SURFACE_3D      2
+#define I965_SURFACE_CUBE    3
+#define I965_SURFACE_BUFFER  4
+#define I965_SURFACE_NULL    7
+
+#define I965_SURFACEFORMAT_R32G32B32A32_FLOAT             0x000 
+#define I965_SURFACEFORMAT_R32G32B32A32_SINT              0x001 
+#define I965_SURFACEFORMAT_R32G32B32A32_UINT              0x002 
+#define I965_SURFACEFORMAT_R32G32B32A32_UNORM             0x003 
+#define I965_SURFACEFORMAT_R32G32B32A32_SNORM             0x004 
+#define I965_SURFACEFORMAT_R64G64_FLOAT                   0x005 
+#define I965_SURFACEFORMAT_R32G32B32X32_FLOAT             0x006 
+#define I965_SURFACEFORMAT_R32G32B32A32_SSCALED           0x007
+#define I965_SURFACEFORMAT_R32G32B32A32_USCALED           0x008
+#define I965_SURFACEFORMAT_R32G32B32_FLOAT                0x040 
+#define I965_SURFACEFORMAT_R32G32B32_SINT                 0x041 
+#define I965_SURFACEFORMAT_R32G32B32_UINT                 0x042 
+#define I965_SURFACEFORMAT_R32G32B32_UNORM                0x043 
+#define I965_SURFACEFORMAT_R32G32B32_SNORM                0x044 
+#define I965_SURFACEFORMAT_R32G32B32_SSCALED              0x045 
+#define I965_SURFACEFORMAT_R32G32B32_USCALED              0x046 
+#define I965_SURFACEFORMAT_R16G16B16A16_UNORM             0x080 
+#define I965_SURFACEFORMAT_R16G16B16A16_SNORM             0x081 
+#define I965_SURFACEFORMAT_R16G16B16A16_SINT              0x082 
+#define I965_SURFACEFORMAT_R16G16B16A16_UINT              0x083 
+#define I965_SURFACEFORMAT_R16G16B16A16_FLOAT             0x084 
+#define I965_SURFACEFORMAT_R32G32_FLOAT                   0x085 
+#define I965_SURFACEFORMAT_R32G32_SINT                    0x086 
+#define I965_SURFACEFORMAT_R32G32_UINT                    0x087 
+#define I965_SURFACEFORMAT_R32_FLOAT_X8X24_TYPELESS       0x088 
+#define I965_SURFACEFORMAT_X32_TYPELESS_G8X24_UINT        0x089 
+#define I965_SURFACEFORMAT_L32A32_FLOAT                   0x08A 
+#define I965_SURFACEFORMAT_R32G32_UNORM                   0x08B 
+#define I965_SURFACEFORMAT_R32G32_SNORM                   0x08C 
+#define I965_SURFACEFORMAT_R64_FLOAT                      0x08D 
+#define I965_SURFACEFORMAT_R16G16B16X16_UNORM             0x08E 
+#define I965_SURFACEFORMAT_R16G16B16X16_FLOAT             0x08F 
+#define I965_SURFACEFORMAT_A32X32_FLOAT                   0x090 
+#define I965_SURFACEFORMAT_L32X32_FLOAT                   0x091 
+#define I965_SURFACEFORMAT_I32X32_FLOAT                   0x092 
+#define I965_SURFACEFORMAT_R16G16B16A16_SSCALED           0x093
+#define I965_SURFACEFORMAT_R16G16B16A16_USCALED           0x094
+#define I965_SURFACEFORMAT_R32G32_SSCALED                 0x095
+#define I965_SURFACEFORMAT_R32G32_USCALED                 0x096
+#define I965_SURFACEFORMAT_B8G8R8A8_UNORM                 0x0C0 
+#define I965_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB            0x0C1 
+#define I965_SURFACEFORMAT_R10G10B10A2_UNORM              0x0C2 
+#define I965_SURFACEFORMAT_R10G10B10A2_UNORM_SRGB         0x0C3 
+#define I965_SURFACEFORMAT_R10G10B10A2_UINT               0x0C4 
+#define I965_SURFACEFORMAT_R10G10B10_SNORM_A2_UNORM       0x0C5 
+#define I965_SURFACEFORMAT_R8G8B8A8_UNORM                 0x0C7 
+#define I965_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB            0x0C8 
+#define I965_SURFACEFORMAT_R8G8B8A8_SNORM                 0x0C9 
+#define I965_SURFACEFORMAT_R8G8B8A8_SINT                  0x0CA 
+#define I965_SURFACEFORMAT_R8G8B8A8_UINT                  0x0CB 
+#define I965_SURFACEFORMAT_R16G16_UNORM                   0x0CC 
+#define I965_SURFACEFORMAT_R16G16_SNORM                   0x0CD 
+#define I965_SURFACEFORMAT_R16G16_SINT                    0x0CE 
+#define I965_SURFACEFORMAT_R16G16_UINT                    0x0CF 
+#define I965_SURFACEFORMAT_R16G16_FLOAT                   0x0D0 
+#define I965_SURFACEFORMAT_B10G10R10A2_UNORM              0x0D1 
+#define I965_SURFACEFORMAT_B10G10R10A2_UNORM_SRGB         0x0D2 
+#define I965_SURFACEFORMAT_R11G11B10_FLOAT                0x0D3 
+#define I965_SURFACEFORMAT_R32_SINT                       0x0D6 
+#define I965_SURFACEFORMAT_R32_UINT                       0x0D7 
+#define I965_SURFACEFORMAT_R32_FLOAT                      0x0D8 
+#define I965_SURFACEFORMAT_R24_UNORM_X8_TYPELESS          0x0D9 
+#define I965_SURFACEFORMAT_X24_TYPELESS_G8_UINT           0x0DA 
+#define I965_SURFACEFORMAT_L16A16_UNORM                   0x0DF 
+#define I965_SURFACEFORMAT_I24X8_UNORM                    0x0E0 
+#define I965_SURFACEFORMAT_L24X8_UNORM                    0x0E1 
+#define I965_SURFACEFORMAT_A24X8_UNORM                    0x0E2 
+#define I965_SURFACEFORMAT_I32_FLOAT                      0x0E3 
+#define I965_SURFACEFORMAT_L32_FLOAT                      0x0E4 
+#define I965_SURFACEFORMAT_A32_FLOAT                      0x0E5 
+#define I965_SURFACEFORMAT_B8G8R8X8_UNORM                 0x0E9 
+#define I965_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB            0x0EA 
+#define I965_SURFACEFORMAT_R8G8B8X8_UNORM                 0x0EB 
+#define I965_SURFACEFORMAT_R8G8B8X8_UNORM_SRGB            0x0EC 
+#define I965_SURFACEFORMAT_R9G9B9E5_SHAREDEXP             0x0ED 
+#define I965_SURFACEFORMAT_B10G10R10X2_UNORM              0x0EE 
+#define I965_SURFACEFORMAT_L16A16_FLOAT                   0x0F0 
+#define I965_SURFACEFORMAT_R32_UNORM                      0x0F1 
+#define I965_SURFACEFORMAT_R32_SNORM                      0x0F2 
+#define I965_SURFACEFORMAT_R10G10B10X2_USCALED            0x0F3
+#define I965_SURFACEFORMAT_R8G8B8A8_SSCALED               0x0F4
+#define I965_SURFACEFORMAT_R8G8B8A8_USCALED               0x0F5
+#define I965_SURFACEFORMAT_R16G16_SSCALED                 0x0F6
+#define I965_SURFACEFORMAT_R16G16_USCALED                 0x0F7
+#define I965_SURFACEFORMAT_R32_SSCALED                    0x0F8
+#define I965_SURFACEFORMAT_R32_USCALED                    0x0F9
+#define I965_SURFACEFORMAT_B5G6R5_UNORM                   0x100 
+#define I965_SURFACEFORMAT_B5G6R5_UNORM_SRGB              0x101 
+#define I965_SURFACEFORMAT_B5G5R5A1_UNORM                 0x102 
+#define I965_SURFACEFORMAT_B5G5R5A1_UNORM_SRGB            0x103 
+#define I965_SURFACEFORMAT_B4G4R4A4_UNORM                 0x104 
+#define I965_SURFACEFORMAT_B4G4R4A4_UNORM_SRGB            0x105 
+#define I965_SURFACEFORMAT_R8G8_UNORM                     0x106 
+#define I965_SURFACEFORMAT_R8G8_SNORM                     0x107 
+#define I965_SURFACEFORMAT_R8G8_SINT                      0x108 
+#define I965_SURFACEFORMAT_R8G8_UINT                      0x109 
+#define I965_SURFACEFORMAT_R16_UNORM                      0x10A 
+#define I965_SURFACEFORMAT_R16_SNORM                      0x10B 
+#define I965_SURFACEFORMAT_R16_SINT                       0x10C 
+#define I965_SURFACEFORMAT_R16_UINT                       0x10D 
+#define I965_SURFACEFORMAT_R16_FLOAT                      0x10E 
+#define I965_SURFACEFORMAT_I16_UNORM                      0x111 
+#define I965_SURFACEFORMAT_L16_UNORM                      0x112 
+#define I965_SURFACEFORMAT_A16_UNORM                      0x113 
+#define I965_SURFACEFORMAT_L8A8_UNORM                     0x114 
+#define I965_SURFACEFORMAT_I16_FLOAT                      0x115
+#define I965_SURFACEFORMAT_L16_FLOAT                      0x116
+#define I965_SURFACEFORMAT_A16_FLOAT                      0x117 
+#define I965_SURFACEFORMAT_R5G5_SNORM_B6_UNORM            0x119 
+#define I965_SURFACEFORMAT_B5G5R5X1_UNORM                 0x11A 
+#define I965_SURFACEFORMAT_B5G5R5X1_UNORM_SRGB            0x11B
+#define I965_SURFACEFORMAT_R8G8_SSCALED                   0x11C
+#define I965_SURFACEFORMAT_R8G8_USCALED                   0x11D
+#define I965_SURFACEFORMAT_R16_SSCALED                    0x11E
+#define I965_SURFACEFORMAT_R16_USCALED                    0x11F
+#define I965_SURFACEFORMAT_R8_UNORM                       0x140 
+#define I965_SURFACEFORMAT_R8_SNORM                       0x141 
+#define I965_SURFACEFORMAT_R8_SINT                        0x142 
+#define I965_SURFACEFORMAT_R8_UINT                        0x143 
+#define I965_SURFACEFORMAT_A8_UNORM                       0x144 
+#define I965_SURFACEFORMAT_I8_UNORM                       0x145 
+#define I965_SURFACEFORMAT_L8_UNORM                       0x146 
+#define I965_SURFACEFORMAT_P4A4_UNORM                     0x147 
+#define I965_SURFACEFORMAT_A4P4_UNORM                     0x148
+#define I965_SURFACEFORMAT_R8_SSCALED                     0x149
+#define I965_SURFACEFORMAT_R8_USCALED                     0x14A
+#define I965_SURFACEFORMAT_R1_UINT                        0x181 
+#define I965_SURFACEFORMAT_YCRCB_NORMAL                   0x182 
+#define I965_SURFACEFORMAT_YCRCB_SWAPUVY                  0x183 
+#define I965_SURFACEFORMAT_BC1_UNORM                      0x186 
+#define I965_SURFACEFORMAT_BC2_UNORM                      0x187 
+#define I965_SURFACEFORMAT_BC3_UNORM                      0x188 
+#define I965_SURFACEFORMAT_BC4_UNORM                      0x189 
+#define I965_SURFACEFORMAT_BC5_UNORM                      0x18A 
+#define I965_SURFACEFORMAT_BC1_UNORM_SRGB                 0x18B 
+#define I965_SURFACEFORMAT_BC2_UNORM_SRGB                 0x18C 
+#define I965_SURFACEFORMAT_BC3_UNORM_SRGB                 0x18D 
+#define I965_SURFACEFORMAT_MONO8                          0x18E 
+#define I965_SURFACEFORMAT_YCRCB_SWAPUV                   0x18F 
+#define I965_SURFACEFORMAT_YCRCB_SWAPY                    0x190 
+#define I965_SURFACEFORMAT_DXT1_RGB                       0x191 
+#define I965_SURFACEFORMAT_FXT1                           0x192 
+#define I965_SURFACEFORMAT_R8G8B8_UNORM                   0x193 
+#define I965_SURFACEFORMAT_R8G8B8_SNORM                   0x194 
+#define I965_SURFACEFORMAT_R8G8B8_SSCALED                 0x195 
+#define I965_SURFACEFORMAT_R8G8B8_USCALED                 0x196 
+#define I965_SURFACEFORMAT_R64G64B64A64_FLOAT             0x197 
+#define I965_SURFACEFORMAT_R64G64B64_FLOAT                0x198 
+#define I965_SURFACEFORMAT_BC4_SNORM                      0x199 
+#define I965_SURFACEFORMAT_BC5_SNORM                      0x19A 
+#define I965_SURFACEFORMAT_R16G16B16_UNORM                0x19C 
+#define I965_SURFACEFORMAT_R16G16B16_SNORM                0x19D 
+#define I965_SURFACEFORMAT_R16G16B16_SSCALED              0x19E 
+#define I965_SURFACEFORMAT_R16G16B16_USCALED              0x19F
+#define I965_SURFACEFORMAT_RAW                            0x1FF
+
+#define I965_MAPFILTER_NEAREST        0x0 
+#define I965_MAPFILTER_LINEAR         0x1 
+#define I965_MAPFILTER_ANISOTROPIC    0x2
+
+#define I965_MIPFILTER_NONE        0
+#define I965_MIPFILTER_NEAREST     1
+#define I965_MIPFILTER_LINEAR      3
+
+#define I965_TEXCOORDMODE_WRAP            0
+#define I965_TEXCOORDMODE_MIRROR          1
+#define I965_TEXCOORDMODE_CLAMP           2
+#define I965_TEXCOORDMODE_CUBE            3
+#define I965_TEXCOORDMODE_CLAMP_BORDER    4
+#define I965_TEXCOORDMODE_MIRROR_ONCE     5
+
+#define I965_SURFACERETURNFORMAT_FLOAT32  0
+#define I965_SURFACERETURNFORMAT_S1       1
+
+#define I965_TILEWALK_XMAJOR                 0
+#define I965_TILEWALK_YMAJOR                 1
+
+#define I965_SURCHAN_SELECT_ZERO             0
+#define I965_SURCHAN_SELECT_ONE              1
+#define I965_SURCHAN_SELECT_RED              4
+#define I965_SURCHAN_SELECT_GREEN            5
+#define I965_SURCHAN_SELECT_BLUE             6
+#define I965_SURCHAN_SELECT_ALPHA            7
+
+#define URB_SIZE(intel)         (IS_IGDNG(intel->device_id) ? 1024 : \
+                                 IS_G4X(intel->device_id) ? 384 : 256)
+
+// L3 cache configuration register offsets
+#define GEN7_L3_SQC_REG1_ADDRESS_OFFSET          (0XB010)
+#define GEN7_L3_CNTL_REG2_ADDRESS_OFFSET         (0xB020)
+#define GEN7_L3_CNTL_REG3_ADDRESS_OFFSET         (0xB024)
+
+// To issue pipe controls (reset L3 / SLM or stall)
+#define GEN7_PIPE_CONTROL_MEDIA 0x2
+#define GEN7_PIPE_CONTROL_3D 0x3
+#define GEN7_PIPE_CONTROL_INSTRUCTION_GFX 0x3
+#define GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL 0x2
+#define GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL 0x0
+#define GEN7_PIPE_CONTROL_WRITE_TIMESTAMP        (3 << 14)
+#define GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE       (1 << 2)
+
+
+#define GEN_MAPFILTER_NEAREST        0x0
+#define GEN_MAPFILTER_LINEAR         0x1
+#define GEN_MAPFILTER_ANISOTROPIC    0x2
+
+#define GEN_MIPFILTER_NONE        0
+#define GEN_MIPFILTER_NEAREST     1
+#define GEN_MIPFILTER_LINEAR      3
+
+#define GEN_ADDRESS_ROUNDING_ENABLE_U_MAG	0x20
+#define GEN_ADDRESS_ROUNDING_ENABLE_U_MIN	0x10
+#define GEN_ADDRESS_ROUNDING_ENABLE_V_MAG	0x08
+#define GEN_ADDRESS_ROUNDING_ENABLE_V_MIN	0x04
+#define GEN_ADDRESS_ROUNDING_ENABLE_R_MAG	0x02
+#define GEN_ADDRESS_ROUNDING_ENABLE_R_MIN	0x01
+
+#define GEN_TEXCOORDMODE_WRAP            0
+#define GEN_TEXCOORDMODE_MIRROR          1
+#define GEN_TEXCOORDMODE_CLAMP           2
+#define GEN_TEXCOORDMODE_CUBE            3
+#define GEN_TEXCOORDMODE_CLAMP_BORDER    4
+#define GEN_TEXCOORDMODE_MIRROR_ONCE     5
+
+#endif /* __GENX_DEFINES_H__ */
+
diff --git a/src/intel/intel_dri_resource_sharing.c b/src/intel/intel_dri_resource_sharing.c
new file mode 100644
index 0000000..188c1fa
--- /dev/null
+++ b/src/intel/intel_dri_resource_sharing.c
@@ -0,0 +1,208 @@
+/**************************************************************************
+ *
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#define HAVE_PTHREAD 1
+#include <errno.h>
+#include <time.h>
+#include "main/context.h"
+#include "main/renderbuffer.h"
+#include "main/texobj.h"
+#include <stdbool.h>
+#include <string.h>
+#include <drm.h>
+#include <i915_drm.h>
+#include <intel_bufmgr.h>
+#include <GL/internal/dri_interface.h>
+#include "intel_mipmap_tree.h"
+#include "intel_regions.h"
+#include "intel_context.h"
+
+#include "intel_dri_resource_sharing.h"
+#include "intel_dri_resource_sharing_int.h"
+
+#include <dlfcn.h>
+/**
+ * Sets up a DRIImage structure to point to our shared image in a region
+ */
+static bool
+intel_setup_cl_region_from_mipmap_tree(void *driver,
+                                       struct intel_context *intel,
+                                       struct intel_mipmap_tree *mt,
+                                       GLuint level, GLuint zoffset,
+                                       struct _intel_dri_share_image_region *region)
+{
+   unsigned int draw_x, draw_y;
+   uint32_t mask_x, mask_y;
+   struct intel_region *null_region = (struct intel_region *)NULL;
+
+   intel_miptree_check_level_layer(mt, level, zoffset);
+
+   _intel_region_get_tile_masks(mt->region, &mask_x, &mask_y, false);
+   _intel_miptree_get_image_offset(mt, level, zoffset, &draw_x, &draw_y);
+
+   region->w = mt->level[level].width;
+   region->h = mt->level[level].height;
+   region->tile_x = draw_x & mask_x;
+   region->tile_y = draw_y & mask_y;
+   region->tiling = mt->region->tiling;
+   /* XXX hard-coded to 1 right now. */
+   region->depth = 1;
+   region->row_pitch = mt->region->pitch;
+
+   region->offset = _intel_region_get_aligned_offset(mt->region,
+                                                     draw_x & ~mask_x,
+                                                     draw_y & ~mask_y,
+                                                     false);
+   if (!_intel_region_flink(mt->region, &region->name))
+      return false;
+   _intel_region_reference(&null_region, mt->region);
+   return true;
+}
+
+typedef void
+_mesa_test_texobj_completeness_t( const struct gl_context *ctx,
+                                struct gl_texture_object *t );
+_mesa_test_texobj_completeness_t *__mesa_test_texobj_completeness;
+
+typedef struct gl_texture_object *
+_mesa_lookup_texture_t( const struct gl_context *ctx, GLuint id);
+_mesa_lookup_texture_t *__mesa_lookup_texture;
+
+static struct gl_texture_object *
+intel_get_gl_obj_from_texture(void *driver,
+                              struct intel_context *intel,
+                              GLenum target, GLint level,
+                              GLuint texture, GLuint face)
+{
+   struct gl_texture_object *obj;
+   __mesa_lookup_texture = dlsym(driver, "_mesa_lookup_texture");
+   obj = __mesa_lookup_texture(&intel->ctx, texture);
+   if (!obj || obj->Target != target) {
+      return NULL;
+   }
+
+   __mesa_test_texobj_completeness = dlsym(driver, "_mesa_test_texobj_completeness");
+   __mesa_test_texobj_completeness(&intel->ctx, obj);
+   if (!obj->_BaseComplete || (level > 0 && !obj->_MipmapComplete)) {
+      return NULL;
+   }
+
+   if (level < obj->BaseLevel || level > obj->_MaxLevel) {
+      return NULL;
+   }
+
+   return obj;
+}
+
+static GLenum
+get_cl_gl_format(mesa_format format)
+{
+   switch (format) {
+   case MESA_FORMAT_R8G8B8A8_UNORM:
+      return GL_RGBA;
+   case MESA_FORMAT_A8R8G8B8_UNORM:
+      return GL_BGRA;
+   default:
+      return GL_BGRA;
+  }
+}
+
+static bool
+intelAcquireTexture(void *driver, __DRIcontext *context, GLenum target,
+                    GLint level, GLuint texture, void *user_data)
+{
+   struct _intel_dri_share_image_region *region = intel_dri_share_image_region(user_data);
+   struct intel_context *intel = context->driverPrivate;
+   struct gl_texture_object *obj;
+   struct intel_texture_object *iobj;
+   /* XXX Is it always face 0? */
+   GLuint face = 0;
+
+   obj = intel_get_gl_obj_from_texture(driver, intel, target, level, texture, face);
+   if (obj == NULL)
+     return false;
+   iobj = intel_texture_object(obj);
+   region->gl_format = get_cl_gl_format(obj->Image[face][level]->TexFormat);
+   return intel_setup_cl_region_from_mipmap_tree(driver, intel, iobj->mt, level, 0, region);
+}
+
+static bool
+intelReleaseTexture(void *driver, __DRIcontext *context, GLenum target,
+                    GLint level, GLuint texture)
+{
+   struct intel_context *intel = context->driverPrivate;
+   struct gl_texture_object *obj;
+   struct intel_texture_object *iobj;
+   /* XXX Is it always face 0? */
+   GLuint face = 0;
+
+   obj = intel_get_gl_obj_from_texture(driver, intel, target, level, texture, face);
+   if (obj == NULL)
+     return false;
+
+   iobj = intel_texture_object(obj);
+   _intel_region_release(&iobj->mt->region);
+   return true;
+}
+
+static bool
+intelAcquireBufferObj(void *driver, __DRIcontext *driContextPriv,
+                      GLuint bufobj, void *user_data)
+{
+  return false;
+}
+
+static bool
+intelReleaseBufferObj(void *driver, __DRIcontext *driContextPriv, GLuint bufobj)
+{
+  return false;
+}
+
+static bool
+intelAcquireRenderBuffer(void *driver, __DRIcontext *driContextPriv,
+                         GLuint bufobj, void *user_data)
+{
+  return false;
+}
+
+static bool
+intelReleaseRenderBuffer(void *driver, __DRIcontext *driContextPriv, GLuint bufobj)
+{
+  return false;
+}
+
+#include "cl_driver.h"
+void
+intel_set_cl_gl_callbacks(void)
+{
+  cl_gl_acquire_texture = (cl_gl_acquire_texture_cb*)intelAcquireTexture;
+  cl_gl_release_texture = (cl_gl_release_texture_cb*)intelReleaseTexture;
+  cl_gl_acquire_buffer_object = (cl_gl_acquire_buffer_object_cb*)intelAcquireBufferObj;
+  cl_gl_release_buffer_object = (cl_gl_release_buffer_object_cb*)intelReleaseBufferObj;
+  cl_gl_acquire_render_buffer = (cl_gl_acquire_render_buffer_cb*)intelAcquireRenderBuffer;
+  cl_gl_release_render_buffer = (cl_gl_release_render_buffer_cb*)intelReleaseRenderBuffer;
+}
diff --git a/src/intel/intel_dri_resource_sharing.h b/src/intel/intel_dri_resource_sharing.h
new file mode 100644
index 0000000..6d2ce4d
--- /dev/null
+++ b/src/intel/intel_dri_resource_sharing.h
@@ -0,0 +1,39 @@
+#ifndef __INTEL_DRI_RESOURCE_SHARING_H__
+#define __INTEL_DRI_RESOURCE_SHARING_H__
+
+struct _intel_dri_share_image_region {
+  unsigned int name;
+  size_t w;
+  size_t h;
+  size_t depth;
+  size_t pitch;
+  int tiling;
+  size_t offset;
+  size_t tile_x;
+  size_t tile_y;
+  unsigned int gl_format;
+  size_t row_pitch, slice_pitch;
+};
+
+struct _intel_dri_share_buffer_object {
+  unsigned int name;
+  size_t sz;
+  size_t offset;
+};
+
+inline static struct _intel_dri_share_image_region *
+intel_dri_share_image_region(void *user_data)
+{
+   return (struct _intel_dri_share_image_region *)user_data;
+}
+
+inline static struct _intel_dri_share_buffer_object *
+intel_dri_share_buffer_object(void *user_data)
+{
+   return (struct _intel_dri_share_buffer_object *)user_data;
+}
+
+extern void intel_set_cl_gl_callbacks(void);
+
+
+#endif
diff --git a/src/intel/intel_dri_resource_sharing_int.h b/src/intel/intel_dri_resource_sharing_int.h
new file mode 100644
index 0000000..c7b283a
--- /dev/null
+++ b/src/intel/intel_dri_resource_sharing_int.h
@@ -0,0 +1,143 @@
+/*****************************************************************
+ * The following functions are copied from the i965 driver, commit
+ * id 292368570a13501dfa95b1b0dd70966caf6ffc6b. They need to be kept
+ * consistent with the dri driver installed on the current system.
+ *****************************************************************/
+static bool
+_intel_region_flink(struct intel_region *region, uint32_t *name)
+{
+   if (region->name == 0) {
+      if (drm_intel_bo_flink(region->bo, &region->name))
+         return false;
+   }
+
+   *name = region->name;
+
+   return true;
+}
+
+#define _DBG(...)
+static void
+_intel_region_release(struct intel_region **region_handle)
+{
+   struct intel_region *region = *region_handle;
+
+   if (region == NULL) {
+      _DBG("%s NULL\n", __FUNCTION__);
+      return;
+   }
+
+   _DBG("%s %p %d\n", __FUNCTION__, region, region->refcount - 1);
+
+   ASSERT(region->refcount > 0);
+   region->refcount--;
+
+   if (region->refcount == 0) {
+      drm_intel_bo_unreference(region->bo);
+
+      free(region);
+   }
+   *region_handle = NULL;
+}
+
+static void
+_intel_region_reference(struct intel_region **dst, struct intel_region *src)
+{
+   _DBG("%s: %p(%d) -> %p(%d)\n", __FUNCTION__,
+        *dst, *dst ? (*dst)->refcount : 0, src, src ? src->refcount : 0);
+
+   if (src != *dst) {
+      if (*dst)
+         _intel_region_release(dst);
+
+      if (src)
+         src->refcount++;
+      *dst = src;
+   }
+}
+
+/**
+ * This function computes masks that may be used to select the bits of the X
+ * and Y coordinates that indicate the offset within a tile.  If the region is
+ * untiled, the masks are set to 0.
+ */
+static void
+_intel_region_get_tile_masks(struct intel_region *region,
+                             uint32_t *mask_x, uint32_t *mask_y,
+                             bool map_stencil_as_y_tiled)
+{
+   int cpp = region->cpp;
+   uint32_t tiling = region->tiling;
+
+   if (map_stencil_as_y_tiled)
+      tiling = I915_TILING_Y;
+
+   switch (tiling) {
+   default:
+      assert(false);
+   case I915_TILING_NONE:
+      *mask_x = *mask_y = 0;
+      break;
+   case I915_TILING_X:
+      *mask_x = 512 / cpp - 1;
+      *mask_y = 7;
+      break;
+   case I915_TILING_Y:
+      *mask_x = 128 / cpp - 1;
+      *mask_y = 31;
+      break;
+   }
+}
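+
+/* Worked example: an X-tiled region with cpp == 4 yields mask_x == 127 and
+ * mask_y == 7, i.e. one tile spans 512 bytes (128 pixels) in X and 8 rows
+ * in Y. */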
+
+/**
+ * Compute the offset (in bytes) from the start of the region to the given x
+ * and y coordinate.  For tiled regions, caller must ensure that x and y are
+ * multiples of the tile size.
+ */
+static uint32_t
+_intel_region_get_aligned_offset(struct intel_region *region, uint32_t x,
+                                 uint32_t y, bool map_stencil_as_y_tiled)
+{
+   int cpp = region->cpp;
+   uint32_t pitch = region->pitch;
+   uint32_t tiling = region->tiling;
+
+   if (map_stencil_as_y_tiled) {
+      tiling = I915_TILING_Y;
+
+      /* When mapping a W-tiled stencil buffer as Y-tiled, each 64-high W-tile
+       * gets transformed into a 32-high Y-tile.  Accordingly, the pitch of
+       * the resulting region is twice the pitch of the original region, since
+       * each row in the Y-tiled view corresponds to two rows in the actual
+       * W-tiled surface.  So we need to correct the pitch before computing
+       * the offsets.
+       */
+      pitch *= 2;
+   }
+
+   switch (tiling) {
+   default:
+      assert(false);
+   case I915_TILING_NONE:
+      return y * pitch + x * cpp;
+   case I915_TILING_X:
+      assert((x % (512 / cpp)) == 0);
+      assert((y % 8) == 0);
+      return y * pitch + x / (512 / cpp) * 4096;
+   case I915_TILING_Y:
+      assert((x % (128 / cpp)) == 0);
+      assert((y % 32) == 0);
+      return y * pitch + x / (128 / cpp) * 4096;
+   }
+}
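+
+/* Worked example: with X tiling, cpp == 4 and pitch == 4096, the aligned
+ * point (x == 128, y == 8) maps to 8 * 4096 + (128 / 128) * 4096 == 36864
+ * bytes from the start of the region. */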
+
+static void
+_intel_miptree_get_image_offset(struct intel_mipmap_tree *mt,
+                                GLuint level, GLuint slice,
+                                GLuint *x, GLuint *y)
+{
+   assert(slice < mt->level[level].depth);
+
+   *x = mt->level[level].slice[slice].x_offset;
+   *y = mt->level[level].slice[slice].y_offset;
+}
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
new file mode 100644
index 0000000..deb83c8
--- /dev/null
+++ b/src/intel/intel_driver.c
@@ -0,0 +1,744 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ * Copyright 2009 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Xiang Haihao <haihao.xiang at intel.com>
+ *    Zou Nan hai <nanhai.zou at intel.com>
+ *
+ */
+
+#if defined(HAS_EGL)
+#include "GL/gl.h"
+#include "EGL/egl.h"
+#include "x11/mesa_egl_extension.h"
+#endif
+
+#ifdef HAS_X11
+#include <X11/Xlibint.h>
+#include "x11/dricommon.h"
+#endif
+
+#include "intel_driver.h"
+#include "intel_gpgpu.h"
+#include "intel_batchbuffer.h"
+#include "intel_bufmgr.h"
+#include "cl_mem.h"
+
+#include <assert.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <xf86drm.h>
+#include <stdio.h>
+
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_context.h"
+#include "cl_driver.h"
+#include "cl_device_id.h"
+#include "cl_platform_id.h"
+
+#define SET_BLOCKED_SIGSET(DRIVER)   do {                     \
+  sigset_t bl_mask;                                           \
+  sigfillset(&bl_mask);                                       \
+  sigdelset(&bl_mask, SIGFPE);                                \
+  sigdelset(&bl_mask, SIGILL);                                \
+  sigdelset(&bl_mask, SIGSEGV);                               \
+  sigdelset(&bl_mask, SIGBUS);                                \
+  sigdelset(&bl_mask, SIGKILL);                               \
+  pthread_sigmask(SIG_SETMASK, &bl_mask, &(DRIVER)->sa_mask); \
+} while (0)
+
+#define RESTORE_BLOCKED_SIGSET(DRIVER) do {                   \
+  pthread_sigmask(SIG_SETMASK, &(DRIVER)->sa_mask, NULL);     \
+} while (0)
+
+#define PPTHREAD_MUTEX_LOCK(DRIVER) do {                      \
+  SET_BLOCKED_SIGSET(DRIVER);                                 \
+  pthread_mutex_lock(&(DRIVER)->ctxmutex);                    \
+} while (0)
+
+#define PPTHREAD_MUTEX_UNLOCK(DRIVER) do {                    \
+  pthread_mutex_unlock(&(DRIVER)->ctxmutex);                  \
+  RESTORE_BLOCKED_SIGSET(DRIVER);                             \
+} while (0)
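+
+/* Typical usage, as in intel_driver_lock_hardware()/_unlock_hardware() below:
+ *   PPTHREAD_MUTEX_LOCK(driver);
+ *   ... touch shared driver state ...
+ *   PPTHREAD_MUTEX_UNLOCK(driver);
+ * All signals except FPE/ILL/SEGV/BUS/KILL are blocked while the context
+ * mutex is held, and the previous signal mask is restored on unlock.
+ */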
+
+static void
+intel_driver_delete(intel_driver_t *driver)
+{
+  if (driver == NULL)
+    return;
+
+  if (driver->bufmgr)
+    drm_intel_bufmgr_destroy(driver->bufmgr);
+  cl_free(driver);
+}
+
+static intel_driver_t*
+intel_driver_new(void)
+{
+  intel_driver_t *driver = NULL;
+
+  TRY_ALLOC_NO_ERR (driver, CALLOC(intel_driver_t));
+  driver->fd = -1;
+
+exit:
+  return driver;
+error:
+  intel_driver_delete(driver);
+  driver = NULL;
+  goto exit;
+}
+
+/* just used for maximum relocation number in drm_intel */
+#define BATCH_SIZE 0x4000
+
+static void
+intel_driver_memman_init(intel_driver_t *driver)
+{
+  driver->bufmgr = drm_intel_bufmgr_gem_init(driver->fd, BATCH_SIZE);
+  assert(driver->bufmgr);
+  //drm_intel_bufmgr_gem_set_aub_dump(driver->bufmgr, 1);
+  drm_intel_bufmgr_gem_enable_reuse(driver->bufmgr);
+}
+
+static void
+intel_driver_context_init(intel_driver_t *driver)
+{
+  driver->ctx = drm_intel_gem_context_create(driver->bufmgr);
+  assert(driver->ctx);
+}
+
+static void
+intel_driver_context_destroy(intel_driver_t *driver)
+{
+  if(driver->ctx)
+    drm_intel_gem_context_destroy(driver->ctx);
+  driver->ctx = NULL;
+}
+
+static void 
+intel_driver_init(intel_driver_t *driver, int dev_fd)
+{
+  driver->fd = dev_fd;
+  driver->locked = 0;
+  pthread_mutex_init(&driver->ctxmutex, NULL);
+#ifndef NDEBUG
+  int res =
+#endif /* NDEBUG */
+  intel_driver_get_param(driver, I915_PARAM_CHIPSET_ID, &driver->device_id);
+  assert(res);
+  intel_driver_memman_init(driver);
+  intel_driver_context_init(driver);
+
+#if EMULATE_GEN
+  driver->gen_ver = EMULATE_GEN;
+  if (EMULATE_GEN == 75)
+    driver->device_id = PCI_CHIP_HASWELL_L;       /* we pick L for HSW */
+  else if (EMULATE_GEN == 7)
+    driver->device_id = PCI_CHIP_IVYBRIDGE_GT2; /* we pick GT2 for IVB */
+  else if (EMULATE_GEN == 6)
+    driver->device_id = PCI_CHIP_SANDYBRIDGE_GT2; /* we pick GT2 for SNB */
+  else
+    FATAL ("Unsupported Gen for emulation");
+#else
+  if (IS_GEN75(driver->device_id))
+    driver->gen_ver = 75;
+  else if (IS_GEN7(driver->device_id))
+    driver->gen_ver = 7;
+  else if (IS_GEN6(driver->device_id))
+    driver->gen_ver = 6;
+  else if(IS_IGDNG(driver->device_id))
+    driver->gen_ver = 5;
+  else
+    driver->gen_ver = 4;
+#endif /* EMULATE_GEN */
+}
+
+static cl_int
+intel_driver_open(intel_driver_t *intel, cl_context_prop props)
+{
+  int cardi;
+#ifdef HAS_X11
+  char *driver_name;
+#endif
+  if (props != NULL
+      && props->gl_type != CL_GL_NOSHARE
+      && props->gl_type != CL_GL_GLX_DISPLAY
+      && props->gl_type != CL_GL_EGL_DISPLAY) {
+    fprintf(stderr, "Unsupported gl share type %d.\n", props->gl_type);
+    return CL_INVALID_OPERATION;
+  }
+
+#ifdef HAS_X11
+  intel->x11_display = XOpenDisplay(NULL);
+
+  if(intel->x11_display) {
+    if((intel->dri_ctx = getDRI2State(intel->x11_display,
+                                     DefaultScreen(intel->x11_display),
+                                     &driver_name))) {
+      intel_driver_init_shared(intel, intel->dri_ctx);
+      Xfree(driver_name);
+    }
+    else
+      fprintf(stderr, "X server found. dri2 connection failed! \n");
+  }
+#endif
+
+  if(!intel_driver_is_active(intel)) {
+    char card_name[20];
+    for(cardi = 0; cardi < 16; cardi++) {
+      sprintf(card_name, "/dev/dri/renderD%d", 128+cardi);
+      if(intel_driver_init_render(intel, card_name))
+        break;
+    }
+  }
+
+  if(!intel_driver_is_active(intel)) {
+    char card_name[20];
+    for(cardi = 0; cardi < 16; cardi++) {
+      sprintf(card_name, "/dev/dri/card%d", cardi);
+      if(intel_driver_init_master(intel, card_name))
+        break;
+    }
+  }
+
+  if(!intel_driver_is_active(intel)) {
+    fprintf(stderr, "Device open failed, aborting...\n");
+    return CL_DEVICE_NOT_FOUND;
+  }
+
+#ifdef HAS_EGL
+  if (props && props->gl_type == CL_GL_EGL_DISPLAY) {
+    assert(props->egl_display);
+  }
+#endif
+  return CL_SUCCESS;
+}
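+
+/* Device probing sketch: when no DRI2-capable X connection is available, the
+ * code above first tries the render nodes /dev/dri/renderD128 .. renderD143,
+ * then falls back to the legacy nodes /dev/dri/card0 .. card15, and returns
+ * CL_DEVICE_NOT_FOUND if none of them can be opened.
+ */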
+
+static void
+intel_driver_close(intel_driver_t *intel)
+{
+#ifdef HAS_X11
+  if(intel->dri_ctx) dri_state_release(intel->dri_ctx);
+  if(intel->x11_display) XCloseDisplay(intel->x11_display);
+#endif
+  if(intel->need_close) {
+    close(intel->fd);
+    intel->need_close = 0;
+  }
+  intel->dri_ctx = NULL;
+  intel->x11_display = NULL;
+  intel->fd = -1;
+}
+
+LOCAL int
+intel_driver_get_param(intel_driver_t *driver, int param, int *value)
+{
+  int ret;
+  struct drm_i915_getparam gp;
+
+  memset(&gp, 0, sizeof(struct drm_i915_getparam));
+  gp.param = param;
+  gp.value = value;
+
+  ret = drmCommandWriteRead(driver->fd, DRM_I915_GETPARAM, &gp, sizeof(gp));
+  return ret == 0;
+}
+
+LOCAL int
+intel_driver_is_active(intel_driver_t *driver) {
+  return driver->fd >= 0;
+}
+
+#ifdef HAS_X11
+LOCAL int 
+intel_driver_init_shared(intel_driver_t *driver, dri_state_t *state)
+{
+  assert(state);
+  if(state->driConnectedFlag != DRI2)
+    return 0;
+  intel_driver_init(driver, state->fd);
+  driver->need_close = 0;
+  return 1;
+}
+#endif
+
+LOCAL int
+intel_driver_init_master(intel_driver_t *driver, const char* dev_name)
+{
+  int dev_fd;
+
+  drm_client_t client;
+
+  // usually dev_name = "/dev/dri/card%d"
+  dev_fd = open(dev_name, O_RDWR);
+  if (dev_fd == -1) {
+    fprintf(stderr, "open(\"%s\", O_RDWR) failed: %s\n", dev_name, strerror(errno));
+    return 0;
+  }
+
+  // Check that we're authenticated
+  memset(&client, 0, sizeof(drm_client_t));
+  int ret = ioctl(dev_fd, DRM_IOCTL_GET_CLIENT, &client);
+  if (ret == -1) {
+    fprintf(stderr, "ioctl(dev_fd, DRM_IOCTL_GET_CLIENT, &client) failed: %s\n", strerror(errno));
+    close(dev_fd);
+    return 0;
+  }
+
+  if (!client.auth) {
+    fprintf(stderr, "%s not authenticated\n", dev_name);
+    close(dev_fd);
+    return 0;
+  }
+
+  intel_driver_init(driver, dev_fd);
+  driver->need_close = 1;
+
+  return 1;
+}
+
+LOCAL int
+intel_driver_init_render(intel_driver_t *driver, const char* dev_name)
+{
+  int dev_fd;
+
+  // usually dev_name = "/dev/dri/renderD%d"
+  dev_fd = open(dev_name, O_RDWR);
+  if (dev_fd == -1)
+    return 0;
+
+  intel_driver_init(driver, dev_fd);
+  driver->need_close = 1;
+
+  return 1;
+}
+
+LOCAL int 
+intel_driver_terminate(intel_driver_t *driver)
+{
+  pthread_mutex_destroy(&driver->ctxmutex);
+
+  if(driver->need_close) {
+    close(driver->fd);
+    driver->need_close = 0;
+  }
+  driver->fd = -1;
+  return 1;
+}
+
+LOCAL void
+intel_driver_lock_hardware(intel_driver_t *driver)
+{
+
+  PPTHREAD_MUTEX_LOCK(driver);
+  assert(!driver->locked);
+  driver->locked = 1;
+}
+
+LOCAL void 
+intel_driver_unlock_hardware(intel_driver_t *driver)
+{
+  driver->locked = 0;
+  PPTHREAD_MUTEX_UNLOCK(driver);
+}
+
+LOCAL dri_bo*
+intel_driver_share_buffer(intel_driver_t *driver, const char *sname, uint32_t name)
+{
+  dri_bo *bo = intel_bo_gem_create_from_name(driver->bufmgr,
+                                             sname,
+                                             name);
+  return bo;
+}
+
+LOCAL uint32_t
+intel_driver_shared_name(intel_driver_t *driver, dri_bo *bo)
+{
+  uint32_t name;
+  assert(bo);
+  dri_bo_flink(bo, &name);
+  return name;
+}
+
+/* XXX is a NULL props ok here? */
+static int
+intel_get_device_id(void)
+{
+  intel_driver_t *driver = NULL;
+  int intel_device_id;
+
+  driver = intel_driver_new();
+  assert(driver != NULL);
+  if(UNLIKELY(intel_driver_open(driver, NULL) != CL_SUCCESS)) return INVALID_CHIP_ID;
+  intel_device_id = driver->device_id;
+  intel_driver_context_destroy(driver);
+  intel_driver_close(driver);
+  intel_driver_terminate(driver);
+  intel_driver_delete(driver);
+
+  return intel_device_id;
+}
+
+static void
+cl_intel_driver_delete(intel_driver_t *driver)
+{
+  if (driver == NULL)
+    return;
+  intel_driver_context_destroy(driver);
+  intel_driver_close(driver);
+  intel_driver_terminate(driver);
+  intel_driver_delete(driver);
+}
+
+#include "cl_gbe_loader.h"
+static intel_driver_t*
+cl_intel_driver_new(cl_context_prop props)
+{
+  intel_driver_t *driver = NULL;
+  TRY_ALLOC_NO_ERR (driver, intel_driver_new());
+  if(UNLIKELY(intel_driver_open(driver, props) != CL_SUCCESS)) goto error;
+exit:
+  return driver;
+error:
+  cl_intel_driver_delete(driver);
+  driver = NULL;
+  goto exit;
+}
+
+static drm_intel_bufmgr*
+intel_driver_get_bufmgr(intel_driver_t *drv)
+{
+  return drv->bufmgr;
+}
+
+static uint32_t
+intel_driver_get_ver(struct intel_driver *drv)
+{
+  return drv->gen_ver;
+}
+
+static size_t drm_intel_bo_get_size(drm_intel_bo *bo) { return bo->size; }
+static void* drm_intel_bo_get_virtual(drm_intel_bo *bo) { return bo->virtual; }
+
+static int get_cl_tiling(uint32_t drm_tiling)
+{
+  switch(drm_tiling) {
+  case I915_TILING_X: return CL_TILE_X;
+  case I915_TILING_Y: return CL_TILE_Y;
+  case I915_TILING_NONE: return CL_NO_TILE;
+  default:
+    assert(0);
+  }
+  return CL_NO_TILE;
+}
+
+#if defined(HAS_EGL)
+#include "intel_dri_resource_sharing.h"
+#include "cl_image.h"
+static int cl_get_clformat_from_texture(GLint tex_format, cl_image_format * cl_format)
+{
+  cl_int ret = CL_SUCCESS;
+
+  switch (tex_format) {
+  case GL_RGBA8:
+  case GL_RGBA:
+  case GL_RGBA16:
+  case GL_RGBA8I:
+  case GL_RGBA16I:
+  case GL_RGBA32I:
+  case GL_RGBA8UI:
+  case GL_RGBA16UI:
+  case GL_RGBA32UI:
+  case GL_RGBA16F:
+  case GL_RGBA32F:
+    cl_format->image_channel_order = CL_RGBA;
+    break;
+  case GL_BGRA:
+    cl_format->image_channel_order = CL_BGRA;
+    break;
+  default:
+    ret = -1;
+    goto error;
+  }
+
+  switch (tex_format) {
+  case GL_RGBA8:
+  case GL_RGBA:
+  case GL_BGRA:
+    cl_format->image_channel_data_type = CL_UNORM_INT8;
+    break;
+  case GL_RGBA16:
+    cl_format->image_channel_data_type = CL_UNORM_INT16;
+    break;
+  case GL_RGBA8I:
+    cl_format->image_channel_data_type = CL_SIGNED_INT8;
+    break;
+  case GL_RGBA16I:
+    cl_format->image_channel_data_type = CL_SIGNED_INT16;
+    break;
+  case GL_RGBA32I:
+    cl_format->image_channel_data_type = CL_SIGNED_INT32;
+    break;
+  case GL_RGBA8UI:
+    cl_format->image_channel_data_type = CL_UNSIGNED_INT8;
+    break;
+  case GL_RGBA16UI:
+    cl_format->image_channel_data_type = CL_UNSIGNED_INT16;
+    break;
+  case GL_RGBA32UI:
+    cl_format->image_channel_data_type = CL_UNSIGNED_INT32;
+    break;
+  case GL_RGBA16F:
+    cl_format->image_channel_data_type = CL_HALF_FLOAT;
+    break;
+  case GL_RGBA32F:
+    cl_format->image_channel_data_type = CL_FLOAT;
+    break;
+  default:
+    ret = -1;
+    goto error;
+  }
+
+error:
+  return ret;
+}
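+
+/* Example of the mapping above: GL_RGBA16F yields image_channel_order =
+ * CL_RGBA with image_channel_data_type = CL_HALF_FLOAT, and GL_BGRA yields
+ * CL_BGRA with CL_UNORM_INT8; any other GL internal format returns -1.
+ */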
+
+static int
+get_mem_type_from_target(GLenum texture_target, cl_mem_object_type *type)
+{
+  switch(texture_target) {
+  case GL_TEXTURE_1D: *type = CL_MEM_OBJECT_IMAGE1D; break;
+  case GL_TEXTURE_2D: *type = CL_MEM_OBJECT_IMAGE2D; break;
+  case GL_TEXTURE_3D: *type = CL_MEM_OBJECT_IMAGE3D; break;
+  case GL_TEXTURE_1D_ARRAY: *type = CL_MEM_OBJECT_IMAGE1D_ARRAY; break;
+  case GL_TEXTURE_2D_ARRAY: *type = CL_MEM_OBJECT_IMAGE2D_ARRAY; break;
+  default:
+    return -1;
+  }
+  return CL_SUCCESS;
+}
+
+static cl_buffer
+intel_alloc_buffer_from_texture_egl(cl_context ctx, unsigned int target,
+                                    int miplevel, unsigned int texture,
+                                    struct _cl_mem_image *image)
+{
+  cl_buffer bo = (cl_buffer) NULL;
+  struct _intel_dri_share_image_region region;
+  unsigned int bpp, intel_fmt;
+  cl_image_format cl_format;
+  EGLBoolean ret;
+  EGLint attrib_list[] = { EGL_GL_TEXTURE_ID_MESA, texture,
+                           EGL_GL_TEXTURE_LEVEL_MESA, miplevel,
+                           EGL_GL_TEXTURE_TARGET_MESA, target,
+                           EGL_NONE};
+  ret = eglAcquireResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx),
+                               EGL_GL_TEXTURE_MESA,
+                               &attrib_list[0], &region);
+  if (!ret)
+      goto out;
+
+  bo = (cl_buffer)intel_driver_share_buffer((intel_driver_t *)ctx->drv, "rendering buffer", region.name);
+
+  if (bo == NULL) {
+    eglReleaseResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx), EGL_GL_TEXTURE_MESA, &attrib_list[0]);
+    goto out;
+  }
+  region.tiling = get_cl_tiling(region.tiling);
+  if (cl_get_clformat_from_texture(region.gl_format, &cl_format) != 0)
+    goto error;
+
+  if (cl_image_byte_per_pixel(&cl_format, &bpp) != CL_SUCCESS)
+    goto error;
+  intel_fmt = cl_image_get_intel_format(&cl_format);
+  if (intel_fmt == INTEL_UNSUPPORTED_FORMAT)
+    goto error;
+  cl_mem_object_type image_type;
+  if (get_mem_type_from_target(target, &image_type) != 0)
+    goto error;
+
+  cl_mem_image_init(image, region.w, region.h,
+                    image_type, region.depth, cl_format,
+                    intel_fmt, bpp, region.row_pitch,
+                    region.slice_pitch, region.tiling,
+                    region.tile_x, region.tile_y, region.offset);
+out:
+  return bo;
+
+error:
+  cl_buffer_unreference(bo);
+  eglReleaseResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx), EGL_GL_TEXTURE_MESA, &attrib_list[0]);
+  return NULL;
+}
+
+static cl_buffer
+intel_alloc_buffer_from_texture(cl_context ctx, unsigned int target,
+                                int miplevel, unsigned int texture,
+                                struct _cl_mem_image *image)
+{
+
+  if (IS_EGL_CONTEXT(ctx))
+    return intel_alloc_buffer_from_texture_egl(ctx, target, miplevel, texture, image);
+
+  return NULL;
+}
+
+static int
+intel_release_buffer_from_texture(cl_context ctx, unsigned int target,
+                                  int miplevel, unsigned int texture)
+{
+  if (IS_EGL_CONTEXT(ctx)) {
+    EGLint attrib_list[] = { EGL_GL_TEXTURE_ID_MESA, texture,
+                           EGL_GL_TEXTURE_LEVEL_MESA, miplevel,
+                           EGL_GL_TEXTURE_TARGET_MESA, target,
+                           EGL_NONE};
+
+    eglReleaseResourceMESA(EGL_DISP(ctx), EGL_CTX(ctx), EGL_GL_TEXTURE_MESA, &attrib_list[0]);
+    return CL_SUCCESS;
+  }
+  return -1;
+}
+#endif
+
+cl_buffer intel_share_buffer_from_libva(cl_context ctx,
+                                        unsigned int bo_name,
+                                        size_t *sz)
+{
+  drm_intel_bo *intel_bo;
+
+  intel_bo = intel_driver_share_buffer((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
+
+  if (sz)
+    *sz = intel_bo->size;
+
+  return (cl_buffer)intel_bo;
+}
+
+cl_buffer intel_share_image_from_libva(cl_context ctx,
+                                       unsigned int bo_name,
+                                       struct _cl_mem_image *image,
+                                       unsigned int offset)
+{
+  drm_intel_bo *intel_bo;
+  uint32_t intel_tiling, intel_swizzle_mode;
+
+  intel_bo = intel_driver_share_buffer((intel_driver_t *)ctx->drv, "shared from libva", bo_name);
+
+  intel_bo->offset += offset;
+  drm_intel_bo_get_tiling(intel_bo, &intel_tiling, &intel_swizzle_mode);
+  image->tiling = get_cl_tiling(intel_tiling);
+
+  return (cl_buffer)intel_bo;
+}
+
+static int32_t get_intel_tiling(cl_int tiling, uint32_t *intel_tiling)
+{
+  switch (tiling) {
+    case CL_NO_TILE:
+      *intel_tiling = I915_TILING_NONE;
+      break;
+    case CL_TILE_X:
+      *intel_tiling = I915_TILING_X;
+      break;
+    case CL_TILE_Y:
+      *intel_tiling = I915_TILING_Y;
+      break;
+    default:
+      assert(0);
+      return -1;
+  }
+  return 0;
+}
+
+static int intel_buffer_set_tiling(cl_buffer bo,
+                                   cl_image_tiling_t tiling, size_t stride)
+{
+  uint32_t intel_tiling;
+  int ret;
+  if (UNLIKELY((get_intel_tiling(tiling, &intel_tiling)) < 0))
+    return -1;
+#ifndef NDEBUG
+  uint32_t required_tiling;
+  required_tiling = intel_tiling;
+#endif
+  ret = drm_intel_bo_set_tiling((drm_intel_bo*)bo, &intel_tiling, stride);
+  assert(intel_tiling == required_tiling);
+  return ret;
+}
+
+LOCAL void
+intel_setup_callbacks(void)
+{
+  cl_driver_new = (cl_driver_new_cb *) cl_intel_driver_new;
+  cl_driver_delete = (cl_driver_delete_cb *) cl_intel_driver_delete;
+  cl_driver_get_ver = (cl_driver_get_ver_cb *) intel_driver_get_ver;
+  cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr;
+  cl_driver_get_device_id = (cl_driver_get_device_id_cb *) intel_get_device_id;
+  cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc;
+  cl_buffer_set_tiling = (cl_buffer_set_tiling_cb *) intel_buffer_set_tiling;
+#if defined(HAS_EGL)
+  cl_buffer_alloc_from_texture = (cl_buffer_alloc_from_texture_cb *) intel_alloc_buffer_from_texture;
+  cl_buffer_release_from_texture = (cl_buffer_release_from_texture_cb *) intel_release_buffer_from_texture;
+  intel_set_cl_gl_callbacks();
+#endif
+  cl_buffer_get_buffer_from_libva = (cl_buffer_get_buffer_from_libva_cb *) intel_share_buffer_from_libva;
+  cl_buffer_get_image_from_libva = (cl_buffer_get_image_from_libva_cb *) intel_share_image_from_libva;
+  cl_buffer_reference = (cl_buffer_reference_cb *) drm_intel_bo_reference;
+  cl_buffer_unreference = (cl_buffer_unreference_cb *) drm_intel_bo_unreference;
+  cl_buffer_map = (cl_buffer_map_cb *) drm_intel_bo_map;
+  cl_buffer_unmap = (cl_buffer_unmap_cb *) drm_intel_bo_unmap;
+  cl_buffer_map_gtt = (cl_buffer_map_gtt_cb *) drm_intel_gem_bo_map_gtt;
+  cl_buffer_unmap_gtt = (cl_buffer_unmap_gtt_cb *) drm_intel_gem_bo_unmap_gtt;
+  cl_buffer_map_gtt_unsync = (cl_buffer_map_gtt_unsync_cb *) drm_intel_gem_bo_map_unsynchronized;
+  cl_buffer_get_virtual = (cl_buffer_get_virtual_cb *) drm_intel_bo_get_virtual;
+  cl_buffer_get_size = (cl_buffer_get_size_cb *) drm_intel_bo_get_size;
+  cl_buffer_pin = (cl_buffer_pin_cb *) drm_intel_bo_pin;
+  cl_buffer_unpin = (cl_buffer_unpin_cb *) drm_intel_bo_unpin;
+  cl_buffer_subdata = (cl_buffer_subdata_cb *) drm_intel_bo_subdata;
+  cl_buffer_wait_rendering = (cl_buffer_wait_rendering_cb *) drm_intel_bo_wait_rendering;
+  cl_buffer_get_fd = (cl_buffer_get_fd_cb *) drm_intel_bo_gem_export_to_prime;
+  intel_set_gpgpu_callbacks(intel_get_device_id());
+}
diff --git a/src/intel/intel_driver.h b/src/intel/intel_driver.h
new file mode 100644
index 0000000..107fdfc
--- /dev/null
+++ b/src/intel/intel_driver.h
@@ -0,0 +1,125 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ * Copyright 2009 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#ifndef _INTEL_DRIVER_H_
+#define _INTEL_DRIVER_H_
+
+#include "cl_device_data.h"
+
+#include <stdint.h>
+#include <pthread.h>
+#include <signal.h>
+
+#include <xf86drm.h>
+#include <drm.h>
+#include <i915_drm.h>
+#include <intel_bufmgr.h>
+
+#define CMD_MI                                  (0x0 << 29)
+#define CMD_2D                                  (0x2 << 29)
+
+#define MI_NOOP                                 (CMD_MI | 0)
+#define MI_BATCH_BUFFER_END                     (CMD_MI | (0xA << 23))
+#define MI_FLUSH                                (CMD_MI | (0x4 << 23))
+#define STATE_INSTRUCTION_CACHE_INVALIDATE      (0x1 << 0)
+
+#define XY_COLOR_BLT_CMD                        (CMD_2D | (0x50 << 22) | 0x04)
+#define XY_COLOR_BLT_WRITE_ALPHA                (1 << 21)
+#define XY_COLOR_BLT_WRITE_RGB                  (1 << 20)
+#define XY_COLOR_BLT_DST_TILED                  (1 << 11)
+
+/* BR13 */
+#define BR13_565                                (0x1 << 24)
+#define BR13_8888                               (0x3 << 24)
+
+struct dri_state;
+typedef struct _XDisplay Display;
+
+typedef struct intel_driver
+{
+  dri_bufmgr *bufmgr;
+  drm_intel_context *ctx;
+  int fd;
+  int device_id;
+  int gen_ver;
+  sigset_t sa_mask;
+  pthread_mutex_t ctxmutex;
+  int locked;
+  int need_close;
+  Display *x11_display;
+  struct dri_state *dri_ctx;
+} intel_driver_t;
+
+/* device control */
+extern void intel_driver_lock_hardware(intel_driver_t*);
+extern void intel_driver_unlock_hardware(intel_driver_t*);
+
+/* methods working in shared mode */
+extern dri_bo* intel_driver_share_buffer(intel_driver_t*, const char *sname, uint32_t name);
+extern uint32_t intel_driver_shared_name(intel_driver_t*, dri_bo*);
+
+/* init driver shared with X using dri state, acquired from X Display */
+extern int intel_driver_init_shared(intel_driver_t*, struct dri_state*);
+
+/* init driver in master mode (when X is not using the card) 
+ * usually dev_name = "/dev/dri/card0"
+ */
+extern int intel_driver_init_master(intel_driver_t*, const char* dev_name);
+
+/* init driver for render node */
+extern int intel_driver_init_render(intel_driver_t*, const char* dev_name);
+
+/* terminate driver and all underlying structures */
+extern int intel_driver_terminate(intel_driver_t*);
+
+/* simple check if driver was initialized (checking fd should suffice) */
+extern int intel_driver_is_active(intel_driver_t*);
+
+/* query device parameters using driver ioctl */
+extern int intel_driver_get_param(intel_driver_t*, int param, int *value);
+
+/* init the call backs used by the ocl driver */
+extern void intel_setup_callbacks(void);
+
+#endif /* _INTEL_DRIVER_H_ */
+
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
new file mode 100644
index 0000000..c4b9156
--- /dev/null
+++ b/src/intel/intel_gpgpu.c
@@ -0,0 +1,1513 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ *         Alexei Soupikov <alexei.soupikov at intel.com>
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "intel/intel_gpgpu.h"
+#include "intel/intel_defines.h"
+#include "intel/intel_structs.h"
+#include "intel/intel_batchbuffer.h"
+#include "intel/intel_driver.h"
+#include "program.h" // for BTI_RESERVED_NUM
+
+#include "cl_alloc.h"
+#include "cl_utils.h"
+#include "cl_sampler.h"
+
+#ifndef CL_VERSION_1_2
+#define CL_MEM_OBJECT_IMAGE1D                       0x10F4
+#define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
+#define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
+#define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3
+#endif
+
+#define GEN_CMD_MEDIA_OBJECT  (0x71000000)
+#define MO_TS_BIT             (1 << 24)
+#define MO_RETAIN_BIT         (1 << 28)
+#define SAMPLER_STATE_SIZE    (16)
+
+#define TIMESTAMP_ADDR        0x2358
+
+/* Stores both binding tables and surface states */
+typedef struct surface_heap {
+  uint32_t binding_table[256];
+  char surface[256][sizeof(gen6_surface_state_t)];
+} surface_heap_t;
+
+typedef struct intel_event {
+  drm_intel_bo *buffer;
+  drm_intel_bo *ts_buf;
+  int status;
+} intel_event_t;
+
+#define MAX_IF_DESC    32
+
+/* We can bind only a limited number of buffers */
+enum { max_buf_n = 128 };
+
+enum { max_img_n = 128};
+
+enum {max_sampler_n = 16 };
+
+/* Handle GPGPU state */
+struct intel_gpgpu
+{
+  void* ker_opaque;
+  size_t global_wk_sz[3];
+  void* printf_info;
+  intel_driver_t *drv;
+  intel_batchbuffer_t *batch;
+  cl_gpgpu_kernel *ker;
+  drm_intel_bo *binded_buf[max_buf_n];  /* all buffers bound for the call */
+  uint32_t target_buf_offset[max_buf_n];/* internal offsets of the buffers bound for the call */
+  uint32_t binded_offset[max_buf_n];    /* their offsets in the curbe buffer */
+  uint32_t binded_n;                    /* number of buffers bound */
+
+  unsigned long img_bitmap;              /* image usage bitmap. */
+  unsigned int img_index_base;          /* base index for image surface.*/
+
+  unsigned long sampler_bitmap;          /* sampler usage bitmap. */
+
+  struct { drm_intel_bo *bo; } stack_b;
+  struct { drm_intel_bo *bo; } perf_b;
+  struct { drm_intel_bo *bo; } scratch_b;
+  struct { drm_intel_bo *bo; } constant_b;
+  struct { drm_intel_bo *bo; } time_stamp_b;  /* time stamp buffer */
+  struct { drm_intel_bo *bo;
+           drm_intel_bo *ibo;} printf_b;      /* the printf buf and index buf*/
+
+  struct { drm_intel_bo *bo; } aux_buf;
+  struct {
+    uint32_t surface_heap_offset;
+    uint32_t curbe_offset;
+    uint32_t idrt_offset;
+    uint32_t sampler_state_offset;
+    uint32_t sampler_border_color_state_offset;
+  } aux_offset;
+
+  uint32_t per_thread_scratch;
+  struct {
+    uint32_t num_cs_entries;
+    uint32_t size_cs_entry;  /* size of one entry in 512bit elements */
+  } curb;
+
+  uint32_t max_threads;      /* max threads requested by the user */
+};
+
+typedef struct intel_gpgpu intel_gpgpu_t;
+
+typedef void (intel_gpgpu_set_L3_t)(intel_gpgpu_t *gpgpu, uint32_t use_slm);
+intel_gpgpu_set_L3_t *intel_gpgpu_set_L3 = NULL;
+
+typedef uint32_t (intel_gpgpu_get_scratch_index_t)(uint32_t size);
+intel_gpgpu_get_scratch_index_t *intel_gpgpu_get_scratch_index = NULL;
+
+typedef void (intel_gpgpu_post_action_t)(intel_gpgpu_t *gpgpu, int32_t flush_mode);
+intel_gpgpu_post_action_t *intel_gpgpu_post_action = NULL;
+
+typedef uint64_t (intel_gpgpu_read_ts_reg_t)(drm_intel_bufmgr *bufmgr);
+intel_gpgpu_read_ts_reg_t *intel_gpgpu_read_ts_reg = NULL;
+
+static void
+intel_gpgpu_sync(void *buf)
+{
+  if (buf)
+    drm_intel_bo_wait_rendering((drm_intel_bo *)buf);
+}
+
+static void *intel_gpgpu_ref_batch_buf(intel_gpgpu_t *gpgpu)
+{
+  if (gpgpu->batch->last_bo)
+    drm_intel_bo_reference(gpgpu->batch->last_bo);
+
+  return gpgpu->batch->last_bo;
+}
+
+static void intel_gpgpu_unref_batch_buf(void *buf)
+{
+  if (buf)
+    drm_intel_bo_unreference((drm_intel_bo *)buf);
+}
+
+static void
+intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
+{
+  if (gpgpu == NULL)
+    return;
+  if(gpgpu->time_stamp_b.bo)
+    drm_intel_bo_unreference(gpgpu->time_stamp_b.bo);
+  if(gpgpu->printf_b.bo)
+    drm_intel_bo_unreference(gpgpu->printf_b.bo);
+  if(gpgpu->printf_b.ibo)
+    drm_intel_bo_unreference(gpgpu->printf_b.ibo);
+  if (gpgpu->aux_buf.bo)
+    drm_intel_bo_unreference(gpgpu->aux_buf.bo);
+  if (gpgpu->perf_b.bo)
+    drm_intel_bo_unreference(gpgpu->perf_b.bo);
+  if (gpgpu->stack_b.bo)
+    drm_intel_bo_unreference(gpgpu->stack_b.bo);
+  if (gpgpu->scratch_b.bo)
+    drm_intel_bo_unreference(gpgpu->scratch_b.bo);
+
+  if(gpgpu->constant_b.bo)
+    drm_intel_bo_unreference(gpgpu->constant_b.bo);
+
+  intel_batchbuffer_delete(gpgpu->batch);
+  cl_free(gpgpu);
+}
+
+static intel_gpgpu_t*
+intel_gpgpu_new(intel_driver_t *drv)
+{
+  intel_gpgpu_t *state = NULL;
+
+  TRY_ALLOC_NO_ERR (state, CALLOC(intel_gpgpu_t));
+  state->drv = drv;
+  state->batch = intel_batchbuffer_new(state->drv);
+  assert(state->batch);
+
+exit:
+  return state;
+error:
+  intel_gpgpu_delete(state);
+  state = NULL;
+  goto exit;
+}
+
+static void
+intel_gpgpu_select_pipeline(intel_gpgpu_t *gpgpu)
+{
+  BEGIN_BATCH(gpgpu->batch, 1);
+  OUT_BATCH(gpgpu->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
+  ADVANCE_BATCH(gpgpu->batch);
+}
+
+static uint32_t
+intel_gpgpu_get_cache_ctrl_gen7()
+{
+  return cc_llc_l3;
+}
+
+static uint32_t
+intel_gpgpu_get_cache_ctrl_gen75()
+{
+  return llccc_ec | l3cc_ec;
+}
+
+static void
+intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
+{
+  const uint32_t def_cc = cl_gpgpu_get_cache_ctrl(); /* default Cache Control value */
+  BEGIN_BATCH(gpgpu->batch, 10);
+  OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 8);
+  /* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */
+  OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | (def_cc << 4) | (0 << 3)| BASE_ADDRESS_MODIFY);    /* General State Base Addr   */
+  /* 0, State Mem Obj CC */
+  /* We use a state base address for the surface heap since IVB clamps the
+   * binding table pointer to 11 bits, so we cannot use absolute pointers while
+   * using the surface heap.
+   */
+  assert(gpgpu->aux_offset.surface_heap_offset % 4096 == 0);
+  OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo,
+            I915_GEM_DOMAIN_INSTRUCTION,
+            I915_GEM_DOMAIN_INSTRUCTION,
+            gpgpu->aux_offset.surface_heap_offset + (0 | (def_cc << 8) | (def_cc << 4) | (0 << 3)| BASE_ADDRESS_MODIFY));
+  OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Dynamic State Base Addr */
+  OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
+  OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr  */
+  /* If we output an AUB file, we limit the total size to 64MB */
+#if USE_FULSIM
+  OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* General State Access Upper Bound */
+  OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Dynamic State Access Upper Bound */
+  OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Indirect Obj Access Upper Bound */
+  OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound */
+#else
+  OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
+  /* According to the mesa i965 driver code, we must set the dynamic state access
+   * upper bound to a valid value, otherwise the border color pointer may be rejected
+   * and you may get an incorrect border color. This is a known hardware bug. */
+  OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+  OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
+  OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
+#endif /* USE_FULSIM */
+  ADVANCE_BATCH(gpgpu->batch);
+}
+
+uint32_t intel_gpgpu_get_scratch_index_gen7(uint32_t size) {
+  return size / 1024 - 1;
+}
+
+uint32_t intel_gpgpu_get_scratch_index_gen75(uint32_t size) {
+    size = size >> 11;                     /* express the size in 2KB units */
+    uint32_t not_pow2 = size & (size - 1); /* non-zero if not a power of two */
+    uint32_t index = 0;
+    while((size >>= 1) > 0)
+      index++;   /* index of the leading one bit */
+
+    /* round up for a non power of two size */
+    if(not_pow2) index++;
+    return index;
+}
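+
+/* Worked examples for the gen75 encoding above (size is first expressed in
+ * 2KB units, then index = ceil(log2(units))): 8KB -> 4 units -> index 2,
+ * 16KB -> 8 units -> index 3, and a non power of two size such as
+ * 12KB -> 6 units rounds up to index 3.  The gen7 encoding above is simply
+ * size / 1024 - 1.
+ */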
+
+static cl_int
+intel_gpgpu_get_max_curbe_size(uint32_t device_id)
+{
+  if (IS_BAYTRAIL_T(device_id) ||
+      IS_IVB_GT1(device_id))
+    return 992;
+  else
+    return 2016;
+}
+
+static cl_int
+intel_gpgpu_get_curbe_size(intel_gpgpu_t *gpgpu)
+{
+  int curbe_size = gpgpu->curb.size_cs_entry * gpgpu->curb.num_cs_entries;
+  int max_curbe_size = intel_gpgpu_get_max_curbe_size(gpgpu->drv->device_id);
+
+  if (curbe_size > max_curbe_size) {
+    fprintf(stderr, "warning, curbe size exceed limitation.\n");
+    return max_curbe_size;
+  } else
+    return curbe_size;
+}
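+
+/* For illustration, with the num_cs_entries = 64 set in
+ * intel_gpgpu_state_init() below: a size_cs_entry of 32 would give
+ * 64 * 32 = 2048 512-bit entries, which exceeds the 2016 limit on most parts
+ * (992 on Baytrail-T / IVB GT1) and is clamped with the warning above.
+ */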
+
+static void
+intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
+{
+  int32_t scratch_index;
+  BEGIN_BATCH(gpgpu->batch, 8);
+  OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
+
+  if(gpgpu->per_thread_scratch > 0) {
+    scratch_index = intel_gpgpu_get_scratch_index(gpgpu->per_thread_scratch);
+    OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo,
+              I915_GEM_DOMAIN_RENDER,
+              I915_GEM_DOMAIN_RENDER,
+              scratch_index);
+  }
+  else {
+    OUT_BATCH(gpgpu->batch, 0);
+  }
+  /* max_threads | urb entries | (reset_gateway | bypass_gateway | gpgpu_mode) */
+  OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->max_threads - 1) << 16) | (0 << 8) | 0xc4);
+  OUT_BATCH(gpgpu->batch, 0);
+  /* curbe_size */
+  OUT_BATCH(gpgpu->batch, intel_gpgpu_get_curbe_size(gpgpu));
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, 0);
+  ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+intel_gpgpu_load_curbe_buffer(intel_gpgpu_t *gpgpu)
+{
+  BEGIN_BATCH(gpgpu->batch, 4);
+  OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2));  /* length-2 */
+  OUT_BATCH(gpgpu->batch, 0);                     /* mbz */
+  OUT_BATCH(gpgpu->batch, intel_gpgpu_get_curbe_size(gpgpu) * 32);
+  OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, gpgpu->aux_offset.curbe_offset);
+  ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+intel_gpgpu_load_idrt(intel_gpgpu_t *gpgpu)
+{
+  BEGIN_BATCH(gpgpu->batch, 4);
+  OUT_BATCH(gpgpu->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */
+  OUT_BATCH(gpgpu->batch, 0);                    /* mbz */
+  OUT_BATCH(gpgpu->batch, 1 << 5);
+  OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, gpgpu->aux_offset.idrt_offset);
+  ADVANCE_BATCH(gpgpu->batch);
+}
+
+static const uint32_t gpgpu_l3_config_reg1[] = {
+  0x00080040, 0x02040040, 0x00800040, 0x01000038,
+  0x02000030, 0x01000038, 0x00000038, 0x00000040,
+  0x0A140091, 0x09100091, 0x08900091, 0x08900091,
+  0x010000a1
+};
+
+static const uint32_t gpgpu_l3_config_reg2[] = {
+  0x00000000, 0x00000000, 0x00080410, 0x00080410,
+  0x00040410, 0x00040420, 0x00080420, 0x00080020,
+  0x00204080, 0x00244890, 0x00284490, 0x002444A0,
+  0x00040810
+};
+
+/* Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer. */
+static void
+intel_gpgpu_write_timestamp(intel_gpgpu_t *gpgpu, int idx)
+{
+  BEGIN_BATCH(gpgpu->batch, 5);
+  OUT_BATCH(gpgpu->batch, CMD_PIPE_CONTROL | (5-2));
+  OUT_BATCH(gpgpu->batch, GEN7_PIPE_CONTROL_WRITE_TIMESTAMP);
+  OUT_RELOC(gpgpu->batch, gpgpu->time_stamp_b.bo,
+          I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+          GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE | idx * sizeof(uint64_t));
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, 0);
+  ADVANCE_BATCH(gpgpu->batch);
+}
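+
+/* The timestamps are written idx * sizeof(uint64_t) bytes into
+ * time_stamp_b.bo, i.e. at offset 0 for the start stamp (idx 0) and offset 8
+ * for the end stamp (idx 1), emitted from intel_gpgpu_batch_start()/_end()
+ * below.
+ */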
+
+static void
+intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu)
+{
+  gen6_pipe_control_t* pc = (gen6_pipe_control_t*)
+    intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_pipe_control_t));
+  memset(pc, 0, sizeof(*pc));
+  pc->dw0.length = SIZEOF32(gen6_pipe_control_t) - 2;
+  pc->dw0.instruction_subopcode = GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL;
+  pc->dw0.instruction_opcode = GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL;
+  pc->dw0.instruction_pipeline = GEN7_PIPE_CONTROL_3D;
+  pc->dw0.instruction_type = GEN7_PIPE_CONTROL_INSTRUCTION_GFX;
+  pc->dw1.render_target_cache_flush_enable = 1;
+  pc->dw1.texture_cache_invalidation_enable = 1;
+  pc->dw1.cs_stall = 1;
+  pc->dw1.dc_flush_enable = 1;
+  //pc->dw1.instruction_cache_invalidate_enable = 1;
+  ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+intel_gpgpu_set_L3_gen7(intel_gpgpu_t *gpgpu, uint32_t use_slm)
+{
+  BEGIN_BATCH(gpgpu->batch, 9);
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
+  OUT_BATCH(gpgpu->batch, 0x00730000);
+
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
+
+  if (use_slm)
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[12]);
+  else
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[4]);
+
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET);
+  if (use_slm)
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]);
+  else
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
+  ADVANCE_BATCH(gpgpu->batch);
+
+  intel_gpgpu_pipe_control(gpgpu);
+}
+
+static void
+intel_gpgpu_set_L3_baytrail(intel_gpgpu_t *gpgpu, uint32_t use_slm)
+{
+  BEGIN_BATCH(gpgpu->batch, 9);
+
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
+  OUT_BATCH(gpgpu->batch, 0x00D30000);    /* General credit : High credit = 26 : 6 */
+
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
+  if (use_slm)
+    OUT_BATCH(gpgpu->batch, 0x01020021);  /* {SLM=64, URB=96, DC=16, RO=16, Sum=192} */
+  else
+    OUT_BATCH(gpgpu->batch, 0x02040040);  /* {SLM=0, URB=128, DC=32, RO=32, Sum=192} */
+
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET);
+  OUT_BATCH(gpgpu->batch, 0x0);           /* {I/S=0, Const=0, Tex=0} */
+
+  ADVANCE_BATCH(gpgpu->batch);
+
+  intel_gpgpu_pipe_control(gpgpu);
+}
+
+static void
+intel_gpgpu_set_L3_gen75(intel_gpgpu_t *gpgpu, uint32_t use_slm)
+{
+  /* still set L3 in batch buffer for fulsim. */
+  BEGIN_BATCH(gpgpu->batch, 9);
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET);
+  OUT_BATCH(gpgpu->batch, 0x00610000);
+
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET);
+
+  if (use_slm)
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[12]);
+  else
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[4]);
+
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET);
+  if (use_slm)
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]);
+  else
+    OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
+  ADVANCE_BATCH(gpgpu->batch);
+
+  //if(use_slm)
+  //  gpgpu->batch->enable_slm = 1;
+  intel_gpgpu_pipe_control(gpgpu);
+}
+
+static void
+intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
+{
+  intel_batchbuffer_start_atomic(gpgpu->batch, 256);
+  intel_gpgpu_pipe_control(gpgpu);
+  assert(intel_gpgpu_set_L3);
+  intel_gpgpu_set_L3(gpgpu, gpgpu->ker->use_slm);
+  intel_gpgpu_select_pipeline(gpgpu);
+  intel_gpgpu_set_base_address(gpgpu);
+  intel_gpgpu_load_vfe_state(gpgpu);
+  intel_gpgpu_load_curbe_buffer(gpgpu);
+  intel_gpgpu_load_idrt(gpgpu);
+
+  if (gpgpu->perf_b.bo) {
+    BEGIN_BATCH(gpgpu->batch, 3);
+    OUT_BATCH(gpgpu->batch,
+              (0x28 << 23) | /* MI_REPORT_PERF_COUNT */
+              (3 - 2));      /* length-2 */
+    OUT_RELOC(gpgpu->batch, gpgpu->perf_b.bo,
+              I915_GEM_DOMAIN_RENDER,
+              I915_GEM_DOMAIN_RENDER,
+              0 |  /* Offset for the start "counters" */
+              1);  /* Use GTT and not PGTT */
+    OUT_BATCH(gpgpu->batch, 0);
+    ADVANCE_BATCH(gpgpu->batch);
+  }
+
+  /* Insert PIPE_CONTROL for time stamp of start*/
+  if (gpgpu->time_stamp_b.bo)
+    intel_gpgpu_write_timestamp(gpgpu, 0);
+}
+
+static void
+intel_gpgpu_post_action_gen7(intel_gpgpu_t *gpgpu, int32_t flush_mode)
+{
+  if(flush_mode)
+    intel_gpgpu_pipe_control(gpgpu);
+}
+
+static void
+intel_gpgpu_post_action_gen75(intel_gpgpu_t *gpgpu, int32_t flush_mode)
+{
+  /* flush force for set L3 */
+  intel_gpgpu_pipe_control(gpgpu);
+
+  /* Restore L3 control to disable SLM mode,
+     otherwise, may affect 3D pipeline */
+  intel_gpgpu_set_L3(gpgpu, 0);
+}
+
+static void
+intel_gpgpu_batch_end(intel_gpgpu_t *gpgpu, int32_t flush_mode)
+{
+  /* Insert PIPE_CONTROL for time stamp of end*/
+  if (gpgpu->time_stamp_b.bo)
+    intel_gpgpu_write_timestamp(gpgpu, 1);
+
+  /* Insert the performance counter command */
+  if (gpgpu->perf_b.bo) {
+    BEGIN_BATCH(gpgpu->batch, 3);
+    OUT_BATCH(gpgpu->batch,
+              (0x28 << 23) | /* MI_REPORT_PERF_COUNT */
+              (3 - 2));      /* length-2 */
+    OUT_RELOC(gpgpu->batch, gpgpu->perf_b.bo,
+              I915_GEM_DOMAIN_RENDER,
+              I915_GEM_DOMAIN_RENDER,
+              512 |  /* Offset for the end "counters" */
+              1);    /* Use GTT and not PGTT */
+    OUT_BATCH(gpgpu->batch, 0);
+    ADVANCE_BATCH(gpgpu->batch);
+  }
+
+  intel_gpgpu_post_action(gpgpu, flush_mode);
+  intel_batchbuffer_end_atomic(gpgpu->batch);
+}
+
+static int
+intel_gpgpu_batch_reset(intel_gpgpu_t *gpgpu, size_t sz)
+{
+  return intel_batchbuffer_reset(gpgpu->batch, sz);
+}
+
+/* check that we do not get a 0 starting address for any bound buffer */
+static void
+intel_gpgpu_check_binded_buf_address(intel_gpgpu_t *gpgpu)
+{
+  uint32_t i;
+  for (i = 0; i < gpgpu->binded_n; ++i)
+    assert(gpgpu->binded_buf[i]->offset != 0);
+}
+
+static void
+intel_gpgpu_flush_batch_buffer(intel_batchbuffer_t *batch)
+{
+  assert(batch);
+  intel_batchbuffer_emit_mi_flush(batch);
+  intel_batchbuffer_flush(batch);
+}
+
+static void
+intel_gpgpu_flush(intel_gpgpu_t *gpgpu)
+{
+  if (!gpgpu->batch || !gpgpu->batch->buffer)
+    return;
+  intel_gpgpu_flush_batch_buffer(gpgpu->batch);
+  intel_gpgpu_check_binded_buf_address(gpgpu);
+}
+
+static int
+intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
+                       uint32_t max_threads,
+                       uint32_t size_cs_entry,
+                       int profiling)
+{
+  drm_intel_bo *bo;
+
+  /* Binded buffers */
+  gpgpu->binded_n = 0;
+  gpgpu->img_bitmap = 0;
+  gpgpu->img_index_base = 3;
+  gpgpu->sampler_bitmap = ~((1 << max_sampler_n) - 1);
+
+  /* URB */
+  gpgpu->curb.num_cs_entries = 64;
+  gpgpu->curb.size_cs_entry = size_cs_entry;
+  gpgpu->max_threads = max_threads;
+
+  if (gpgpu->printf_b.ibo)
+    dri_bo_unreference(gpgpu->printf_b.ibo);
+  gpgpu->printf_b.ibo = NULL;
+  if (gpgpu->printf_b.bo)
+    dri_bo_unreference(gpgpu->printf_b.bo);
+  gpgpu->printf_b.bo = NULL;
+
+  /* Set the profile buffer*/
+  if(gpgpu->time_stamp_b.bo)
+    dri_bo_unreference(gpgpu->time_stamp_b.bo);
+  gpgpu->time_stamp_b.bo = NULL;
+  if (profiling) {
+    bo = dri_bo_alloc(gpgpu->drv->bufmgr, "timestamp query", 4096, 4096);
+    gpgpu->time_stamp_b.bo = bo;
+    if (!bo)
+      fprintf(stderr, "Could not allocate buffer for profiling.\n");
+  }
+
+  /* stack */
+  if (gpgpu->stack_b.bo)
+    dri_bo_unreference(gpgpu->stack_b.bo);
+  gpgpu->stack_b.bo = NULL;
+
+  /* Set the auxiliary buffer*/
+  uint32_t size_aux = 0;
+  if(gpgpu->aux_buf.bo)
+    dri_bo_unreference(gpgpu->aux_buf.bo);
+  gpgpu->aux_buf.bo = NULL;
+
+  //surface heap must be 4096-byte aligned because the state base address uses 20 bits for the address
+  size_aux = ALIGN(size_aux, 4096);
+  gpgpu->aux_offset.surface_heap_offset = size_aux;
+  size_aux += sizeof(surface_heap_t);
+
+  //curbe must be 32 bytes aligned
+  size_aux = ALIGN(size_aux, 32);
+  gpgpu->aux_offset.curbe_offset = size_aux;
+  size_aux += gpgpu->curb.num_cs_entries * gpgpu->curb.size_cs_entry * 32;
+
+  //idrt must be 32 bytes aligned
+  size_aux = ALIGN(size_aux, 32);
+  gpgpu->aux_offset.idrt_offset = size_aux;
+  size_aux += MAX_IF_DESC * sizeof(struct gen6_interface_descriptor);
+
+  //sampler state must be 32 bytes aligned
+  size_aux = ALIGN(size_aux, 32);
+  gpgpu->aux_offset.sampler_state_offset = size_aux;
+  size_aux += GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t);
+
+  //sampler border color state must be 32 bytes aligned
+  size_aux = ALIGN(size_aux, 32);
+  gpgpu->aux_offset.sampler_border_color_state_offset = size_aux;
+  size_aux += GEN_MAX_SAMPLERS * sizeof(gen7_sampler_border_color_t);
+
+  bo = dri_bo_alloc(gpgpu->drv->bufmgr, "AUX_BUFFER", size_aux, 0);
+  if (!bo || dri_bo_map(bo, 1) != 0) {
+    fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
+    if (bo)
+      dri_bo_unreference(bo);
+    if (profiling && gpgpu->time_stamp_b.bo)
+      dri_bo_unreference(gpgpu->time_stamp_b.bo);
+    gpgpu->time_stamp_b.bo = NULL;
+    return -1;
+  }
+  memset(bo->virtual, 0, size_aux);
+  gpgpu->aux_buf.bo = bo;
+  return 0;
+}
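+
+/* Layout sketch of the aux buffer built above:
+ *   surface_heap_offset               = 0 (4096-byte aligned)
+ *   curbe_offset                      = ALIGN(sizeof(surface_heap_t), 32)
+ *   idrt_offset                       = ALIGN(curbe_offset + 64 * size_cs_entry * 32, 32)
+ *   sampler_state_offset              = ALIGN(idrt_offset + MAX_IF_DESC * sizeof(struct gen6_interface_descriptor), 32)
+ *   sampler_border_color_state_offset = next 32-byte aligned offset after the sampler states.
+ * Everything lives in the single "AUX_BUFFER" bo that is mapped and zeroed at
+ * the end of the function.
+ */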
+
+static void
+intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_bo, uint32_t obj_bo_offset)
+{
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  heap->binding_table[index] = offsetof(surface_heap_t, surface) +
+                               index * sizeof(gen7_surface_state_t);
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+                    I915_GEM_DOMAIN_RENDER,
+                    I915_GEM_DOMAIN_RENDER,
+                    obj_bo_offset,
+                    gpgpu->aux_offset.surface_heap_offset +
+                    heap->binding_table[index] +
+                    offsetof(gen7_surface_state_t, ss1),
+                    obj_bo);
+}
+
+static dri_bo*
+intel_gpgpu_alloc_constant_buffer_gen7(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
+{
+  uint32_t s = size - 1;
+  assert(size != 0);
+
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[bti];
+  memset(ss2, 0, sizeof(gen7_surface_state_t));
+  ss2->ss0.surface_type = I965_SURFACE_BUFFER;
+  ss2->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_UINT;
+  ss2->ss2.width  = s & 0x7f;            /* bits 6:0 of sz */
+  ss2->ss2.height = (s >> 7) & 0x3fff;   /* bits 20:7 of sz */
+  ss2->ss3.depth  = (s >> 21) & 0x3ff;   /* bits 30:21 of sz */
+  ss2->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
+  heap->binding_table[bti] = offsetof(surface_heap_t, surface) + bti* sizeof(gen7_surface_state_t);
+
+  if(gpgpu->constant_b.bo)
+    dri_bo_unreference(gpgpu->constant_b.bo);
+  gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", s, 64);
+  if (gpgpu->constant_b.bo == NULL)
+    return NULL;
+  ss2->ss1.base_addr = gpgpu->constant_b.bo->offset;
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+                      I915_GEM_DOMAIN_RENDER,
+                      I915_GEM_DOMAIN_RENDER,
+                      0,
+                      gpgpu->aux_offset.surface_heap_offset +
+                      heap->binding_table[bti] +
+                      offsetof(gen7_surface_state_t, ss1),
+                      gpgpu->constant_b.bo);
+  return gpgpu->constant_b.bo;
+}
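+
+/* The buffer size is encoded as (size - 1) split across the three fields
+ * commented above.  For example a 64KB constant buffer gives s = 65535 =
+ * 0xFFFF, hence width = 0x7f (bits 6:0), height = 0x1ff (bits 20:7) and
+ * depth = 0 (bits 30:21).  The gen75 variant below packs the size the same
+ * way and only adds the shader channel selects.
+ */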
+
+static dri_bo*
+intel_gpgpu_alloc_constant_buffer_gen75(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
+{
+  uint32_t s = size - 1;
+  assert(size != 0);
+
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[bti];
+  memset(ss2, 0, sizeof(gen7_surface_state_t));
+  ss2->ss0.surface_type = I965_SURFACE_BUFFER;
+  ss2->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_UINT;
+  ss2->ss2.width  = s & 0x7f;            /* bits 6:0 of sz */
+  ss2->ss2.height = (s >> 7) & 0x3fff;   /* bits 20:7 of sz */
+  ss2->ss3.depth  = (s >> 21) & 0x3ff;   /* bits 30:21 of sz */
+  ss2->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
+  ss2->ss7.shader_r = I965_SURCHAN_SELECT_RED;
+  ss2->ss7.shader_g = I965_SURCHAN_SELECT_GREEN;
+  ss2->ss7.shader_b = I965_SURCHAN_SELECT_BLUE;
+  ss2->ss7.shader_a = I965_SURCHAN_SELECT_ALPHA;
+  heap->binding_table[bti] = offsetof(surface_heap_t, surface) + bti* sizeof(gen7_surface_state_t);
+
+  if(gpgpu->constant_b.bo)
+    dri_bo_unreference(gpgpu->constant_b.bo);
+  gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", s, 64);
+  if (gpgpu->constant_b.bo == NULL)
+    return NULL;
+  ss2->ss1.base_addr = gpgpu->constant_b.bo->offset;
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+                      I915_GEM_DOMAIN_RENDER,
+                      I915_GEM_DOMAIN_RENDER,
+                      0,
+                      gpgpu->aux_offset.surface_heap_offset +
+                      heap->binding_table[bti] +
+                      offsetof(gen7_surface_state_t, ss1),
+                      gpgpu->constant_b.bo);
+  return gpgpu->constant_b.bo;
+}
+
+static void
+intel_gpgpu_setup_bti(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset, uint32_t size, unsigned char index)
+{
+  uint32_t s = size - 1;
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  gen7_surface_state_t *ss0 = (gen7_surface_state_t *) heap->surface[index];
+  memset(ss0, 0, sizeof(gen7_surface_state_t));
+  ss0->ss0.surface_type = I965_SURFACE_BUFFER;
+  ss0->ss0.surface_format = I965_SURFACEFORMAT_RAW;
+  ss0->ss2.width  = s & 0x7f;   /* bits 6:0 of sz */
+  ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
+  ss0->ss3.depth  = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
+  ss0->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
+  heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen7_surface_state_t);
+
+  ss0->ss1.base_addr = buf->offset + internal_offset;
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+                      I915_GEM_DOMAIN_RENDER,
+                      I915_GEM_DOMAIN_RENDER,
+                      internal_offset,
+                      gpgpu->aux_offset.surface_heap_offset +
+                      heap->binding_table[index] +
+                      offsetof(gen7_surface_state_t, ss1),
+                      buf);
+}
+
+
+static int
+intel_is_surface_array(cl_mem_object_type type)
+{
+  if (type == CL_MEM_OBJECT_IMAGE1D_ARRAY ||
+        type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+    return 1;
+
+  return 0;
+}
+
+static int
+intel_get_surface_type(cl_mem_object_type type)
+{
+  switch (type) {
+  case CL_MEM_OBJECT_IMAGE1D_BUFFER:
+  case CL_MEM_OBJECT_IMAGE1D:
+  case CL_MEM_OBJECT_IMAGE1D_ARRAY:
+    return I965_SURFACE_1D;
+
+  case CL_MEM_OBJECT_IMAGE2D:
+  case CL_MEM_OBJECT_IMAGE2D_ARRAY:
+    return I965_SURFACE_2D;
+
+  case CL_MEM_OBJECT_IMAGE3D:
+    return I965_SURFACE_3D;
+
+  default:
+      assert(0);
+  }
+  return 0;
+}
+
+/* Get the fixed surface type. If it is a 1D array image with a large index,
+   we need to fix it up to a 2D type due to a Gen7/Gen75 sampler issue
+   on integer type surfaces with clamp address mode and nearest filter mode.
+*/
+static uint32_t get_surface_type(intel_gpgpu_t *gpgpu, int index, cl_mem_object_type type)
+{
+  uint32_t surface_type;
+  if (((IS_IVYBRIDGE(gpgpu->drv->device_id) || IS_HASWELL(gpgpu->drv->device_id))) &&
+      index >= 128 + BTI_RESERVED_NUM &&
+      type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+    surface_type = I965_SURFACE_2D;
+  else
+    surface_type = intel_get_surface_type(type);
+  return surface_type;
+}
+
+static void
+intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
+                              uint32_t index,
+                              dri_bo* obj_bo,
+                              uint32_t obj_bo_offset,
+                              uint32_t format,
+                              cl_mem_object_type type,
+                              int32_t w,
+                              int32_t h,
+                              int32_t depth,
+                              int32_t pitch,
+                              int32_t tiling)
+{
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index];
+
+  memset(ss, 0, sizeof(*ss));
+  ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
+  ss->ss0.surface_type = get_surface_type(gpgpu, index, type);
+  if (intel_is_surface_array(type)) {
+    ss->ss0.surface_array = 1;
+    ss->ss0.surface_array_spacing = 1;
+  }
+  ss->ss0.surface_format = format;
+  ss->ss1.base_addr = obj_bo->offset;
+  ss->ss2.width = w - 1;
+
+  ss->ss2.height = h - 1;
+  ss->ss3.depth = depth - 1;
+  ss->ss4.not_str_buf.rt_view_extent = depth - 1;
+  ss->ss4.not_str_buf.min_array_element = 0;
+  ss->ss3.pitch = pitch - 1;
+  ss->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
+  if (tiling == GPGPU_TILE_X) {
+    ss->ss0.tiled_surface = 1;
+    ss->ss0.tile_walk = I965_TILEWALK_XMAJOR;
+  } else if (tiling == GPGPU_TILE_Y) {
+    ss->ss0.tiled_surface = 1;
+    ss->ss0.tile_walk = I965_TILEWALK_YMAJOR;
+  }
+  ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
+  intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo, obj_bo_offset);
+
+  assert(index < GEN_MAX_SURFACES);
+}
+
+static void
+intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
+                              uint32_t index,
+                              dri_bo* obj_bo,
+                              uint32_t obj_bo_offset,
+                              uint32_t format,
+                              cl_mem_object_type type,
+                              int32_t w,
+                              int32_t h,
+                              int32_t depth,
+                              int32_t pitch,
+                              int32_t tiling)
+{
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index];
+  memset(ss, 0, sizeof(*ss));
+  ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
+  ss->ss0.surface_type = get_surface_type(gpgpu, index, type);
+  if (intel_is_surface_array(type)) {
+    ss->ss0.surface_array = 1;
+    ss->ss0.surface_array_spacing = 1;
+  }
+  ss->ss0.surface_format = format;
+  ss->ss1.base_addr = obj_bo->offset;
+  ss->ss2.width = w - 1;
+  ss->ss2.height = h - 1;
+  ss->ss3.depth = depth - 1;
+  ss->ss4.not_str_buf.rt_view_extent = depth - 1;
+  ss->ss4.not_str_buf.min_array_element = 0;
+  ss->ss3.pitch = pitch - 1;
+  ss->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
+  ss->ss7.shader_r = I965_SURCHAN_SELECT_RED;
+  ss->ss7.shader_g = I965_SURCHAN_SELECT_GREEN;
+  ss->ss7.shader_b = I965_SURCHAN_SELECT_BLUE;
+  ss->ss7.shader_a = I965_SURCHAN_SELECT_ALPHA;
+  if (tiling == GPGPU_TILE_X) {
+    ss->ss0.tiled_surface = 1;
+    ss->ss0.tile_walk = I965_TILEWALK_XMAJOR;
+  } else if (tiling == GPGPU_TILE_Y) {
+    ss->ss0.tiled_surface = 1;
+    ss->ss0.tile_walk = I965_TILEWALK_YMAJOR;
+  }
+  ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
+  intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo, obj_bo_offset);
+
+  assert(index < GEN_MAX_SURFACES);
+}
+
+static void
+intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset,
+                     uint32_t internal_offset, uint32_t size, uint8_t bti)
+{
+  assert(gpgpu->binded_n < max_buf_n);
+  gpgpu->binded_buf[gpgpu->binded_n] = buf;
+  gpgpu->target_buf_offset[gpgpu->binded_n] = internal_offset;
+  gpgpu->binded_offset[gpgpu->binded_n] = offset;
+  gpgpu->binded_n++;
+  intel_gpgpu_setup_bti(gpgpu, buf, internal_offset, size, bti);
+}
+
+static int
+intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size)
+{
+  drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+  drm_intel_bo* old = gpgpu->scratch_b.bo;
+  uint32_t total = per_thread_size * gpgpu->max_threads;
+  /* Per Bspec, scratch should be 2X the desired size, otherwise luxmark may hang */
+  if (IS_HASWELL(gpgpu->drv->device_id))
+      total *= 2;
+
+  gpgpu->per_thread_scratch = per_thread_size;
+
+  if(old && old->size < total) {
+    drm_intel_bo_unreference(old);
+    old = NULL;
+  }
+
+  if(!old && total) {
+    gpgpu->scratch_b.bo = drm_intel_bo_alloc(bufmgr, "SCRATCH_BO", total, 4096);
+    if (gpgpu->scratch_b.bo == NULL)
+      return -1;
+  }
+  return 0;
+}
+
+static void
+intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint8_t bti)
+{
+  drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+  gpgpu->stack_b.bo = drm_intel_bo_alloc(bufmgr, "STACK", size, 64);
+
+  intel_gpgpu_bind_buf(gpgpu, gpgpu->stack_b.bo, offset, 0, size, bti);
+}
+
+static void
+intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
+{
+  gen6_interface_descriptor_t *desc;
+  drm_intel_bo *ker_bo = NULL;
+
+  desc = (gen6_interface_descriptor_t*) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.idrt_offset);
+
+  memset(desc, 0, sizeof(*desc));
+  ker_bo = (drm_intel_bo *) kernel->bo;
+  desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */
+  desc->desc1.single_program_flow = 0;
+  desc->desc1.floating_point_mode = 0; /* use IEEE-754 rule */
+  desc->desc5.rounding_mode = 0; /* round to nearest even */
+
+  assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) % 32 == 0);
+  desc->desc2.sampler_state_pointer = (gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) >> 5;
+  desc->desc3.binding_table_entry_count = 0; /* no prefetch */
+  desc->desc3.binding_table_pointer = 0;
+  desc->desc4.curbe_read_len = kernel->curbe_sz / 32;
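+  /* Note: per the gen6_interface_descriptor definition, curbe_read_len is expressed
+     in GRF units; a GRF is 32 bytes, hence the division by 32 (assuming curbe_sz is
+     in bytes, as in intel_gpgpu_upload_curbes below). */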
+  desc->desc4.curbe_read_offset = 0;
+
+  /* Barriers / SLM are automatically handled on Gen7+ */
+  if (gpgpu->drv->gen_ver == 7 || gpgpu->drv->gen_ver == 75) {
+    size_t slm_sz = kernel->slm_sz;
+    desc->desc5.group_threads_num = kernel->use_slm ? kernel->thread_n : 0;
+    desc->desc5.barrier_enable = kernel->use_slm;
+    if (slm_sz <= 4*KB)
+      slm_sz = 4*KB;
+    else if (slm_sz <= 8*KB)
+      slm_sz = 8*KB;
+    else if (slm_sz <= 16*KB)
+      slm_sz = 16*KB;
+    else if (slm_sz <= 32*KB)
+      slm_sz = 32*KB;
+    else
+      slm_sz = 64*KB;
+    slm_sz = slm_sz >> 12;
+    desc->desc5.slm_sz = slm_sz;
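+    /* Illustrative example: a kernel requesting 6KB of SLM rounds up to 8KB above,
+       and (8*KB) >> 12 == 2, so desc5.slm_sz is programmed to 2 (4KB units). */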
+  }
+  else
+    desc->desc5.group_threads_num = kernel->barrierID; /* BarrierID on GEN6 */
+
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+                    I915_GEM_DOMAIN_INSTRUCTION, 0,
+                    0,
+                    gpgpu->aux_offset.idrt_offset + offsetof(gen6_interface_descriptor_t, desc0),
+                    ker_bo);
+
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+                    I915_GEM_DOMAIN_SAMPLER, 0,
+                    gpgpu->aux_offset.sampler_state_offset,
+                    gpgpu->aux_offset.idrt_offset + offsetof(gen6_interface_descriptor_t, desc2),
+                    gpgpu->aux_buf.bo);
+}
+
+static int
+intel_gpgpu_upload_curbes(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
+{
+  unsigned char *curbe = NULL;
+  cl_gpgpu_kernel *k = gpgpu->ker;
+  uint32_t i, j;
+
+  /* Upload the data first */
+  if (dri_bo_map(gpgpu->aux_buf.bo, 1) != 0) {
+    fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
+    return -1;
+  }
+  assert(gpgpu->aux_buf.bo->virtual);
+  curbe = (unsigned char *) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.curbe_offset);
+  memcpy(curbe, data, size);
+
+  /* Now put all the relocations for our flat address space */
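+  /* Layout note: the curbe payload is replicated once per thread, so for buffer j
+     the GPU address is patched at byte offset binded_offset[j] + i * curbe_sz
+     within thread i's copy (see the loop below). */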
+  for (i = 0; i < k->thread_n; ++i)
+    for (j = 0; j < gpgpu->binded_n; ++j) {
+      *(uint32_t*)(curbe + gpgpu->binded_offset[j]+i*k->curbe_sz) = gpgpu->binded_buf[j]->offset + gpgpu->target_buf_offset[j];
+      drm_intel_bo_emit_reloc(gpgpu->aux_buf.bo,
+                              gpgpu->aux_offset.curbe_offset + gpgpu->binded_offset[j]+i*k->curbe_sz,
+                              gpgpu->binded_buf[j],
+                              gpgpu->target_buf_offset[j],
+                              I915_GEM_DOMAIN_RENDER,
+                              I915_GEM_DOMAIN_RENDER);
+    }
+  dri_bo_unmap(gpgpu->aux_buf.bo);
+  return 0;
+}
+
+static void
+intel_gpgpu_upload_samplers(intel_gpgpu_t *gpgpu, const void *data, uint32_t n)
+{
+  if (n) {
+    const size_t sz = n * sizeof(gen6_sampler_state_t);
+    memcpy(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset, data, sz);
+  }
+}
+
+int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
+{
+   switch( cl_address_mode ) {
+   case CLK_ADDRESS_NONE:
+   case CLK_ADDRESS_REPEAT:
+      return GEN_TEXCOORDMODE_WRAP;
+   case CLK_ADDRESS_CLAMP:
+      return GEN_TEXCOORDMODE_CLAMP_BORDER;
+   case CLK_ADDRESS_CLAMP_TO_EDGE:
+      return GEN_TEXCOORDMODE_CLAMP;
+   case CLK_ADDRESS_MIRRORED_REPEAT:
+      return GEN_TEXCOORDMODE_MIRROR;
+   default:
+      return GEN_TEXCOORDMODE_WRAP;
+   }
+}
+
+static void
+intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sampler)
+{
+  int using_nearest = 0;
+  uint32_t wrap_mode;
+  gen7_sampler_state_t *sampler;
+
+  sampler = (gen7_sampler_state_t *)(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset)  + index;
+  memset(sampler, 0, sizeof(*sampler));
+  assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_border_color_state_offset) % 32 == 0);
+  sampler->ss2.default_color_pointer = (gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_border_color_state_offset) >> 5;
+  if ((clk_sampler & __CLK_NORMALIZED_MASK) == CLK_NORMALIZED_COORDS_FALSE)
+    sampler->ss3.non_normalized_coord = 1;
+  else
+    sampler->ss3.non_normalized_coord = 0;
+
+  switch (clk_sampler & __CLK_FILTER_MASK) {
+  case CLK_FILTER_NEAREST:
+    sampler->ss0.min_filter = GEN_MAPFILTER_NEAREST;
+    sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
+    sampler->ss0.mag_filter = GEN_MAPFILTER_NEAREST;
+    using_nearest = 1;
+    break;
+  case CLK_FILTER_LINEAR:
+    sampler->ss0.min_filter = GEN_MAPFILTER_LINEAR;
+    sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
+    sampler->ss0.mag_filter = GEN_MAPFILTER_LINEAR;
+    break;
+  }
+
+  wrap_mode = translate_wrap_mode(clk_sampler & __CLK_ADDRESS_MASK, using_nearest);
+  sampler->ss3.s_wrap_mode = wrap_mode;
+  /* XXX the mesa i965 driver code points out that if the surface is a 1D surface, we may need
+   * to set t_wrap_mode to GEN_TEXCOORDMODE_WRAP. */
+  sampler->ss3.t_wrap_mode = wrap_mode;
+  sampler->ss3.r_wrap_mode = wrap_mode;
+
+  sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
+  sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
+
+  sampler->ss0.base_level = 0;
+
+  sampler->ss1.max_lod = 0;
+  sampler->ss1.min_lod = 0;
+
+  if (sampler->ss0.min_filter != GEN_MAPFILTER_NEAREST)
+     sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MIN |
+                                   GEN_ADDRESS_ROUNDING_ENABLE_V_MIN |
+                                   GEN_ADDRESS_ROUNDING_ENABLE_R_MIN;
+  if (sampler->ss0.mag_filter != GEN_MAPFILTER_NEAREST)
+     sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MAG |
+                                   GEN_ADDRESS_ROUNDING_ENABLE_V_MAG |
+                                   GEN_ADDRESS_ROUNDING_ENABLE_R_MAG;
+
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+                    I915_GEM_DOMAIN_SAMPLER, 0,
+                    gpgpu->aux_offset.sampler_border_color_state_offset,
+                    gpgpu->aux_offset.sampler_state_offset +
+                    index * sizeof(gen7_sampler_state_t) +
+                    offsetof(gen7_sampler_state_t, ss2),
+                    gpgpu->aux_buf.bo);
+
+}
+
+static void
+intel_gpgpu_bind_sampler(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz)
+{
+  int index;
+  assert(sampler_sz <= GEN_MAX_SAMPLERS);
+  for(index = 0; index < sampler_sz; index++)
+    intel_gpgpu_insert_sampler(gpgpu, index, samplers[index]);
+}
+
+static void
+intel_gpgpu_states_setup(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
+{
+  gpgpu->ker = kernel;
+  intel_gpgpu_build_idrt(gpgpu, kernel);
+  dri_bo_unmap(gpgpu->aux_buf.bo);
+}
+
+static void
+intel_gpgpu_set_perf_counters(intel_gpgpu_t *gpgpu, cl_buffer *perf)
+{
+  if (gpgpu->perf_b.bo)
+    drm_intel_bo_unreference(gpgpu->perf_b.bo);
+  drm_intel_bo_reference((drm_intel_bo*) perf);
+  gpgpu->perf_b.bo = (drm_intel_bo*) perf;
+}
+
+static void
+intel_gpgpu_walker(intel_gpgpu_t *gpgpu,
+                   uint32_t simd_sz,
+                   uint32_t thread_n,
+                   const size_t global_wk_off[3],
+                   const size_t global_wk_sz[3],
+                   const size_t local_wk_sz[3])
+{
+  const uint32_t global_wk_dim[3] = {
+    global_wk_sz[0] / local_wk_sz[0],
+    global_wk_sz[1] / local_wk_sz[1],
+    global_wk_sz[2] / local_wk_sz[2]
+  };
+  uint32_t right_mask = ~0x0;
+  size_t group_sz = local_wk_sz[0] * local_wk_sz[1] * local_wk_sz[2];
+
+  assert(simd_sz == 8 || simd_sz == 16);
+
+  uint32_t shift = (group_sz & (simd_sz - 1));
+  shift = (shift == 0) ? simd_sz : shift;
+  right_mask = (1 << shift) - 1;
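+  /* Illustrative example: with a 50-item work group and simd_sz == 16,
+     shift = 50 & 15 = 2, so right_mask = 0x3 and only the first two channels
+     of the last SIMD16 thread are enabled. */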
+
+  BEGIN_BATCH(gpgpu->batch, 11);
+  OUT_BATCH(gpgpu->batch, CMD_GPGPU_WALKER | 9);
+  OUT_BATCH(gpgpu->batch, 0);                        /* kernel index == 0 */
+  assert(thread_n <= 64);
+  if (simd_sz == 16)
+    OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
+  else
+    OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n-1)); /* SIMD8  | thread max */
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, global_wk_dim[0]);
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, global_wk_dim[1]);
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, global_wk_dim[2]);
+  OUT_BATCH(gpgpu->batch, right_mask);
+  OUT_BATCH(gpgpu->batch, ~0x0);                     /* we always set the height to 1, so set the bottom mask to all 1s */
+  ADVANCE_BATCH(gpgpu->batch);
+
+  BEGIN_BATCH(gpgpu->batch, 2);
+  OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_FLUSH | 0);
+  OUT_BATCH(gpgpu->batch, 0);                        /* kernel index == 0 */
+  ADVANCE_BATCH(gpgpu->batch);
+}
+
+static intel_event_t*
+intel_gpgpu_event_new(intel_gpgpu_t *gpgpu)
+{
+  intel_event_t *event = NULL;
+  TRY_ALLOC_NO_ERR (event, CALLOC(intel_event_t));
+
+  event->buffer = gpgpu->batch->buffer;
+  if (event->buffer)
+    drm_intel_bo_reference(event->buffer);
+  event->status = command_queued;
+
+  if(gpgpu->time_stamp_b.bo) {
+    event->ts_buf = gpgpu->time_stamp_b.bo;
+    drm_intel_bo_reference(event->ts_buf);
+  }
+
+exit:
+  return event;
+error:
+  cl_free(event);
+  event = NULL;
+  goto exit;
+}
+
+/*
+   The upper layer already flushed the batch buffer; just update the
+   internal status to command_running.
+*/
+static void
+intel_gpgpu_event_flush(intel_event_t *event)
+{
+  assert(event->status == command_queued);
+  event->status = command_running;
+}
+
+static int
+intel_gpgpu_event_update_status(intel_event_t *event, int wait)
+{
+  if(event->status == command_complete)
+    return event->status;
+
+  if (event->buffer &&
+      event->status == command_running &&
+      !drm_intel_bo_busy(event->buffer)) {
+    event->status = command_complete;
+    drm_intel_bo_unreference(event->buffer);
+    event->buffer = NULL;
+    return event->status;
+  }
+
+  if(wait == 0)
+    return event->status;
+
+  if (event->buffer) {
+    drm_intel_bo_wait_rendering(event->buffer);
+    event->status = command_complete;
+    drm_intel_bo_unreference(event->buffer);
+    event->buffer = NULL;
+  }
+  return event->status;
+}
+
+static void
+intel_gpgpu_event_delete(intel_event_t *event)
+{
+  if(event->buffer)
+    drm_intel_bo_unreference(event->buffer);
+  if(event->ts_buf)
+    drm_intel_bo_unreference(event->ts_buf);
+  cl_free(event);
+}
+
+/* IVB's and HSW's results MUST be shifted on x86_64 systems */
+static uint64_t
+intel_gpgpu_read_ts_reg_gen7(drm_intel_bufmgr *bufmgr)
+{
+  uint64_t result = 0;
+  drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, &result);
+  /* On x86_64 systems, the low 32 bits of the timestamp counter are stored in the
+     high 32 bits of the result returned by drm_intel_reg_read, and bits 32-35 are
+     lost; on i386 the result matches the bspec. This appears to be a kernel readq
+     bug, so shift by 32 bits on x86_64 and keep only the low 32 bits on i386.
+  */
+#ifdef __i386__
+  return result & 0x0ffffffff;
+#else
+  return result >> 32;
+#endif  /* __i386__  */
+}
+
+/* Baytrail's result should have its high 4 bits cleared */
+static uint64_t
+intel_gpgpu_read_ts_reg_baytrail(drm_intel_bufmgr *bufmgr)
+{
+  uint64_t result = 0;
+  drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, &result);
+  return result & 0x0ffffffff;
+}
+
+/* Get the current GPU timestamp. */
+static void
+intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* ret_ts)
+{
+  uint64_t result = 0;
+  drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+
+  /* Get the timestamp that matches the bspec */
+  result = intel_gpgpu_read_ts_reg(bufmgr);
+  result *= 80;
+
+  *ret_ts = result;
+  return;
+}
+
+/* Get the GPU execute time. */
+static void
+intel_gpgpu_event_get_exec_timestamp(intel_gpgpu_t* gpgpu, intel_event_t *event,
+				     int index, uint64_t* ret_ts)
+{
+  uint64_t result = 0;
+
+  assert(event->ts_buf != NULL);
+  assert(index == 0 || index == 1);
+  drm_intel_gem_bo_map_gtt(event->ts_buf);
+  uint64_t* ptr = event->ts_buf->virtual;
+  result = ptr[index];
+
+  /* According to the BSpec, the timestamp counter should be 36 bits, but when
+     comparing it with the timestamp counter obtained via the register-read ioctl,
+     the 4 bits above bit 31 appear to be unreliable. To keep the timestamp counter
+     consistent, we simply drop those 4 bits.
+  */
+  result = (result & 0x0FFFFFFFF) * 80; //convert to nanoseconds
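+  /* The *80 scaling assumes an 80 ns timestamp tick (i.e. a 12.5 MHz counter),
+     matching the conversion in intel_gpgpu_event_get_gpu_cur_timestamp above. */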
+  *ret_ts = result;
+
+  drm_intel_gem_bo_unmap_gtt(event->ts_buf);
+}
+
+static int
+intel_gpgpu_set_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i, uint32_t size, uint32_t offset, uint8_t bti)
+{
+  drm_intel_bo *bo = NULL;
+  if (i == 0) { // the index buffer.
+    if (gpgpu->printf_b.ibo)
+      dri_bo_unreference(gpgpu->printf_b.ibo);
+    gpgpu->printf_b.ibo = dri_bo_alloc(gpgpu->drv->bufmgr, "Printf index buffer", size, 4096);
+    bo = gpgpu->printf_b.ibo;
+  } else if (i == 1) {
+    if (gpgpu->printf_b.bo)
+      dri_bo_unreference(gpgpu->printf_b.bo);
+    gpgpu->printf_b.bo = dri_bo_alloc(gpgpu->drv->bufmgr, "Printf output buffer", size, 4096);
+    bo = gpgpu->printf_b.bo;
+  } else
+    assert(0);
+
+  if (!bo || (drm_intel_bo_map(bo, 1) != 0)) {
+    if (gpgpu->printf_b.bo)
+      drm_intel_bo_unreference(gpgpu->printf_b.bo);
+    gpgpu->printf_b.bo = NULL;
+    fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
+    return -1;
+  }
+  memset(bo->virtual, 0, size);
+  drm_intel_bo_unmap(bo);
+  intel_gpgpu_bind_buf(gpgpu, bo, offset, 0, size, bti);
+  return 0;
+}
+
+static void*
+intel_gpgpu_map_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i)
+{
+  drm_intel_bo *bo = NULL;
+  if (i == 0) {
+    bo = gpgpu->printf_b.ibo;
+  } else if (i == 1) {
+    bo = gpgpu->printf_b.bo;
+  } else
+    assert(0);
+
+  drm_intel_bo_map(bo, 1);
+  return bo->virtual;
+}
+
+static void
+intel_gpgpu_unmap_printf_buf_addr(intel_gpgpu_t *gpgpu, uint32_t i)
+{
+  drm_intel_bo *bo = NULL;
+  if (i == 0) {
+    bo = gpgpu->printf_b.ibo;
+  } else if (i == 1) {
+    bo = gpgpu->printf_b.bo;
+  } else
+  assert(0);
+
+  drm_intel_bo_unmap(bo);
+}
+
+static void
+intel_gpgpu_release_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i)
+{
+  if (i == 0) {
+    drm_intel_bo_unreference(gpgpu->printf_b.ibo);
+    gpgpu->printf_b.ibo = NULL;
+  } else if (i == 1) {
+    drm_intel_bo_unreference(gpgpu->printf_b.bo);
+    gpgpu->printf_b.bo = NULL;
+  } else
+    assert(0);
+}
+
+static void
+intel_gpgpu_set_printf_info(intel_gpgpu_t *gpgpu, void* printf_info, size_t * global_sz)
+{
+  gpgpu->printf_info = printf_info;
+  gpgpu->global_wk_sz[0] = global_sz[0];
+  gpgpu->global_wk_sz[1] = global_sz[1];
+  gpgpu->global_wk_sz[2] = global_sz[2];
+}
+
+static void*
+intel_gpgpu_get_printf_info(intel_gpgpu_t *gpgpu, size_t * global_sz)
+{
+  global_sz[0] = gpgpu->global_wk_sz[0];
+  global_sz[1] = gpgpu->global_wk_sz[1];
+  global_sz[2] = gpgpu->global_wk_sz[2];
+  return gpgpu->printf_info;
+}
+
+LOCAL void
+intel_set_gpgpu_callbacks(int device_id)
+{
+  cl_gpgpu_new = (cl_gpgpu_new_cb *) intel_gpgpu_new;
+  cl_gpgpu_delete = (cl_gpgpu_delete_cb *) intel_gpgpu_delete;
+  cl_gpgpu_sync = (cl_gpgpu_sync_cb *) intel_gpgpu_sync;
+  cl_gpgpu_bind_buf = (cl_gpgpu_bind_buf_cb *) intel_gpgpu_bind_buf;
+  cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) intel_gpgpu_set_stack;
+  cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init;
+  cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters;
+  cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes;
+  cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) intel_gpgpu_states_setup;
+  cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) intel_gpgpu_upload_samplers;
+  cl_gpgpu_batch_reset = (cl_gpgpu_batch_reset_cb *) intel_gpgpu_batch_reset;
+  cl_gpgpu_batch_start = (cl_gpgpu_batch_start_cb *) intel_gpgpu_batch_start;
+  cl_gpgpu_batch_end = (cl_gpgpu_batch_end_cb *) intel_gpgpu_batch_end;
+  cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush;
+  cl_gpgpu_walker = (cl_gpgpu_walker_cb *) intel_gpgpu_walker;
+  cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler;
+  cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch;
+  cl_gpgpu_event_new = (cl_gpgpu_event_new_cb *)intel_gpgpu_event_new;
+  cl_gpgpu_event_flush = (cl_gpgpu_event_flush_cb *)intel_gpgpu_event_flush;
+  cl_gpgpu_event_update_status = (cl_gpgpu_event_update_status_cb *)intel_gpgpu_event_update_status;
+  cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb *)intel_gpgpu_event_delete;
+  cl_gpgpu_event_get_exec_timestamp = (cl_gpgpu_event_get_exec_timestamp_cb *)intel_gpgpu_event_get_exec_timestamp;
+  cl_gpgpu_event_get_gpu_cur_timestamp = (cl_gpgpu_event_get_gpu_cur_timestamp_cb *)intel_gpgpu_event_get_gpu_cur_timestamp;
+  cl_gpgpu_ref_batch_buf = (cl_gpgpu_ref_batch_buf_cb *)intel_gpgpu_ref_batch_buf;
+  cl_gpgpu_unref_batch_buf = (cl_gpgpu_unref_batch_buf_cb *)intel_gpgpu_unref_batch_buf;
+  cl_gpgpu_set_printf_buffer = (cl_gpgpu_set_printf_buffer_cb *)intel_gpgpu_set_printf_buf;
+  cl_gpgpu_map_printf_buffer = (cl_gpgpu_map_printf_buffer_cb *)intel_gpgpu_map_printf_buf;
+  cl_gpgpu_unmap_printf_buffer = (cl_gpgpu_unmap_printf_buffer_cb *)intel_gpgpu_unmap_printf_buf_addr;
+  cl_gpgpu_release_printf_buffer = (cl_gpgpu_release_printf_buffer_cb *)intel_gpgpu_release_printf_buf;
+  cl_gpgpu_set_printf_info = (cl_gpgpu_set_printf_info_cb *)intel_gpgpu_set_printf_info;
+  cl_gpgpu_get_printf_info = (cl_gpgpu_get_printf_info_cb *)intel_gpgpu_get_printf_info;
+
+  if (IS_HASWELL(device_id)) {
+    cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75;
+    cl_gpgpu_alloc_constant_buffer  = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer_gen75;
+    intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen75;
+    cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen75;
+    intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen75;
+    intel_gpgpu_post_action = intel_gpgpu_post_action_gen75;
+    intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7; //HSW same as ivb
+  }
+  else if (IS_IVYBRIDGE(device_id)) {
+    cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7;
+    cl_gpgpu_alloc_constant_buffer  = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer_gen7;
+    if (IS_BAYTRAIL_T(device_id)) {
+      intel_gpgpu_set_L3 = intel_gpgpu_set_L3_baytrail;
+      intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail;
+    } else {
+      intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen7;
+      intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7;
+    }
+    cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen7;
+    intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen7;
+    intel_gpgpu_post_action = intel_gpgpu_post_action_gen7;
+  }
+}
diff --git a/src/intel/intel_gpgpu.h b/src/intel/intel_gpgpu.h
new file mode 100644
index 0000000..d593ac7
--- /dev/null
+++ b/src/intel/intel_gpgpu.h
@@ -0,0 +1,34 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ *         Alexei Soupikov <alexei.soupikov at intel.com>
+ */
+
+#ifndef __INTEL_GPGPU_H__
+#define __INTEL_GPGPU_H__
+
+#include "cl_utils.h"
+#include "cl_driver.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+
+/* Set the gpgpu related call backs */
+extern void intel_set_gpgpu_callbacks(int device_id);
+
+#endif /* __INTEL_GPGPU_H__ */
+
diff --git a/src/intel/intel_structs.h b/src/intel/intel_structs.h
new file mode 100644
index 0000000..ef76bb4
--- /dev/null
+++ b/src/intel/intel_structs.h
@@ -0,0 +1,461 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ * Copyright 2009 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#ifndef __INTEL_STRUCTS_H__
+#define __INTEL_STRUCTS_H__
+
+#include <stdint.h>
+
+typedef struct gen6_interface_descriptor
+{
+  struct {
+    uint32_t pad6:6;
+    uint32_t kernel_start_pointer:26;
+  } desc0;
+
+  struct {
+    uint32_t pad:7;
+    uint32_t software_exception:1;
+    uint32_t pad2:3;
+    uint32_t maskstack_exception:1;
+    uint32_t pad3:1;
+    uint32_t illegal_opcode_exception:1;
+    uint32_t pad4:2;
+    uint32_t floating_point_mode:1;
+    uint32_t thread_priority:1;
+    uint32_t single_program_flow:1;
+    uint32_t pad5:1;
+    uint32_t pad6:6;
+    uint32_t pad7:6;
+  } desc1;
+
+  struct {
+    uint32_t pad:2;
+    uint32_t sampler_count:3;
+    uint32_t sampler_state_pointer:27;
+  } desc2;
+
+  struct {
+    uint32_t binding_table_entry_count:5;  /* prefetch entries only */
+    uint32_t binding_table_pointer:27;     /* 11 bit only on IVB+ */
+  } desc3;
+
+  struct {
+    uint32_t curbe_read_offset:16;         /* in GRFs */
+    uint32_t curbe_read_len:16;            /* in GRFs */
+  } desc4;
+
+  struct {
+    uint32_t group_threads_num:8;        /* 0..64, 0 - no barrier use */
+    uint32_t barrier_return_byte:8;
+    uint32_t slm_sz:5;                   /* 0..16 - 0K..64K */
+    uint32_t barrier_enable:1;
+    uint32_t rounding_mode:2;
+    uint32_t barrier_return_grf_offset:8;
+  } desc5;
+
+  uint32_t desc6; /* unused */
+  uint32_t desc7; /* unused */
+} gen6_interface_descriptor_t;
+
+typedef struct gen6_surface_state
+{
+  struct {
+    uint32_t cube_pos_z:1;
+    uint32_t cube_neg_z:1;
+    uint32_t cube_pos_y:1;
+    uint32_t cube_neg_y:1;
+    uint32_t cube_pos_x:1;
+    uint32_t cube_neg_x:1;
+    uint32_t pad:2;
+    uint32_t render_cache_read_mode:1;
+    uint32_t cube_map_corner_mode:1;
+    uint32_t mipmap_layout_mode:1;
+    uint32_t vert_line_stride_ofs:1;
+    uint32_t vert_line_stride:1;
+    uint32_t color_blend:1;
+    uint32_t writedisable_blue:1;
+    uint32_t writedisable_green:1;
+    uint32_t writedisable_red:1;
+    uint32_t writedisable_alpha:1;
+    uint32_t surface_format:9;
+    uint32_t data_return_format:1;
+    uint32_t pad0:1;
+    uint32_t surface_type:3;
+  } ss0;
+
+  struct {
+    uint32_t base_addr;
+  } ss1;
+
+  struct {
+    uint32_t render_target_rotation:2;
+    uint32_t mip_count:4;
+    uint32_t width:13;
+    uint32_t height:13;
+  } ss2;
+
+  struct {
+    uint32_t tile_walk:1;
+    uint32_t tiled_surface:1;
+    uint32_t pad:1;
+    uint32_t pitch:18;
+    uint32_t depth:11;
+  } ss3;
+
+  struct {
+    uint32_t multisample_pos_index:3;
+    uint32_t pad:1;
+    uint32_t multisample_count:3;
+    uint32_t pad1:1;
+    uint32_t rt_view_extent:9;
+    uint32_t min_array_elt:11;
+    uint32_t min_lod:4;
+  } ss4;
+
+  struct {
+    uint32_t pad:16;
+    uint32_t cache_control:2;  /* different values for GT and IVB */
+    uint32_t gfdt:1;           /* allows selective flushing of LLC (e.g. for scanout) */
+    uint32_t encrypted_data:1;
+    uint32_t y_offset:4;
+    uint32_t vertical_alignment:1;
+    uint32_t x_offset:7;
+  } ss5;
+
+  uint32_t ss6; /* unused */
+  uint32_t ss7; /* unused */
+} gen6_surface_state_t;
+
+typedef struct gen7_surface_state
+{
+  struct {
+    uint32_t cube_pos_z:1;
+    uint32_t cube_neg_z:1;
+    uint32_t cube_pos_y:1;
+    uint32_t cube_neg_y:1;
+    uint32_t cube_pos_x:1;
+    uint32_t cube_neg_x:1;
+    uint32_t media_boundary_pixel_mode:2;
+    uint32_t render_cache_rw_mode:1;
+    uint32_t pad1:1;
+    uint32_t surface_array_spacing:1;
+    uint32_t vertical_line_stride_offset:1;
+    uint32_t vertical_line_stride:1;
+    uint32_t tile_walk:1;
+    uint32_t tiled_surface:1;
+    uint32_t horizontal_alignment:1;
+    uint32_t vertical_alignment:2;
+    uint32_t surface_format:9;
+    uint32_t pad0:1;
+    uint32_t surface_array:1;
+    uint32_t surface_type:3;
+  } ss0;
+
+  struct {
+    uint32_t base_addr;
+  } ss1;
+
+  struct {
+    uint32_t width:14;
+    uint32_t pad1:2;
+    uint32_t height:14;
+    uint32_t pad0:2;
+  } ss2;
+
+  struct {
+    uint32_t pitch:18;
+    uint32_t pad0:3;
+    uint32_t depth:11;
+  } ss3;
+
+  union {
+    struct {
+      uint32_t mulsample_pal_idx:3;
+      uint32_t numer_mulsample:3;
+      uint32_t mss_fmt:1;
+      uint32_t rt_view_extent:11;
+      uint32_t min_array_element:11;
+      uint32_t rt_rotate:2;
+      uint32_t pad0:1;
+    } not_str_buf;
+  } ss4;
+
+  struct {
+    uint32_t mip_count:4;
+    uint32_t surface_min_load:4;
+    uint32_t pad2:6;
+    uint32_t coherence_type:1;
+    uint32_t stateless_force_write_thru:1;
+    uint32_t cache_control:4;
+    uint32_t y_offset:4;
+    uint32_t pad0:1;
+    uint32_t x_offset:7;
+  } ss5;
+
+  uint32_t ss6; /* unused */
+
+  struct {
+    uint32_t min_lod:12;
+    uint32_t pad0:4;
+    uint32_t shader_a:3;
+    uint32_t shader_b:3;
+    uint32_t shader_g:3;
+    uint32_t shader_r:3;
+    uint32_t pad1:4;
+  } ss7;
+} gen7_surface_state_t;
+
+STATIC_ASSERT(sizeof(gen6_surface_state_t) == sizeof(gen7_surface_state_t));
+static const size_t surface_state_sz = sizeof(gen6_surface_state_t);
+
+typedef struct gen6_vfe_state_inline
+{
+  struct {
+    uint32_t per_thread_scratch_space:4;
+    uint32_t pad3:3;
+    uint32_t extend_vfe_state_present:1;
+    uint32_t pad2:2;
+    uint32_t scratch_base:22;
+  } vfe0;
+
+  struct {
+    uint32_t debug_counter_control:2;
+    uint32_t gpgpu_mode:1;          /* 0 for SNB!!! */
+    uint32_t gateway_mmio_access:2;
+    uint32_t fast_preempt:1;
+    uint32_t bypass_gateway_ctl:1;  /* 0 - legacy, 1 - no open/close */
+    uint32_t reset_gateway_timer:1;
+    uint32_t urb_entries:8;
+    uint32_t max_threads:16;
+  } vfe1;
+
+  struct {
+    uint32_t pad8:8;
+    uint32_t debug_object_id:24;
+  } vfe2;
+
+  struct {
+    uint32_t curbe_size:16; /* in GRFs */
+    uint32_t urb_size:16;  /* in GRFs */
+  } vfe3;
+
+  struct {
+    uint32_t scoreboard_mask:32;  /* 1 - enable the corresponding dependency */
+  } vfe4;
+
+  struct {
+    uint32_t scoreboard0_dx:4;
+    uint32_t scoreboard0_dy:4;
+    uint32_t scoreboard1_dx:4;
+    uint32_t scoreboard1_dy:4;
+    uint32_t scoreboard2_dx:4;
+    uint32_t scoreboard2_dy:4;
+    uint32_t scoreboard3_dx:4;
+    uint32_t scoreboard3_dy:4;
+  } vfe5;
+
+  struct {
+    uint32_t scoreboard4_dx:4;
+    uint32_t scoreboard4_dy:4;
+    uint32_t scoreboard5_dx:4;
+    uint32_t scoreboard5_dy:4;
+    uint32_t scoreboard6_dx:4;
+    uint32_t scoreboard6_dy:4;
+    uint32_t scoreboard7_dx:4;
+    uint32_t scoreboard7_dy:4;
+  } vfe6;
+} gen6_vfe_state_inline_t;
+
+typedef struct gen6_pipe_control
+{
+  struct {
+    uint32_t length : BITFIELD_RANGE(0, 7);
+    uint32_t reserved : BITFIELD_RANGE(8, 15);
+    uint32_t instruction_subopcode : BITFIELD_RANGE(16, 23);
+    uint32_t instruction_opcode : BITFIELD_RANGE(24, 26);
+    uint32_t instruction_pipeline : BITFIELD_RANGE(27, 28);
+    uint32_t instruction_type : BITFIELD_RANGE(29, 31);
+  } dw0;
+
+  struct {
+    uint32_t depth_cache_flush_enable : BITFIELD_BIT(0);
+    uint32_t stall_at_pixel_scoreboard : BITFIELD_BIT(1);
+    uint32_t state_cache_invalidation_enable : BITFIELD_BIT(2);
+    uint32_t constant_cache_invalidation_enable : BITFIELD_BIT(3);
+    uint32_t vf_cache_invalidation_enable : BITFIELD_BIT(4);
+    uint32_t dc_flush_enable : BITFIELD_BIT(5);
+    uint32_t protected_memory_app_id : BITFIELD_BIT(6);
+    uint32_t pipe_control_flush_enable : BITFIELD_BIT(7);
+    uint32_t notify_enable : BITFIELD_BIT(8);
+    uint32_t indirect_state_pointers_disable : BITFIELD_BIT(9);
+    uint32_t texture_cache_invalidation_enable : BITFIELD_BIT(10);
+    uint32_t instruction_cache_invalidate_enable : BITFIELD_BIT(11);
+    uint32_t render_target_cache_flush_enable : BITFIELD_BIT(12);
+    uint32_t depth_stall_enable : BITFIELD_BIT(13);
+    uint32_t post_sync_operation : BITFIELD_RANGE(14, 15);
+    uint32_t generic_media_state_clear : BITFIELD_BIT(16);
+    uint32_t synchronize_gfdt_surface : BITFIELD_BIT(17);
+    uint32_t tlb_invalidate : BITFIELD_BIT(18);
+    uint32_t global_snapshot_count_reset : BITFIELD_BIT(19);
+    uint32_t cs_stall : BITFIELD_BIT(20);
+    uint32_t store_data_index : BITFIELD_BIT(21);
+    uint32_t protected_memory_enable : BITFIELD_BIT(22);
+    uint32_t reserved : BITFIELD_RANGE(23, 31);
+  } dw1;
+
+  struct {
+    uint32_t reserved : BITFIELD_RANGE(0, 1);
+    uint32_t destination_address_type : BITFIELD_BIT(2);
+    uint32_t address : BITFIELD_RANGE(3, 31);
+  } dw2;
+
+  struct {
+    uint32_t data;
+  } dw3;
+
+  struct {
+    uint32_t data;
+  } dw4;
+} gen6_pipe_control_t;
+
+typedef struct gen6_sampler_state
+{
+  struct {
+    uint32_t shadow_function:3; 
+    uint32_t lod_bias:11; 
+    uint32_t min_filter:3; 
+    uint32_t mag_filter:3; 
+    uint32_t mip_filter:2; 
+    uint32_t base_level:5; 
+    uint32_t min_mag_neq:1;
+    uint32_t lod_preclamp:1; 
+    uint32_t default_color_mode:1; 
+    uint32_t pad0:1;
+    uint32_t disable:1; 
+  } ss0;
+
+  struct {
+    uint32_t r_wrap_mode:3; 
+    uint32_t t_wrap_mode:3; 
+    uint32_t s_wrap_mode:3; 
+    uint32_t cube_control_mode:1;
+    uint32_t pad:2;
+    uint32_t max_lod:10; 
+    uint32_t min_lod:10; 
+  } ss1;
+
+  struct {
+    uint32_t pad:5;
+    uint32_t default_color_pointer:27; 
+  } ss2;
+
+  struct {
+    uint32_t non_normalized_coord:1;
+    uint32_t pad:12;
+    uint32_t address_round:6;
+    uint32_t max_aniso:3; 
+    uint32_t chroma_key_mode:1; 
+    uint32_t chroma_key_index:2; 
+    uint32_t chroma_key_enable:1; 
+    uint32_t monochrome_filter_width:3; 
+    uint32_t monochrome_filter_height:3; 
+  } ss3;
+} gen6_sampler_state_t;
+
+typedef struct gen7_sampler_border_color {
+    float r,g,b,a;
+} gen7_sampler_border_color_t;
+
+typedef struct gen7_sampler_state
+{
+  struct {
+    uint32_t aniso_algorithm:1;
+    uint32_t lod_bias:13;
+    uint32_t min_filter:3;
+    uint32_t mag_filter:3;
+    uint32_t mip_filter:2;
+    uint32_t base_level:5;
+    uint32_t pad1:1;
+    uint32_t lod_preclamp:1;
+    uint32_t default_color_mode:1;
+    uint32_t pad0:1;
+    uint32_t disable:1;
+  } ss0;
+
+  struct {
+    uint32_t cube_control_mode:1;
+    uint32_t shadow_function:3;
+    uint32_t pad:4;
+    uint32_t max_lod:12;
+    uint32_t min_lod:12;
+  } ss1;
+
+  struct {
+    uint32_t pad:5;
+    uint32_t default_color_pointer:27;
+  } ss2;
+
+  struct {
+    uint32_t r_wrap_mode:3;
+    uint32_t t_wrap_mode:3;
+    uint32_t s_wrap_mode:3;
+    uint32_t pad:1;
+    uint32_t non_normalized_coord:1;
+    uint32_t trilinear_quality:2;
+    uint32_t address_round:6;
+    uint32_t max_aniso:3;
+    uint32_t chroma_key_mode:1;
+    uint32_t chroma_key_index:2;
+    uint32_t chroma_key_enable:1;
+    uint32_t pad0:6;
+  } ss3;
+} gen7_sampler_state_t;
+
+STATIC_ASSERT(sizeof(gen6_sampler_state_t) == sizeof(gen7_sampler_state_t));
+
+#undef BITFIELD_BIT
+#undef BITFIELD_RANGE
+
+#endif /* __INTEL_STRUCTS_H__ */
+
diff --git a/src/kernels/cl_internal_copy_buf_align16.cl b/src/kernels/cl_internal_copy_buf_align16.cl
new file mode 100644
index 0000000..1abb4e9
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_align16.cl
@@ -0,0 +1,12 @@
+kernel void __cl_copy_region_align16 ( global float* src, unsigned int src_offset,
+                                      global float* dst, unsigned int dst_offset,
+				      unsigned int size)
+{
+    int i = get_global_id(0) * 4;
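+    /* Each work item copies four consecutive floats; 'size' is in units of four floats. */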
+    if (i < size*4) {
+        dst[i+dst_offset] = src[i+src_offset];
+        dst[i+dst_offset + 1] = src[i+src_offset + 1];
+        dst[i+dst_offset + 2] = src[i+src_offset + 2];
+        dst[i+dst_offset + 3] = src[i+src_offset + 3];
+    }
+}
diff --git a/src/kernels/cl_internal_copy_buf_align4.cl b/src/kernels/cl_internal_copy_buf_align4.cl
new file mode 100644
index 0000000..27174ca
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_align4.cl
@@ -0,0 +1,8 @@
+kernel void __cl_copy_region_align4 ( global float* src, unsigned int src_offset,
+                                     global float* dst, unsigned int dst_offset,
+				     unsigned int size)
+{
+    int i = get_global_id(0);
+    if (i < size)
+        dst[i+dst_offset] = src[i+src_offset];
+}
diff --git a/src/kernels/cl_internal_copy_buf_rect.cl b/src/kernels/cl_internal_copy_buf_rect.cl
new file mode 100644
index 0000000..71e7484
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_rect.cl
@@ -0,0 +1,15 @@
+kernel void __cl_copy_buffer_rect ( global char* src, global char* dst,
+                                          unsigned int region0, unsigned int region1, unsigned int region2,
+                                          unsigned int src_offset, unsigned int dst_offset,
+                                          unsigned int src_row_pitch, unsigned int src_slice_pitch,
+                                          unsigned int dst_row_pitch, unsigned int dst_slice_pitch)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_offset += k * src_slice_pitch + j * src_row_pitch + i;
+  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i;
+  dst[dst_offset] = src[src_offset];
+}
diff --git a/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl b/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl
new file mode 100644
index 0000000..e02d0e5
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl
@@ -0,0 +1,28 @@
+kernel void __cl_copy_region_unalign_dst_offset ( global int* src, unsigned int src_offset,
+                                     global int* dst, unsigned int dst_offset,
+				     unsigned int size,
+				     unsigned int first_mask, unsigned int last_mask,
+				     unsigned int shift, unsigned int dw_mask)
+{
+    int i = get_global_id(0);
+    unsigned int tmp = 0;
+
+    if (i > size -1)
+        return;
+
+    /* Last dword: be careful not to overflow the source. */
+    if ((i == size - 1) && ((last_mask & (~(~dw_mask >> shift))) == 0)) {
+        tmp = ((src[src_offset + i] & ~dw_mask) >> shift);
+    } else {
+        tmp = ((src[src_offset + i] & ~dw_mask) >> shift)
+             | ((src[src_offset + i + 1] & dw_mask) << (32 - shift));
+    }
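+    /* tmp is now one dword of destination data reassembled from bits of
+       src[src_offset + i] and (except at the tail) src[src_offset + i + 1];
+       shift and dw_mask are supplied by the host and not shown here. */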
+
+    if (i == 0) {
+        dst[dst_offset] = (dst[dst_offset] & first_mask) | (tmp & (~first_mask));
+    } else if (i == size - 1) {
+        dst[i+dst_offset] = (tmp & last_mask) | (dst[i+dst_offset] & (~last_mask));
+    } else {
+        dst[i+dst_offset] = tmp;
+    }
+}
diff --git a/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl b/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl
new file mode 100644
index 0000000..83b6e97
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_region_unalign_same_offset ( global int* src, unsigned int src_offset,
+                                     global int* dst, unsigned int dst_offset,
+				     unsigned int size,
+				     unsigned int first_mask, unsigned int last_mask)
+{
+    int i = get_global_id(0);
+    if (i > size -1)
+       return;
+
+    if (i == 0) {
+        dst[dst_offset] = (dst[dst_offset] & first_mask)
+             | (src[src_offset] & (~first_mask));
+    } else if (i == size - 1) {
+        dst[i+dst_offset] = (src[i+src_offset] & last_mask)
+            | (dst[i+dst_offset] & (~last_mask));
+    } else {
+        dst[i+dst_offset] = src[i+src_offset];
+    }
+}
diff --git a/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl b/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl
new file mode 100644
index 0000000..ce0aa1d
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl
@@ -0,0 +1,29 @@
+kernel void __cl_copy_region_unalign_src_offset ( global int* src, unsigned int src_offset,
+                                     global int* dst, unsigned int dst_offset,
+				     unsigned int size,
+				     unsigned int first_mask, unsigned int last_mask,
+				     unsigned int shift, unsigned int dw_mask, int src_less)
+{
+    int i = get_global_id(0);
+    unsigned int tmp = 0;
+
+    if (i > size -1)
+        return;
+
+    if (i == 0) {
+        tmp = ((src[src_offset + i] & dw_mask) << shift);
+    } else if (src_less && i == size - 1) { // do not exceed the bounds of the source
+        tmp = ((src[src_offset + i - 1] & ~dw_mask) >> (32 - shift));
+    } else {
+        tmp = ((src[src_offset + i - 1] & ~dw_mask) >> (32 - shift))
+             | ((src[src_offset + i] & dw_mask) << shift);
+    }
+
+    if (i == 0) {
+        dst[dst_offset] = (dst[dst_offset] & first_mask) | (tmp & (~first_mask));
+    } else if (i == size - 1) {
+        dst[i+dst_offset] = (tmp & last_mask) | (dst[i+dst_offset] & (~last_mask));
+    } else {
+        dst[i+dst_offset] = tmp;
+    }
+}
diff --git a/src/kernels/cl_internal_copy_buffer_to_image_2d.cl b/src/kernels/cl_internal_copy_buffer_to_image_2d.cl
new file mode 100644
index 0000000..a218b58
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buffer_to_image_2d.cl
@@ -0,0 +1,18 @@
+kernel void __cl_copy_buffer_to_image_2d(__read_only image2d_t image, global uchar* buffer,
+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
+                                        unsigned int src_offset)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  uint4 color = (uint4)(0);
+  int2 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  src_offset += (k * region1 + j) * region0 + i;
+  color.x = buffer[src_offset];
+  write_imageui(image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_buffer_to_image_3d.cl b/src/kernels/cl_internal_copy_buffer_to_image_3d.cl
new file mode 100644
index 0000000..84d3b27
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buffer_to_image_3d.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_buffer_to_image_3d(__read_only image3d_t image, global uchar* buffer,
+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
+                                        unsigned int src_offset)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  uint4 color = (uint4)(0);
+  int4 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  dst_coord.z = dst_origin2 + k;
+  src_offset += (k * region1 + j) * region0 + i;
+  color.x = buffer[src_offset];
+  write_imageui(image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_image_1d_to_1d.cl b/src/kernels/cl_internal_copy_image_1d_to_1d.cl
new file mode 100644
index 0000000..dca82b2
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_1d_to_1d.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_image_1d_to_1d(__read_only image1d_t src_image, __write_only image1d_t dst_image,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int src_coord;
+  int dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord = src_origin0 + i;
+  dst_coord = dst_origin0 + i;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_image_2d_to_2d.cl b/src/kernels/cl_internal_copy_image_2d_to_2d.cl
new file mode 100644
index 0000000..c5eaab1
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_2d_to_2d.cl
@@ -0,0 +1,21 @@
+kernel void __cl_copy_image_2d_to_2d(__read_only image2d_t src_image, __write_only image2d_t dst_image,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int2 src_coord;
+  int2 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_image_2d_to_3d.cl b/src/kernels/cl_internal_copy_image_2d_to_3d.cl
new file mode 100644
index 0000000..4c73a74
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_2d_to_3d.cl
@@ -0,0 +1,22 @@
+kernel void __cl_copy_image_2d_to_3d(__read_only image2d_t src_image, __write_only image3d_t dst_image,
+                                         unsigned int region0, unsigned int region1, unsigned int region2,
+                                         unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                                         unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int2 src_coord;
+  int4 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  dst_coord.z = dst_origin2 + k;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_image_2d_to_buffer.cl b/src/kernels/cl_internal_copy_image_2d_to_buffer.cl
new file mode 100644
index 0000000..b6c352e
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_2d_to_buffer.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_image_2d_to_buffer( __read_only image2d_t image, global uchar* buffer,
+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                                        unsigned int dst_offset)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  uint4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int2 src_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  color = read_imageui(image, sampler, src_coord);
+  dst_offset += (k * region1 + j) * region0 + i;
+  buffer[dst_offset] = color.x;
+}
diff --git a/src/kernels/cl_internal_copy_image_3d_to_2d.cl b/src/kernels/cl_internal_copy_image_3d_to_2d.cl
new file mode 100644
index 0000000..e0effa0
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_3d_to_2d.cl
@@ -0,0 +1,22 @@
+kernel void __cl_copy_image_3d_to_2d(__read_only image3d_t src_image, __write_only image2d_t dst_image,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int4 src_coord;
+  int2 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  src_coord.z = src_origin2 + k;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_image_3d_to_3d.cl b/src/kernels/cl_internal_copy_image_3d_to_3d.cl
new file mode 100644
index 0000000..de80a0a
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_3d_to_3d.cl
@@ -0,0 +1,23 @@
+kernel void __cl_copy_image_3d_to_3d(__read_only image3d_t src_image, __write_only image3d_t dst_image,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int4 src_coord;
+  int4 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  src_coord.z = src_origin2 + k;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  dst_coord.z = dst_origin2 + k;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/src/kernels/cl_internal_copy_image_3d_to_buffer.cl b/src/kernels/cl_internal_copy_image_3d_to_buffer.cl
new file mode 100644
index 0000000..dcfc8a2
--- /dev/null
+++ b/src/kernels/cl_internal_copy_image_3d_to_buffer.cl
@@ -0,0 +1,22 @@
+#define IMAGE_TYPE image3d_t
+#define COORD_TYPE int4
+kernel void __cl_copy_image_3d_to_buffer ( __read_only IMAGE_TYPE image, global uchar* buffer,
+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                                        unsigned int dst_offset)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  uint4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  COORD_TYPE src_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  src_coord.z = src_origin2 + k;
+  color = read_imageui(image, sampler, src_coord);
+  dst_offset += (k * region1 + j) * region0 + i;
+  buffer[dst_offset] = color.x;
+}
diff --git a/src/kernels/cl_internal_fill_buf_align128.cl b/src/kernels/cl_internal_fill_buf_align128.cl
new file mode 100644
index 0000000..552820c
--- /dev/null
+++ b/src/kernels/cl_internal_fill_buf_align128.cl
@@ -0,0 +1,9 @@
+kernel void __cl_fill_region_align128 ( global float16* dst, float16 pattern0,
+                                        unsigned int offset, unsigned int size, float16 pattern1)
+{
+    int i = get_global_id(0);
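+    /* Each work item writes two float16 values (128 bytes); 'size' counts these 128-byte blocks. */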
+    if (i < size) {
+        dst[i*2+offset] = pattern0;
+        dst[i*2+offset+1] = pattern1;
+    }
+}
diff --git a/src/kernels/cl_internal_fill_buf_align2.cl b/src/kernels/cl_internal_fill_buf_align2.cl
new file mode 100644
index 0000000..0b9a4cf
--- /dev/null
+++ b/src/kernels/cl_internal_fill_buf_align2.cl
@@ -0,0 +1,8 @@
+kernel void __cl_fill_region_align2 ( global char2 * dst, char2 pattern,
+			             unsigned int offset, unsigned int size)
+{
+    int i = get_global_id(0);
+    if (i < size) {
+        dst[i+offset] = pattern;
+    }
+}
diff --git a/src/kernels/cl_internal_fill_buf_align4.cl b/src/kernels/cl_internal_fill_buf_align4.cl
new file mode 100644
index 0000000..aefd92f
--- /dev/null
+++ b/src/kernels/cl_internal_fill_buf_align4.cl
@@ -0,0 +1,8 @@
+kernel void __cl_fill_region_align4 ( global float* dst, float pattern,
+			             unsigned int offset, unsigned int size)
+{
+    int i = get_global_id(0);
+    if (i < size) {
+        dst[i+offset] = pattern;
+    }
+}
diff --git a/src/kernels/cl_internal_fill_buf_align8.cl b/src/kernels/cl_internal_fill_buf_align8.cl
new file mode 100644
index 0000000..edaff77
--- /dev/null
+++ b/src/kernels/cl_internal_fill_buf_align8.cl
@@ -0,0 +1,14 @@
+#define COMPILER_ABS_FUNC_N(N) \
+    kernel void __cl_fill_region_align8_##N ( global float##N* dst, float##N pattern, \
+                                              unsigned int offset, unsigned int size) { \
+         int i = get_global_id(0); \
+         if (i < size) { \
+             dst[i+offset] = pattern; \
+         }  \
+    }
+
+
+COMPILER_ABS_FUNC_N(2)
+COMPILER_ABS_FUNC_N(4)
+COMPILER_ABS_FUNC_N(8)
+COMPILER_ABS_FUNC_N(16)
diff --git a/src/kernels/cl_internal_fill_buf_unalign.cl b/src/kernels/cl_internal_fill_buf_unalign.cl
new file mode 100644
index 0000000..90762b0
--- /dev/null
+++ b/src/kernels/cl_internal_fill_buf_unalign.cl
@@ -0,0 +1,8 @@
+kernel void __cl_fill_region_unalign ( global char * dst, char pattern,
+			               unsigned int offset, unsigned int size)
+{
+    int i = get_global_id(0);
+    if (i < size) {
+        dst[i+offset] = pattern;
+    }
+}
diff --git a/src/kernels/cl_internal_fill_image_1d.cl b/src/kernels/cl_internal_fill_image_1d.cl
new file mode 100644
index 0000000..b3b0cbf
--- /dev/null
+++ b/src/kernels/cl_internal_fill_image_1d.cl
@@ -0,0 +1,14 @@
+kernel void __cl_fill_image_1d( __write_only image1d_t image, float4 pattern,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  coord = origin0 + i;
+  write_imagef(image, coord, pattern);
+
+}
diff --git a/src/kernels/cl_internal_fill_image_1d_array.cl b/src/kernels/cl_internal_fill_image_1d_array.cl
new file mode 100644
index 0000000..f1eb241
--- /dev/null
+++ b/src/kernels/cl_internal_fill_image_1d_array.cl
@@ -0,0 +1,15 @@
+kernel void __cl_fill_image_1d_array( __write_only image1d_array_t image, float4 pattern,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int2 coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  coord.x = origin0 + i;
+  coord.y = origin2 + k;
+  write_imagef(image, coord, pattern);
+
+}
diff --git a/src/kernels/cl_internal_fill_image_2d.cl b/src/kernels/cl_internal_fill_image_2d.cl
new file mode 100644
index 0000000..0e29f3e
--- /dev/null
+++ b/src/kernels/cl_internal_fill_image_2d.cl
@@ -0,0 +1,15 @@
+kernel void __cl_fill_image_2d( __write_only image2d_t image, float4 pattern,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int2 coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  coord.x = origin0 + i;
+  coord.y = origin1 + j;
+  write_imagef(image, coord, pattern);
+
+}
diff --git a/src/kernels/cl_internal_fill_image_2d_array.cl b/src/kernels/cl_internal_fill_image_2d_array.cl
new file mode 100644
index 0000000..f29c9e7
--- /dev/null
+++ b/src/kernels/cl_internal_fill_image_2d_array.cl
@@ -0,0 +1,16 @@
+kernel void __cl_fill_image_2d_array( __write_only image2d_array_t image, float4 pattern,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  coord.x = origin0 + i;
+  coord.y = origin1 + j;
+  coord.z = origin2 + k;
+  write_imagef(image, coord, pattern);
+
+}
diff --git a/src/kernels/cl_internal_fill_image_3d.cl b/src/kernels/cl_internal_fill_image_3d.cl
new file mode 100644
index 0000000..042b8ab
--- /dev/null
+++ b/src/kernels/cl_internal_fill_image_3d.cl
@@ -0,0 +1,16 @@
+kernel void __cl_fill_image_3d( __write_only image3d_t image, float4 pattern,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  coord.x = origin0 + i;
+  coord.y = origin1 + j;
+  coord.z = origin2 + k;
+  write_imagef(image, coord, pattern);
+
+}
diff --git a/src/performance.c b/src/performance.c
new file mode 100644
index 0000000..85cd481
--- /dev/null
+++ b/src/performance.c
@@ -0,0 +1,324 @@
+#include <performance.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <sys/time.h>
+#include <pthread.h>
+
+#define MAX_KERNEL_NAME_LENGTH 100
+#define MAX_KERNEL_EXECUTION_COUNT 100000
+#define MAX_KERNEL_BUILD_OPT 1000
+typedef struct kernel_storage_node
+{
+  char kernel_name[MAX_KERNEL_NAME_LENGTH];
+  float kernel_times[MAX_KERNEL_EXECUTION_COUNT];
+  char build_option[MAX_KERNEL_BUILD_OPT];
+  int current_count;
+  float kernel_sum_time;
+  struct kernel_storage_node *next;
+} kernel_storage_node;
+
+typedef struct context_storage_node
+{
+  uintptr_t context_id;
+  kernel_storage_node *kernels_storage;
+  char max_time_kernel_name[MAX_KERNEL_NAME_LENGTH];
+  float kernel_max_time;
+  int kernel_count;
+  struct context_storage_node *next;
+} context_storage_node;
+
+typedef struct storage
+{
+  context_storage_node * context_storage;
+} storage;
+
+
+
+static storage record;
+static int atexit_registered = 0;
+
+
+static context_storage_node * prev_context_pointer = NULL;
+static kernel_storage_node * prev_kernel_pointer = NULL;
+
+static context_storage_node * find_context(cl_context context)
+{
+  if(NULL != prev_context_pointer )
+  {
+    if(prev_context_pointer->context_id == (uintptr_t)context)
+      return prev_context_pointer;
+  }
+
+  if(NULL == record.context_storage)
+  {
+    record.context_storage = (context_storage_node *) malloc(sizeof(context_storage_node));
+    record.context_storage->context_id = (uintptr_t)context;
+    record.context_storage->kernels_storage = NULL;
+    record.context_storage->kernel_max_time = 0.0f;
+    record.context_storage->next = NULL;
+    record.context_storage->kernel_count = 0;
+    return record.context_storage;
+  }
+
+  context_storage_node *pre = record.context_storage;
+  context_storage_node *cur = record.context_storage;
+  while(NULL !=cur && (uintptr_t)context != cur->context_id )
+  {
+    pre = cur;
+    cur = cur->next;
+  }
+  if(NULL != cur)
+    return cur;
+
+  pre->next = (context_storage_node *)malloc(sizeof(context_storage_node));
+  pre = pre->next;
+  pre->context_id = (uintptr_t)context;
+  pre->kernels_storage = NULL;
+  pre->kernel_max_time = 0.0f;
+  pre->next = NULL;
+  pre->kernel_count = 0;
+  return pre;
+}
+
+static kernel_storage_node * find_kernel(context_storage_node *p_context, const char *kernel_name, const char *build_opt)
+{
+  if(NULL != prev_kernel_pointer && NULL != prev_context_pointer &&
+     p_context == prev_context_pointer &&
+     !strncmp(kernel_name, prev_kernel_pointer->kernel_name, MAX_KERNEL_NAME_LENGTH) &&
+     !strncmp(build_opt, prev_kernel_pointer->build_option, MAX_KERNEL_BUILD_OPT))
+    return prev_kernel_pointer;
+
+  if(NULL == p_context)
+    return NULL;
+
+  if(NULL == p_context->kernels_storage)
+  {
+    p_context->kernels_storage = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
+    p_context->kernel_count++;
+    strncpy(p_context->kernels_storage->kernel_name,kernel_name, MAX_KERNEL_NAME_LENGTH);
+    p_context->kernels_storage->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+    strncpy(p_context->kernels_storage->build_option, build_opt, MAX_KERNEL_BUILD_OPT);
+    p_context->kernels_storage->build_option[MAX_KERNEL_BUILD_OPT - 1] = '\0';
+    p_context->kernels_storage->current_count = 0;
+    p_context->kernels_storage->kernel_sum_time = 0.0f;
+    p_context->kernels_storage->next = NULL;
+    return p_context->kernels_storage;
+  }
+
+  kernel_storage_node *pre = p_context->kernels_storage;
+  kernel_storage_node *cur = p_context->kernels_storage;
+  while(NULL != cur &&
+        (strncmp(cur->kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH) ||
+         strncmp(cur->build_option, build_opt, MAX_KERNEL_BUILD_OPT)))
+  {
+    pre = cur;
+    cur = cur->next;
+  }
+  if(NULL != cur)
+    return cur;
+
+  p_context->kernel_count++;
+  pre->next = (kernel_storage_node *)malloc(sizeof(kernel_storage_node));
+  pre = pre->next;
+  pre->current_count = 0;
+  pre->kernel_sum_time = 0.0f;
+  pre->next = NULL;
+  strncpy(pre->kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH);
+  pre->kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+  strncpy(pre->build_option, build_opt, MAX_KERNEL_BUILD_OPT);
+  pre->build_option[MAX_KERNEL_BUILD_OPT - 1] = '\0';
+  return pre;
+}
+
+static void free_storage()
+{
+  context_storage_node *p_context = record.context_storage;
+  while(NULL != p_context)
+  {
+    context_storage_node *p_tmp_context = p_context->next;
+    kernel_storage_node *p_kernel = p_context->kernels_storage;
+    while(NULL != p_kernel)
+    {
+      kernel_storage_node *p_tmp_kernel = p_kernel->next;
+      free(p_kernel);
+      p_kernel = p_tmp_kernel;
+    }
+    free(p_context);
+    p_context = p_tmp_context;
+  }
+}
+
+typedef struct time_element
+{
+  char kernel_name[MAX_KERNEL_NAME_LENGTH];
+  float kernel_sum_time;
+  int kernel_execute_count;
+  double dev;
+  float kernel_times[MAX_KERNEL_EXECUTION_COUNT];
+  uint32_t time_index;
+} time_element;
+
+static int cmp(const void *a, const void *b)
+{
+  if(((time_element *)a)->kernel_sum_time < ((time_element *)b)->kernel_sum_time)
+    return 1;
+  else if(((time_element *)a)->kernel_sum_time > ((time_element *)b)->kernel_sum_time)
+    return -1;
+  else
+    return 0;
+}
+
+static void print_time_info()
+{
+  context_storage_node *p_context = record.context_storage;
+  if(NULL == p_context)
+  {
+    printf("Nothing to output !\n");
+    return;
+  }
+
+  int tmp_context_id = 0;
+  while(NULL != p_context)
+  {
+    printf("[------------ CONTEXT %4d ------------]\n", tmp_context_id++);
+    printf("  ->>>> KERNELS TIME SUMMARY <<<<-\n");
+
+    kernel_storage_node *p_kernel = p_context->kernels_storage;
+    kernel_storage_node *p_tmp_kernel = p_kernel;
+    time_element *te = (time_element *)malloc(sizeof(time_element)*p_context->kernel_count);
+    memset(te, 0, sizeof(time_element)*p_context->kernel_count);
+    int i = -1, j = 0, k = 0;
+    while(NULL != p_tmp_kernel)
+    {
+      for(k=0; k<=i; k++)
+      {
+        if(!strncmp(te[k].kernel_name, p_tmp_kernel->kernel_name, MAX_KERNEL_NAME_LENGTH))
+          break;
+      }
+      if(k == i+1)
+      {
+        i++;
+        k = i;
+      }
+      te[k].kernel_execute_count += p_tmp_kernel->current_count;
+      strncpy(te[k].kernel_name, p_tmp_kernel->kernel_name, MAX_KERNEL_NAME_LENGTH);
+      te[k].kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+      te[k].kernel_sum_time += p_tmp_kernel->kernel_sum_time;
+      for(j=0; j != p_tmp_kernel->current_count; ++j)
+        te[k].kernel_times[te[k].time_index++] = p_tmp_kernel->kernel_times[j];
+      p_tmp_kernel = p_tmp_kernel->next;
+    }
+
+    for(k=0; k<=i; k++)
+    {
+      float average = te[k].kernel_sum_time / te[k].kernel_execute_count;
+      double sumsquare = 0.0;
+      for(j=0; j<te[k].time_index; ++j)
+        sumsquare += pow((te[k].kernel_times[j] - average), 2.0);
+      te[k].dev = sqrt(sumsquare / te[k].kernel_execute_count);
+    }
+
+    float sum_time = 0.0f;
+    qsort((void *)te, p_context->kernel_count, sizeof(time_element), cmp);
+    for(j=0; j<=i; ++j)
+      sum_time += te[j].kernel_sum_time;
+
+    for(j=0; j<=i; ++j)
+    {
+      printf("    [Kernel Name: %-30s Time(ms): (%4.1f%%) %9.2f  Count: %-7d  Ave(ms): %7.2f  Dev: %.1lf%%]\n",
+             te[j].kernel_name,
+             te[j].kernel_sum_time / sum_time * 100,
+             te[j].kernel_sum_time,
+             te[j].kernel_execute_count,
+             te[j].kernel_sum_time / te[j].kernel_execute_count,
+             te[j].dev / te[j].kernel_sum_time * te[j].kernel_execute_count * 100);
+    }
+    free(te);
+    printf("    Total : %.2f\n", sum_time);
+    if(2 != b_output_kernel_perf)
+    {
+      printf("[------------  CONTEXT ENDS------------]\n\n");
+      p_context = p_context->next;
+      continue;
+    }
+    p_tmp_kernel = p_kernel;
+    printf("\n  ->>>> KERNELS TIME DETAIL <<<<-\n");
+    while(NULL != p_kernel)
+    {
+      printf("    [Kernel Name : %30s   Time(ms): %.2f]\n", p_kernel->kernel_name, p_kernel->kernel_sum_time);
+      if(*p_kernel->build_option != '\0')
+      {
+        int count = 0;
+        printf("      ->Build Options : ");
+        while(p_kernel->build_option[count] != '\0' )
+        {
+          printf("%c", p_kernel->build_option[count++]);
+          if(count % 100 == 0)
+            printf("\n                         ");
+        }
+        printf("\n");
+      }
+      for(i=0; i!=p_kernel->current_count; ++i)
+        printf("      Execution Round%5d : %.2f (ms)\n", i+1, p_kernel->kernel_times[i]);
+      p_kernel = p_kernel->next;
+    }
+    printf("[------------  CONTEXT ENDS------------]\n\n");
+    p_context = p_context->next;
+  }
+  free_storage();
+}
+
+
+static void insert(cl_context context, const char *kernel_name, const char *build_opt, float time)
+{
+  if(!atexit_registered)
+  {
+    atexit_registered = 1;
+    atexit(print_time_info);
+  }
+  context_storage_node *p_context = find_context(context);
+  kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name, build_opt);
+  prev_context_pointer = p_context;
+  prev_kernel_pointer = p_kernel;
+  p_kernel->kernel_times[p_kernel->current_count++] = time;
+  p_kernel->kernel_sum_time += time;
+  if(p_kernel->kernel_sum_time > p_context->kernel_max_time)
+  {
+    p_context->kernel_max_time = p_kernel->kernel_sum_time;
+    strncpy(p_context->max_time_kernel_name, kernel_name, MAX_KERNEL_NAME_LENGTH);
+    p_context->max_time_kernel_name[MAX_KERNEL_NAME_LENGTH - 1] = '\0';
+  }
+}
+
+
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+int b_output_kernel_perf = 0;
+static struct timeval start, end;
+
+void initialize_env_var()
+{
+  char *env = getenv("OCL_OUTPUT_KERNEL_PERF");
+  if(NULL == env || !strncmp(env,"0", 1))
+    b_output_kernel_perf = 0;
+  else if(!strncmp(env,"1", 1))
+    b_output_kernel_perf = 1;
+  else
+    b_output_kernel_perf = 2;
+}
+
+void time_start(cl_context context, const char * kernel_name, cl_command_queue cq)
+{
+  pthread_mutex_lock(&mutex);
+  gettimeofday(&start, NULL);
+}
+
+void time_end(cl_context context, const char * kernel_name, const char * build_opt, cl_command_queue cq)
+{
+  clFinish(cq);
+  gettimeofday(&end, NULL);
+  float t = (end.tv_sec - start.tv_sec)*1000 + (end.tv_usec - start.tv_usec)/1000.0f;
+  insert(context, kernel_name, build_opt, t);
+  pthread_mutex_unlock(&mutex);
+}
diff --git a/src/performance.h b/src/performance.h
new file mode 100644
index 0000000..1e75054
--- /dev/null
+++ b/src/performance.h
@@ -0,0 +1,12 @@
+#ifndef __PERFORMANCE_H__
+#define __PERFORMANCE_H__
+#include "CL/cl.h"
+
+
+extern int b_output_kernel_perf;
+void time_start(cl_context context, const char * kernel_name, cl_command_queue cq);
+void time_end(cl_context context, const char * kernel_name, const char * build_opt, cl_command_queue cq);
+void initialize_env_var();
+
+
+#endif
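
The profiling support added in performance.c and performance.h above only does the bookkeeping; the hooks are meant to bracket kernel enqueues elsewhere in the runtime. The sketch below shows the intended call pattern. The wrapper function, its arguments and the single-dimension enqueue are illustrative assumptions; time_start/time_end, b_output_kernel_perf and the OCL_OUTPUT_KERNEL_PERF environment variable come from the code in this patch:

    #include <CL/cl.h>
    #include "performance.h"

    /* hypothetical call site inside the runtime's enqueue path */
    static void enqueue_and_profile(cl_context ctx, cl_command_queue cq,
                                    cl_kernel kernel, const char *kernel_name,
                                    const char *build_opt, size_t gws)
    {
      if (b_output_kernel_perf)                    /* set from OCL_OUTPUT_KERNEL_PERF */
        time_start(ctx, kernel_name, cq);          /* takes the mutex, records gettimeofday() */

      clEnqueueNDRangeKernel(cq, kernel, 1, NULL, &gws, NULL, 0, NULL, NULL);

      if (b_output_kernel_perf)
        time_end(ctx, kernel_name, build_opt, cq); /* clFinish()es the queue, stores elapsed ms */
    }

Setting OCL_OUTPUT_KERNEL_PERF=1 prints the per-context summary from the atexit() handler; a value of 2 additionally prints the per-round detail, matching initialize_env_var() above.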
diff --git a/src/x11/dricommon.c b/src/x11/dricommon.c
new file mode 100644
index 0000000..bd4ac50
--- /dev/null
+++ b/src/x11/dricommon.c
@@ -0,0 +1,330 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ * Note: the code is taken from libva code base
+ */
+
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <X11/Xlibint.h>
+#include <X11/Xlib.h>
+#include "x11/va_dri2.h"
+#include "x11/va_dri2tokens.h"
+#include "x11/dricommon.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+
+#define LOCAL __attribute__ ((visibility ("internal")))
+
+LOCAL dri_drawable_t*
+dri_state_do_drawable_hash(dri_state_t *state, XID drawable)
+{
+  int index = drawable % DRAWABLE_HASH_SZ;
+  struct dri_drawable *dri_drawable = state->drawable_hash[index];
+
+  while (dri_drawable) {
+    if (dri_drawable->x_drawable == drawable)
+      return dri_drawable;
+    dri_drawable = dri_drawable->next;
+  }
+
+  dri_drawable = dri_state_create_drawable(state, drawable);
+  dri_drawable->x_drawable = drawable;
+  dri_drawable->next = state->drawable_hash[index];
+  state->drawable_hash[index] = dri_drawable;
+
+  return dri_drawable;
+}
+
+LOCAL void
+dri_state_free_drawable_hash(dri_state_t *state)
+{
+  int i;
+  struct dri_drawable *dri_drawable, *prev;
+
+  for (i = 0; i < DRAWABLE_HASH_SZ; i++) {
+    dri_drawable = state->drawable_hash[i];
+
+    while (dri_drawable) {
+      prev = dri_drawable;
+      dri_drawable = prev->next;
+      dri_state_destroy_drawable(state, prev);
+    }
+  }
+}
+
+LOCAL dri_drawable_t*
+dri_state_get_drawable(dri_state_t *state, XID drawable)
+{
+  return dri_state_do_drawable_hash(state, drawable);
+}
+
+LOCAL void
+dri_state_init_drawable_hash_table(dri_state_t *state)
+{
+  int i;
+  for(i=0; i < DRAWABLE_HASH_SZ; i++)
+    state->drawable_hash[i] = NULL;
+}
+
+LOCAL void
+dri_state_delete(dri_state_t *state)
+{
+  if (state == NULL)
+    return;
+  dri_state_close(state);
+  cl_free(state);
+}
+
+LOCAL dri_state_t*
+dri_state_new(void)
+{
+  dri_state_t *state = NULL;
+  TRY_ALLOC_NO_ERR (state, CALLOC(dri_state_t));
+  state->fd = -1;
+  state->driConnectedFlag = NONE;
+  dri_state_init_drawable_hash_table(state);
+
+exit:
+  return state;
+error:
+  dri_state_delete(state);
+  state = NULL;
+  goto exit;
+}
+
+#define __DRI_BUFFER_FRONT_LEFT         0
+#define __DRI_BUFFER_BACK_LEFT          1
+#define __DRI_BUFFER_FRONT_RIGHT        2
+#define __DRI_BUFFER_BACK_RIGHT         3
+#define __DRI_BUFFER_DEPTH              4
+#define __DRI_BUFFER_STENCIL            5
+#define __DRI_BUFFER_ACCUM              6
+#define __DRI_BUFFER_FAKE_FRONT_LEFT    7
+#define __DRI_BUFFER_FAKE_FRONT_RIGHT   8
+
+typedef struct dri2_drawable
+{
+  struct dri_drawable base;
+  union dri_buffer buffers[5];
+  int width;
+  int height;
+  int has_backbuffer;
+  int back_index;
+  int front_index;
+} dri2_drawable_t;
+
+LOCAL dri_drawable_t*
+dri_state_create_drawable(dri_state_t *state, XID x_drawable)
+{
+  dri2_drawable_t *dri2_drwble;
+  dri2_drwble = (dri2_drawable_t*)calloc(1, sizeof(*dri2_drwble));
+
+  if (!dri2_drwble)
+    return NULL;
+
+  dri2_drwble->base.x_drawable = x_drawable;
+  dri2_drwble->base.x = 0;
+  dri2_drwble->base.y = 0;
+  VA_DRI2CreateDrawable(state->x11_dpy, x_drawable);
+
+  return &dri2_drwble->base;
+}
+
+LOCAL void
+dri_state_destroy_drawable(dri_state_t *state, dri_drawable_t *dri_drwble)
+{
+  VA_DRI2DestroyDrawable(state->x11_dpy, dri_drwble->x_drawable);
+  free(dri_drwble);
+}
+
+LOCAL void
+dri_state_swap_buffer(dri_state_t *state, dri_drawable_t *dri_drwble)
+{
+  dri2_drawable_t *dri2_drwble = (dri2_drawable_t*)dri_drwble;
+  XRectangle xrect;
+  XserverRegion region;
+
+  if (dri2_drwble->has_backbuffer) {
+    xrect.x = 0;
+    xrect.y = 0;
+    xrect.width = dri2_drwble->width;
+    xrect.height = dri2_drwble->height;
+
+    region = XFixesCreateRegion(state->x11_dpy, &xrect, 1);
+    VA_DRI2CopyRegion(state->x11_dpy, dri_drwble->x_drawable, region,
+        DRI2BufferFrontLeft, DRI2BufferBackLeft);
+    XFixesDestroyRegion(state->x11_dpy, region);
+  }
+}
+
+LOCAL union dri_buffer*
+dri_state_get_rendering_buffer(dri_state_t *state, dri_drawable_t *dri_drwble)
+{
+  dri2_drawable_t *dri2_drwble = (dri2_drawable_t *)dri_drwble;
+  int i;
+  int count;
+  unsigned int attachments[5];
+  VA_DRI2Buffer *buffers;
+
+  i = 0;
+  attachments[i++] = __DRI_BUFFER_BACK_LEFT;
+  attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
+  buffers = VA_DRI2GetBuffers(state->x11_dpy,
+                              dri_drwble->x_drawable,
+                              &dri2_drwble->width,
+                              &dri2_drwble->height,
+                              attachments,
+                              i,
+                              &count);
+  assert(buffers);
+  if (buffers == NULL)
+    return NULL;
+
+  dri2_drwble->has_backbuffer = 0;
+
+  for (i = 0; i < count; i++) {
+    dri2_drwble->buffers[i].dri2.attachment = buffers[i].attachment;
+    dri2_drwble->buffers[i].dri2.name = buffers[i].name;
+    dri2_drwble->buffers[i].dri2.pitch = buffers[i].pitch;
+    dri2_drwble->buffers[i].dri2.cpp = buffers[i].cpp;
+    dri2_drwble->buffers[i].dri2.flags = buffers[i].flags;
+
+    if (buffers[i].attachment == __DRI_BUFFER_BACK_LEFT) {
+      dri2_drwble->has_backbuffer = 1;
+      dri2_drwble->back_index = i;
+    }
+
+    if (buffers[i].attachment == __DRI_BUFFER_FRONT_LEFT)
+      dri2_drwble->front_index = i;
+  }
+
+  dri_drwble->width = dri2_drwble->width;
+  dri_drwble->height = dri2_drwble->height;
+  Xfree(buffers);
+
+  if (dri2_drwble->has_backbuffer)
+    return &dri2_drwble->buffers[dri2_drwble->back_index];
+
+  return &dri2_drwble->buffers[dri2_drwble->front_index];
+}
+
+LOCAL void
+dri_state_close(dri_state_t *state) {
+  dri_state_free_drawable_hash(state);
+  assert(state->fd >= 0);
+  close(state->fd);
+}
+
+LOCAL void
+dri_state_release(dri_state_t *state) {
+  dri_state_delete(state);
+}
+
+LOCAL dri_state_t*
+getDRI2State(Display* dpy, int screen, char **driver_name)
+{
+  int major, minor;
+  int error_base;
+  int event_base;
+  char *device_name = NULL;
+  drm_magic_t magic;
+  char * internal_driver_name = NULL;
+  int fd = -1;
+  dri_state_t* state = NULL;
+
+  if (!VA_DRI2QueryExtension(dpy, &event_base, &error_base))
+    goto err_out;
+
+  if (!VA_DRI2QueryVersion(dpy, &major, &minor))
+    goto err_out;
+
+
+  if (!VA_DRI2Connect(dpy, RootWindow(dpy, screen),
+        &internal_driver_name, &device_name))
+    goto err_out;
+
+  fd = open(device_name, O_RDWR);
+  assert(fd >= 0);
+
+  if (fd < 0)
+    goto err_out;
+
+  if (drmGetMagic(fd, &magic))
+    goto err_out;
+
+  if (!VA_DRI2Authenticate(dpy, RootWindow(dpy, screen),
+        magic))
+    goto err_out;
+
+  if(driver_name)
+    *driver_name = internal_driver_name;
+  else
+    Xfree(internal_driver_name);
+
+  state = dri_state_new();
+  state->fd = fd;
+  state->x11_dpy = dpy;
+  state->x11_screen = screen;
+  state->driConnectedFlag = DRI2;
+  if (device_name)
+    Xfree(device_name);
+  return state;
+
+err_out:
+  if (device_name)
+    Xfree(device_name);
+
+  if (internal_driver_name)
+    Xfree(internal_driver_name);
+
+  if(driver_name) *driver_name = NULL;
+
+  if (fd >= 0)
+    close(fd);
+
+  if (driver_name)
+    *driver_name = NULL;
+
+  return state;
+}
+
diff --git a/src/x11/dricommon.h b/src/x11/dricommon.h
new file mode 100644
index 0000000..5a950b4
--- /dev/null
+++ b/src/x11/dricommon.h
@@ -0,0 +1,99 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ * Note: the code is taken from libva code base
+ */
+
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _VA_DRICOMMON_H_
+#define _VA_DRICOMMON_H_
+
+#include <X11/Xlib.h>
+#include <xf86drm.h>
+#include <drm.h>
+#include <drm_sarea.h>
+
+union dri_buffer 
+{
+  struct {
+    unsigned int attachment;
+    unsigned int name;
+    unsigned int pitch;
+    unsigned int cpp;
+    unsigned int flags;
+  } dri2;
+};
+
+typedef struct dri_drawable 
+{
+  XID x_drawable;
+  int x;
+  int y;
+  unsigned int width;
+  unsigned int height;
+  struct dri_drawable *next;
+} dri_drawable_t;
+
+#define DRAWABLE_HASH_SZ 32
+
+enum DRI_VER
+{
+  NONE = 0,
+  // NOT supported VA_DRI1 = 1,
+  DRI2 = 2
+};
+
+typedef struct dri_state
+{
+  Display *x11_dpy;
+  int x11_screen;
+  int fd;
+  enum DRI_VER driConnectedFlag; /* 0: disconnected, 2: DRI2 */
+  dri_drawable_t *drawable_hash[DRAWABLE_HASH_SZ];
+} dri_state_t;
+
+dri_drawable_t *dri_state_create_drawable(dri_state_t*, XID x_drawable);
+void dri_state_destroy_drawable(dri_state_t*, dri_drawable_t*);
+void dri_state_close(dri_state_t*);
+void dri_state_release(dri_state_t*);
+
+// Create a dri2 state from dpy and screen
+dri_state_t *getDRI2State(Display* dpy, int screen, char **driver_name);
+
+#endif /* _VA_DRICOMMON_H_ */
+
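
The dricommon helpers above are self-contained; the rough sketch below shows how the CL runtime is expected to drive them. The probe function, its error handling and the printf reporting are assumptions for illustration (the symbols are built with internal visibility, so such code would sit inside the library, e.g. under src/x11/), while getDRI2State, dri_state_release and the dri_state_t fields come from this patch:

    #include <stdio.h>
    #include <X11/Xlib.h>
    #include "x11/dricommon.h"

    static int probe_dri2(Display *dpy)
    {
      char *driver_name = NULL;
      dri_state_t *state = getDRI2State(dpy, DefaultScreen(dpy), &driver_name);

      if (!state || state->driConnectedFlag != DRI2) {
        fprintf(stderr, "DRI2 connection failed\n");
        return -1;
      }
      printf("DRI2 driver %s on fd %d\n", driver_name, state->fd);

      XFree(driver_name);        /* VA_DRI2Connect hands back an Xmalloc'd string */
      dri_state_release(state);  /* closes the DRM fd and frees the drawable hash */
      return 0;
    }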
diff --git a/src/x11/mesa_egl_extension.c b/src/x11/mesa_egl_extension.c
new file mode 100644
index 0000000..a7fc8cb
--- /dev/null
+++ b/src/x11/mesa_egl_extension.c
@@ -0,0 +1,307 @@
+#include <stdio.h>
+#include "mesa_egl_extension.h"
+#include "mesa_egl_res_share.h"
+#include "src/cl_driver.h"
+
+struct _egl_display;
+struct _egl_resource;
+struct _egl_thread_info;
+struct _egl_config;
+struct _egl_surface;
+struct _egl_driver;
+
+typedef struct _egl_display _EGLDisplay;
+typedef struct _egl_resource _EGLResource;
+typedef struct _egl_thread_info _EGLThreadInfo;
+typedef struct _egl_config _EGLConfig;
+typedef struct _egl_surface _EGLSurface;
+typedef struct _egl_driver _EGLDriver;
+
+/**
+ * A resource of a display.
+ */
+struct _egl_resource
+{
+   /* which display the resource belongs to */
+   _EGLDisplay *Display;
+   EGLBoolean IsLinked;
+   EGLint RefCount;
+
+   /* used to link resources of the same type */
+   _EGLResource *Next;
+};
+
+/**
+ * "Base" class for device driver contexts.
+ */
+struct _egl_context
+{
+   /* A context is a display resource */
+   _EGLResource Resource;
+
+   /* The bound status of the context */
+   _EGLThreadInfo *Binding;
+   _EGLSurface *DrawSurface;
+   _EGLSurface *ReadSurface;
+
+   _EGLConfig *Config;
+
+   EGLint ClientAPI; /**< EGL_OPENGL_ES_API, EGL_OPENGL_API, EGL_OPENVG_API */
+   EGLint ClientMajorVersion;
+   EGLint ClientMinorVersion;
+   EGLint Flags;
+   EGLint Profile;
+   EGLint ResetNotificationStrategy;
+
+   /* The real render buffer when a window surface is bound */
+   EGLint WindowRenderBuffer;
+};
+
+typedef struct _egl_context _EGLContext;
+
+struct dri2_egl_display
+{
+   int                       dri2_major;
+   int                       dri2_minor;
+   __DRIscreen              *dri_screen;
+   int                       own_dri_screen;
+   const __DRIconfig       **driver_configs;
+   void                     *driver;
+};
+
+enum _egl_platform_type {
+   _EGL_PLATFORM_WINDOWS,
+   _EGL_PLATFORM_X11,
+   _EGL_PLATFORM_WAYLAND,
+   _EGL_PLATFORM_DRM,
+   _EGL_PLATFORM_FBDEV,
+   _EGL_PLATFORM_NULL,
+   _EGL_PLATFORM_ANDROID,
+
+   _EGL_NUM_PLATFORMS,
+   _EGL_INVALID_PLATFORM = -1
+};
+typedef enum _egl_platform_type _EGLPlatformType;
+
+typedef pthread_mutex_t _EGLMutex;
+
+struct _egl_display
+{
+   /* used to link displays */
+   _EGLDisplay *Next;
+
+   _EGLMutex Mutex;
+
+   _EGLPlatformType Platform; /**< The type of the platform display */
+   void *PlatformDisplay;     /**< A pointer to the platform display */
+
+   _EGLDriver *Driver;        /**< Matched driver of the display */
+   EGLBoolean Initialized;    /**< True if the display is initialized */
+
+   /* options that affect how the driver initializes the display */
+   struct {
+      EGLBoolean TestOnly;    /**< Driver should not set fields when true */
+      EGLBoolean UseFallback; /**< Use fallback driver (sw or less features) */
+   } Options;
+
+   /* these fields are set by the driver during init */
+   void *DriverData;          /**< Driver private data */
+};
+
+static struct dri2_egl_display *
+dri2_egl_display(_EGLDisplay *dpy)
+{
+  return (struct dri2_egl_display *)dpy->DriverData;
+}
+
+static _EGLDisplay *
+_eglLockDisplay(EGLDisplay dpy)
+{
+  return (_EGLDisplay *)dpy;
+}
+
+static _EGLContext *
+_eglLookupContext(EGLContext ctx, EGLDisplay disp)
+{
+  disp = disp;
+  return (_EGLContext *) ctx;
+}
+
+struct dri2_egl_context
+{
+   _EGLContext   base;
+   __DRIcontext *dri_context;
+};
+
+static struct dri2_egl_context *
+dri2_egl_context(_EGLContext *ctx)
+{
+  return (struct dri2_egl_context *)ctx;
+}
+
+static EGLBoolean
+dri2_acquire_texture(_EGLDisplay *disp,
+                     _EGLContext *ctx,
+                     const EGLint *attr_list,
+                     void *user_data)
+{
+   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   GLuint texture = 0;
+   GLenum gl_target = 0;
+   GLint level = 0;
+   GLboolean ret;
+
+   if (_eglParseTextureAttribList(&texture, &gl_target, &level, attr_list) != EGL_SUCCESS)
+      return EGL_FALSE;
+
+   ret = cl_gl_acquire_texture(dri2_dpy->driver,
+                               dri2_ctx->dri_context,
+                               gl_target, level, texture,
+                               user_data);
+   return ret;
+}
+
+static EGLBoolean
+dri2_release_texture(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list)
+{
+   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   GLuint texture = 0;
+   GLenum gl_target = 0;
+   GLint level = 0;
+   GLboolean ret;
+
+   if (_eglParseTextureAttribList(&texture, &gl_target, &level, attr_list) != EGL_SUCCESS)
+      return EGL_FALSE;
+
+   ret = cl_gl_release_texture(dri2_dpy->driver, dri2_ctx->dri_context,
+                               gl_target, level, texture);
+   return ret;
+}
+
+static EGLBoolean
+dri2_acquire_buffer_object(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list,
+                           void *user_data)
+{
+   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   GLuint bufobj = 0;
+   GLboolean ret;
+
+   if (_eglParseBufferObjAttribList(&bufobj, attr_list) != EGL_SUCCESS)
+      return EGL_FALSE;
+
+   ret = cl_gl_acquire_buffer_object(dri2_dpy->driver,
+                                     dri2_ctx->dri_context,
+                                     bufobj, user_data);
+   return ret;
+}
+
+static EGLBoolean
+dri2_release_buffer_object(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list)
+{
+   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   GLuint bufobj = 0;
+   GLboolean ret;
+
+   if (_eglParseBufferObjAttribList(&bufobj, attr_list) != EGL_SUCCESS)
+      return EGL_FALSE;
+
+   ret = cl_gl_release_buffer_object(dri2_dpy->driver,
+                                     dri2_ctx->dri_context,
+                                     bufobj);
+   return ret;
+}
+
+static EGLBoolean
+dri2_acquire_render_buffer(_EGLDisplay *disp,
+                           _EGLContext *ctx,
+                           const EGLint *attr_list,
+                           void *user_data)
+{
+   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   GLuint rb = 0;
+   GLboolean ret;
+
+   if (_eglParseRenderBufferAttribList(&rb, attr_list) != EGL_SUCCESS)
+      return EGL_FALSE;
+
+   ret = cl_gl_acquire_render_buffer(dri2_dpy->driver,
+                                     dri2_ctx->dri_context,
+                                     rb, user_data);
+   return ret;
+}
+
+static EGLBoolean
+dri2_release_render_buffer(_EGLDisplay *disp, _EGLContext *ctx, const EGLint *attr_list)
+{
+   struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   GLuint rb = 0;
+   GLboolean ret;
+
+   if (_eglParseRenderBufferAttribList(&rb, attr_list) != EGL_SUCCESS)
+      return EGL_FALSE;
+
+   ret = cl_gl_release_render_buffer(dri2_dpy->driver,
+                                     dri2_ctx->dri_context,
+                                     rb);
+   return ret;
+}
+
+static EGLBoolean
+dri2_acquire_resource_mesa(_EGLDisplay *disp, _EGLContext *ctx, const EGLenum target,
+                           const EGLint *attrib_list, void *user_data)
+{
+   switch (target) {
+   case EGL_GL_TEXTURE_MESA:
+     return dri2_acquire_texture(disp, ctx, attrib_list, user_data);
+   case EGL_GL_BUFFER_OBJECT_MESA:
+     return dri2_acquire_buffer_object(disp, ctx, attrib_list, user_data);
+   case EGL_GL_RENDER_BUFFER_MESA:
+     return dri2_acquire_render_buffer(disp, ctx, attrib_list, user_data);
+   default:
+      fprintf(stderr, "bad resource target value 0x%04x",
+              target);
+   }
+   return EGL_FALSE;
+}
+
+static EGLBoolean
+dri2_release_resource_mesa(_EGLDisplay *disp, _EGLContext *ctx, const EGLenum target,
+                           const EGLint *attrib_list)
+{
+   switch (target) {
+   case EGL_GL_TEXTURE_MESA:
+     return dri2_release_texture(disp, ctx, attrib_list);
+   case EGL_GL_BUFFER_OBJECT_MESA:
+     return dri2_release_buffer_object(disp, ctx, attrib_list);
+   case EGL_GL_RENDER_BUFFER_MESA:
+     return dri2_release_render_buffer(disp, ctx, attrib_list);
+   default:
+      fprintf(stderr, "bad resource target value 0x%04x",
+              target);
+   }
+   return EGL_FALSE;
+}
+
+EGLBoolean
+eglAcquireResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list, void *user)
+{
+   _EGLDisplay *disp = _eglLockDisplay(dpy);
+   _EGLContext *context = _eglLookupContext(ctx, disp);
+
+   return dri2_acquire_resource_mesa(disp, context, target, attrib_list, user);
+}
+
+EGLBoolean
+eglReleaseResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list)
+{
+   _EGLDisplay *disp = _eglLockDisplay(dpy);
+   _EGLContext *context = _eglLookupContext(ctx, disp);
+
+   return dri2_release_resource_mesa(disp, context, target, attrib_list);
+}
diff --git a/src/x11/mesa_egl_extension.h b/src/x11/mesa_egl_extension.h
new file mode 100644
index 0000000..39ea134
--- /dev/null
+++ b/src/x11/mesa_egl_extension.h
@@ -0,0 +1,20 @@
+#ifndef __MESA_EGL_EXTENSION_H__
+#define __MESA_EGL_EXTENSION_H__
+
+#include <EGL/egl.h>
+#include <GL/gl.h>
+#include <GL/internal/dri_interface.h>
+
+#define EGL_GL_TEXTURE_MESA             0x3300  /* eglAcquireResource target */
+#define EGL_GL_BUFFER_OBJECT_MESA       0x3301  /* eglAcquireResource target */
+#define EGL_GL_RENDER_BUFFER_MESA       0x3302  /* eglAcquireResource target */
+#define EGL_GL_TEXTURE_ID_MESA          0x3303  /* eglAcquireResource attribute */
+#define EGL_GL_TEXTURE_LEVEL_MESA       0x3304  /* eglAcquireResource attribute */
+#define EGL_GL_TEXTURE_TARGET_MESA      0x3305  /* eglAcquireResource attribute */
+#define EGL_GL_BUFFER_OBJECT_ID_MESA    0x3306  /* eglAcquireResource attribute */
+#define EGL_GL_RENDER_BUFFER_ID_MESA    0x3307  /* eglAcquireResource attribute */
+
+EGLBoolean eglAcquireResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list, void * user_data);
+EGLBoolean eglReleaseResourceMESA(EGLDisplay dpy, EGLContext ctx, EGLenum target, const EGLint *attrib_list);
+
+#endif
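
The tokens above pair with the attribute-list parsers in mesa_egl_res_share.c below, which walk (attribute, value) pairs until EGL_NONE. The minimal sketch that follows assumes a hypothetical 2D texture share; the helper name, GL_TEXTURE_2D and the user_data payload are placeholders, while the entry points and MESA tokens are the ones defined in this patch:

    #include <EGL/egl.h>
    #include <GL/gl.h>
    #include "mesa_egl_extension.h"

    static EGLBoolean share_texture_with_cl(EGLDisplay dpy, EGLContext ctx,
                                            GLuint tex, void *user_data)
    {
      const EGLint attribs[] = {
        EGL_GL_TEXTURE_ID_MESA,     (EGLint) tex,
        EGL_GL_TEXTURE_TARGET_MESA, GL_TEXTURE_2D,
        EGL_GL_TEXTURE_LEVEL_MESA,  0,
        EGL_NONE                    /* terminator expected by the parsers */
      };

      if (!eglAcquireResourceMESA(dpy, ctx, EGL_GL_TEXTURE_MESA, attribs, user_data))
        return EGL_FALSE;

      /* ... the CL side maps and uses the texture here ... */

      return eglReleaseResourceMESA(dpy, ctx, EGL_GL_TEXTURE_MESA, attribs);
    }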
diff --git a/src/x11/mesa_egl_res_share.c b/src/x11/mesa_egl_res_share.c
new file mode 100644
index 0000000..93e9454
--- /dev/null
+++ b/src/x11/mesa_egl_res_share.c
@@ -0,0 +1,135 @@
+/**************************************************************************
+ *
+ * Copyright 2013-2014 Zhigang Gong <zhigang.gong at linux.intel.com>
+ * Copyright 2013-2014 Intel, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include <assert.h>
+#include <string.h>
+
+#include "mesa_egl_extension.h"
+#include "mesa_egl_res_share.h"
+
+/**
+ * Parse the list of share texture attributes and return the proper error code.
+ */
+EGLint
+_eglParseTextureAttribList(unsigned int *texture, EGLenum *gl_target, EGLint *level,
+                           const EGLint *attrib_list)
+{
+   EGLint i, err = EGL_SUCCESS;
+
+   *texture = 0;
+   *gl_target = 0;
+   *level = 0;
+
+   if (!attrib_list)
+      return EGL_BAD_ATTRIBUTE;
+
+   for (i = 0; attrib_list[i] != EGL_NONE; i++) {
+      EGLint attr = attrib_list[i++];
+      EGLint val = attrib_list[i];
+
+      switch (attr) {
+      case EGL_GL_TEXTURE_LEVEL_MESA:
+         *level = val;
+         break;
+      case EGL_GL_TEXTURE_ID_MESA:
+         *texture = val;
+         break;
+      case EGL_GL_TEXTURE_TARGET_MESA:
+         *gl_target = val;
+         break;
+      default:
+         /* unknown attrs are ignored */
+         break;
+      }
+   }
+
+   return err;
+}
+
+/**
+ * Parse the list of share buffer object attributes and return the proper error code.
+ */
+EGLint
+_eglParseBufferObjAttribList(unsigned int *bufobj, const EGLint *attrib_list)
+{
+   EGLint i, err = EGL_SUCCESS;
+   *bufobj = 0;
+
+   if (!attrib_list)
+      return EGL_BAD_ATTRIBUTE;
+
+   for (i = 0; attrib_list[i] != EGL_NONE; i++) {
+      EGLint attr = attrib_list[i++];
+      EGLint val = attrib_list[i];
+
+      switch (attr) {
+      case EGL_GL_BUFFER_OBJECT_ID_MESA:
+         *bufobj = val;
+         break;
+      default:
+         /* unknown attrs are ignored */
+         break;
+      }
+   }
+   if (*bufobj == 0)
+      err = EGL_BAD_ATTRIBUTE;
+
+   return err;
+}
+
+/**
+ * Parse the list of share render buffer attributes and return the proper error code.
+ */
+EGLint
+_eglParseRenderBufferAttribList(unsigned int *rb, const EGLint *attrib_list)
+{
+   EGLint i, err = EGL_SUCCESS;
+   *rb = 0;
+
+   if (!attrib_list)
+      return EGL_BAD_ATTRIBUTE;
+
+   for (i = 0; attrib_list[i] != EGL_NONE; i++) {
+      EGLint attr = attrib_list[i++];
+      EGLint val = attrib_list[i];
+
+      switch (attr) {
+      case EGL_GL_RENDER_BUFFER_ID_MESA:
+         *rb = val;
+         break;
+      default:
+         /* unknown attrs are ignored */
+         break;
+      }
+   }
+   if (*rb == 0)
+      err = EGL_BAD_ATTRIBUTE;
+
+   return err;
+}
diff --git a/src/x11/mesa_egl_res_share.h b/src/x11/mesa_egl_res_share.h
new file mode 100644
index 0000000..43e746e
--- /dev/null
+++ b/src/x11/mesa_egl_res_share.h
@@ -0,0 +1,44 @@
+/**************************************************************************
+ *
+ * Copyright 2013-2014 Zhigang Gong <zhigang.gong at linux.intel.com>
+ * Copyright 2013-2014 Intel, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef EGLRESSHARE_INCLUDED
+#define EGLRESSHARE_INCLUDED
+
+#include <EGL/egl.h>
+
+EGLint
+_eglParseTextureAttribList(unsigned int *texture, EGLenum *gl_target,
+                           EGLint *level, const EGLint *attrib_list);
+EGLint
+_eglParseBufferObjAttribList(unsigned int *bufobj,
+                             const EGLint *attrib_list);
+
+EGLint
+_eglParseRenderBufferAttribList(unsigned int *rb, const EGLint *attrib_list);
+#endif
diff --git a/src/x11/va_dri2.c b/src/x11/va_dri2.c
new file mode 100644
index 0000000..5225acd
--- /dev/null
+++ b/src/x11/va_dri2.c
@@ -0,0 +1,327 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ * Copyright © 2008 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Soft-
+ * ware"), to deal in the Software without restriction, including without
+ * limitation the rights to use, copy, modify, merge, publish, distribute,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, provided that the above copyright
+ * notice(s) and this permission notice appear in all copies of the Soft-
+ * ware and that both the above copyright notice(s) and this permission
+ * notice appear in supporting documentation.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
+ * ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY
+ * RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
+ * THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSE-
+ * QUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFOR-
+ * MANCE OF THIS SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or
+ * other dealings in this Software without prior written authorization of
+ * the copyright holder.
+ *
+ * Authors:
+ *   Kristian Høgsberg (krh at redhat.com)
+ */
+
+#define NEED_REPLIES
+#include <X11/Xlibint.h>
+#include <X11/extensions/Xext.h>
+#include <X11/extensions/extutil.h>
+#include "xf86drm.h"
+#include "x11/va_dri2.h"
+#include "x11/va_dri2str.h"
+#include "x11/va_dri2tokens.h"
+
+#ifndef DRI2DriverDRI
+#define DRI2DriverDRI 0
+#endif
+
+#define LOCAL __attribute__ ((visibility ("internal")))
+
+static char va_dri2ExtensionName[] = DRI2_NAME;
+static XExtensionInfo _va_dri2_info_data;
+static XExtensionInfo *va_dri2Info = &_va_dri2_info_data;
+static XEXT_GENERATE_CLOSE_DISPLAY (VA_DRI2CloseDisplay, va_dri2Info)
+static /* const */ XExtensionHooks va_dri2ExtensionHooks = {
+    NULL,				/* create_gc */
+    NULL,				/* copy_gc */
+    NULL,				/* flush_gc */
+    NULL,				/* free_gc */
+    NULL,				/* create_font */
+    NULL,				/* free_font */
+    VA_DRI2CloseDisplay,		/* close_display */
+    NULL,				/* wire_to_event */
+    NULL,				/* event_to_wire */
+    NULL,				/* error */
+    NULL,				/* error_string */
+};
+
+static XEXT_GENERATE_FIND_DISPLAY (DRI2FindDisplay, va_dri2Info, 
+				   va_dri2ExtensionName, 
+				   &va_dri2ExtensionHooks, 
+				   0, NULL)
+
+LOCAL Bool VA_DRI2QueryExtension(Display *dpy, int *eventBase, int *errorBase)
+{
+    XExtDisplayInfo *info = DRI2FindDisplay(dpy);
+
+    if (XextHasExtension(info)) {
+	*eventBase = info->codes->first_event;
+	*errorBase = info->codes->first_error;
+	return True;
+    }
+
+    return False;
+}
+
+LOCAL Bool VA_DRI2QueryVersion(Display *dpy, int *major, int *minor)
+{
+    XExtDisplayInfo *info = DRI2FindDisplay (dpy);
+    xDRI2QueryVersionReply rep;
+    xDRI2QueryVersionReq *req;
+
+    XextCheckExtension (dpy, info, va_dri2ExtensionName, False);
+
+    LockDisplay(dpy);
+    GetReq(DRI2QueryVersion, req);
+    req->reqType = info->codes->major_opcode;
+    req->dri2Reqtype = X_DRI2QueryVersion;
+    req->majorVersion = DRI2_MAJOR;
+    req->minorVersion = DRI2_MINOR;
+    if (!_XReply(dpy, (xReply *)&rep, 0, xFalse)) {
+	UnlockDisplay(dpy);
+	SyncHandle();
+	return False;
+    }
+    *major = rep.majorVersion;
+    *minor = rep.minorVersion;
+    UnlockDisplay(dpy);
+    SyncHandle();
+
+    return True;
+}
+
+LOCAL Bool VA_DRI2Connect(Display *dpy, XID window,
+		 char **driverName, char **deviceName)
+{
+    XExtDisplayInfo *info = DRI2FindDisplay(dpy);
+    xDRI2ConnectReply rep;
+    xDRI2ConnectReq *req;
+
+    XextCheckExtension (dpy, info, va_dri2ExtensionName, False);
+
+    LockDisplay(dpy);
+    GetReq(DRI2Connect, req);
+    req->reqType = info->codes->major_opcode;
+    req->dri2Reqtype = X_DRI2Connect;
+    req->window = window;
+    req->drivertype = DRI2DriverDRI;
+    if (!_XReply(dpy, (xReply *)&rep, 0, xFalse)) {
+	UnlockDisplay(dpy);
+	SyncHandle();
+	return False;
+    }
+
+    if (rep.driverNameLength == 0 && rep.deviceNameLength == 0) {
+	UnlockDisplay(dpy);
+	SyncHandle();
+	return False;
+    }
+
+    *driverName = Xmalloc(rep.driverNameLength + 1);
+    if (*driverName == NULL) {
+	_XEatData(dpy, 
+		  ((rep.driverNameLength + 3) & ~3) +
+		  ((rep.deviceNameLength + 3) & ~3));
+	UnlockDisplay(dpy);
+	SyncHandle();
+	return False;
+    }
+    _XReadPad(dpy, *driverName, rep.driverNameLength);
+    (*driverName)[rep.driverNameLength] = '\0';
+
+    *deviceName = Xmalloc(rep.deviceNameLength + 1);
+    if (*deviceName == NULL) {
+	Xfree(*driverName);
+	_XEatData(dpy, ((rep.deviceNameLength + 3) & ~3));
+	UnlockDisplay(dpy);
+	SyncHandle();
+	return False;
+    }
+    _XReadPad(dpy, *deviceName, rep.deviceNameLength);
+    (*deviceName)[rep.deviceNameLength] = '\0';
+
+    UnlockDisplay(dpy);
+    SyncHandle();
+
+    return True;
+}
+
+LOCAL Bool VA_DRI2Authenticate(Display *dpy, XID window, drm_magic_t magic)
+{
+    XExtDisplayInfo *info = DRI2FindDisplay(dpy);
+    xDRI2AuthenticateReq *req;
+    xDRI2AuthenticateReply rep;
+
+    XextCheckExtension (dpy, info, va_dri2ExtensionName, False);
+
+    LockDisplay(dpy);
+    GetReq(DRI2Authenticate, req);
+    req->reqType = info->codes->major_opcode;
+    req->dri2Reqtype = X_DRI2Authenticate;
+    req->window = window;
+    req->magic = magic;
+
+    if (!_XReply(dpy, (xReply *)&rep, 0, xFalse)) {
+	UnlockDisplay(dpy);
+	SyncHandle();
+	return False;
+    }
+
+    UnlockDisplay(dpy);
+    SyncHandle();
+
+    return rep.authenticated;
+}
+
+LOCAL void VA_DRI2CreateDrawable(Display *dpy, XID drawable)
+{
+    XExtDisplayInfo *info = DRI2FindDisplay(dpy);
+    xDRI2CreateDrawableReq *req;
+
+    XextSimpleCheckExtension (dpy, info, va_dri2ExtensionName);
+
+    LockDisplay(dpy);
+    GetReq(DRI2CreateDrawable, req);
+    req->reqType = info->codes->major_opcode;
+    req->dri2Reqtype = X_DRI2CreateDrawable;
+    req->drawable = drawable;
+    UnlockDisplay(dpy);
+    SyncHandle();
+}
+
+LOCAL void VA_DRI2DestroyDrawable(Display *dpy, XID drawable)
+{
+    XExtDisplayInfo *info = DRI2FindDisplay(dpy);
+    xDRI2DestroyDrawableReq *req;
+
+    XextSimpleCheckExtension (dpy, info, va_dri2ExtensionName);
+
+    XSync(dpy, False);
+
+    LockDisplay(dpy);
+    GetReq(DRI2DestroyDrawable, req);
+    req->reqType = info->codes->major_opcode;
+    req->dri2Reqtype = X_DRI2DestroyDrawable;
+    req->drawable = drawable;
+    UnlockDisplay(dpy);
+    SyncHandle();
+}
+
+LOCAL VA_DRI2Buffer *VA_DRI2GetBuffers(Display *dpy, XID drawable,
+			   int *width, int *height,
+			   unsigned int *attachments, int count,
+			   int *outcount)
+{
+    XExtDisplayInfo *info = DRI2FindDisplay(dpy);
+    xDRI2GetBuffersReply rep;
+    xDRI2GetBuffersReq *req;
+    VA_DRI2Buffer *buffers;
+    xDRI2Buffer repBuffer;
+    CARD32 *p;
+    int i;
+
+    XextCheckExtension (dpy, info, va_dri2ExtensionName, False);
+
+    LockDisplay(dpy);
+    GetReqExtra(DRI2GetBuffers, count * 4, req);
+    req->reqType = info->codes->major_opcode;
+    req->dri2Reqtype = X_DRI2GetBuffers;
+    req->drawable = drawable;
+    req->count = count;
+    p = (CARD32 *) &req[1];
+    for (i = 0; i < count; i++)
+	p[i] = attachments[i];
+
+    if (!_XReply(dpy, (xReply *)&rep, 0, xFalse)) {
+	UnlockDisplay(dpy);
+	SyncHandle();
+	return NULL;
+    }
+
+    *width = rep.width;
+    *height = rep.height;
+    *outcount = rep.count;
+
+    buffers = Xmalloc(rep.count * sizeof buffers[0]);
+    if (buffers == NULL) {
+	_XEatData(dpy, rep.count * sizeof repBuffer);
+	UnlockDisplay(dpy);
+	SyncHandle();
+	return NULL;
+    }
+
+    for (i = 0; i < (int) rep.count; i++) {
+	_XReadPad(dpy, (char *) &repBuffer, sizeof repBuffer);
+	buffers[i].attachment = repBuffer.attachment;
+	buffers[i].name = repBuffer.name;
+	buffers[i].pitch = repBuffer.pitch;
+	buffers[i].cpp = repBuffer.cpp;
+	buffers[i].flags = repBuffer.flags;
+    }
+
+    UnlockDisplay(dpy);
+    SyncHandle();
+
+    return buffers;
+}
+
+LOCAL void VA_DRI2CopyRegion(Display *dpy, XID drawable, XserverRegion region,
+		    CARD32 dest, CARD32 src)
+{
+    XExtDisplayInfo *info = DRI2FindDisplay(dpy);
+    xDRI2CopyRegionReq *req;
+    xDRI2CopyRegionReply rep;
+
+    XextSimpleCheckExtension (dpy, info, va_dri2ExtensionName);
+
+    LockDisplay(dpy);
+    GetReq(DRI2CopyRegion, req);
+    req->reqType = info->codes->major_opcode;
+    req->dri2Reqtype = X_DRI2CopyRegion;
+    req->drawable = drawable;
+    req->region = region;
+    req->dest = dest;
+    req->src = src;
+
+    _XReply(dpy, (xReply *)&rep, 0, xFalse);
+
+    UnlockDisplay(dpy);
+    SyncHandle();
+}
diff --git a/src/x11/va_dri2.h b/src/x11/va_dri2.h
new file mode 100644
index 0000000..1a1f96e
--- /dev/null
+++ b/src/x11/va_dri2.h
@@ -0,0 +1,89 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ * Copyright © 2007,2008 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Soft-
+ * ware"), to deal in the Software without restriction, including without
+ * limitation the rights to use, copy, modify, merge, publish, distribute,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, provided that the above copyright
+ * notice(s) and this permission notice appear in all copies of the Soft-
+ * ware and that both the above copyright notice(s) and this permission
+ * notice appear in supporting documentation.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
+ * ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY
+ * RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
+ * THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSE-
+ * QUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFOR-
+ * MANCE OF THIS SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or
+ * other dealings in this Software without prior written authorization of
+ * the copyright holder.
+ *
+ * Authors:
+ *   Kristian Høgsberg (krh at redhat.com)
+ */
+#ifndef _VA_DRI2_H_
+#define _VA_DRI2_H_
+
+#include <X11/extensions/Xfixes.h>
+#include <X11/Xfuncproto.h>
+#include <xf86drm.h>
+
+typedef struct {
+    unsigned int attachment;
+    unsigned int name;
+    unsigned int pitch;
+    unsigned int cpp;
+    unsigned int flags;
+} VA_DRI2Buffer;
+
+extern Bool
+VA_DRI2QueryExtension(Display *display, int *eventBase, int *errorBase);
+extern Bool
+VA_DRI2QueryVersion(Display *display, int *major, int *minor);
+extern Bool
+VA_DRI2Connect(Display *display, XID window,
+	    char **driverName, char **deviceName);
+extern Bool
+VA_DRI2Authenticate(Display *display, XID window, drm_magic_t magic);
+extern void
+VA_DRI2CreateDrawable(Display *display, XID drawable);
+extern void
+VA_DRI2DestroyDrawable(Display *display, XID handle);
+extern VA_DRI2Buffer *
+VA_DRI2GetBuffers(Display *dpy, XID drawable,
+	       int *width, int *height,
+	       unsigned int *attachments, int count,
+	       int *outcount);
+#if 1
+extern void
+VA_DRI2CopyRegion(Display *dpy, XID drawable, XserverRegion region,
+	       CARD32 dest, CARD32 src);
+#endif
+#endif
diff --git a/src/x11/va_dri2str.h b/src/x11/va_dri2str.h
new file mode 100644
index 0000000..db10e16
--- /dev/null
+++ b/src/x11/va_dri2str.h
@@ -0,0 +1,211 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ * Copyright © 2008 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Soft-
+ * ware"), to deal in the Software without restriction, including without
+ * limitation the rights to use, copy, modify, merge, publish, distribute,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, provided that the above copyright
+ * notice(s) and this permission notice appear in all copies of the Soft-
+ * ware and that both the above copyright notice(s) and this permission
+ * notice appear in supporting documentation.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
+ * ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY
+ * RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
+ * THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSE-
+ * QUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFOR-
+ * MANCE OF THIS SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or
+ * other dealings in this Software without prior written authorization of
+ * the copyright holder.
+ *
+ * Authors:
+ *   Kristian Høgsberg (krh at redhat.com)
+ */
+#ifndef _DRI2_PROTO_H_
+#define _DRI2_PROTO_H_
+
+#define DRI2_NAME			"DRI2"
+#define DRI2_MAJOR			1
+#define DRI2_MINOR			0
+
+#define DRI2NumberErrors		0
+#define DRI2NumberEvents		0
+#define DRI2NumberRequests		7
+
+#define X_DRI2QueryVersion		0
+#define X_DRI2Connect			1
+#define X_DRI2Authenticate		2
+#define X_DRI2CreateDrawable		3
+#define X_DRI2DestroyDrawable		4
+#define X_DRI2GetBuffers		5
+#define X_DRI2CopyRegion		6
+
+typedef struct {
+    CARD32  attachment B32;
+    CARD32  name B32;
+    CARD32  pitch B32;
+    CARD32  cpp B32;
+    CARD32  flags B32;
+} xDRI2Buffer;
+
+typedef struct {
+    CARD8   reqType;
+    CARD8   dri2Reqtype;
+    CARD16  length B16;
+    CARD32  majorVersion B32;
+    CARD32  minorVersion B32;
+} xDRI2QueryVersionReq;
+#define sz_xDRI2QueryVersionReq   12
+
+typedef struct {
+    BYTE    type;   /* X_Reply */
+    BYTE    pad1;
+    CARD16  sequenceNumber B16;
+    CARD32  length B32;
+    CARD32  majorVersion B32;
+    CARD32  minorVersion B32;
+    CARD32  pad2 B32;
+    CARD32  pad3 B32;
+    CARD32  pad4 B32;
+    CARD32  pad5 B32;
+} xDRI2QueryVersionReply;
+#define sz_xDRI2QueryVersionReply	32
+
+typedef struct {
+    CARD8   reqType;
+    CARD8   dri2Reqtype;
+    CARD16  length B16;
+    CARD32  window B32;
+    CARD32  drivertype B32;
+} xDRI2ConnectReq;
+#define sz_xDRI2ConnectReq	12
+
+typedef struct {
+    BYTE    type;   /* X_Reply */
+    BYTE    pad1;
+    CARD16  sequenceNumber B16;
+    CARD32  length B32;
+    CARD32  driverNameLength B32;
+    CARD32  deviceNameLength B32;
+    CARD32  pad2 B32;
+    CARD32  pad3 B32;
+    CARD32  pad4 B32;
+    CARD32  pad5 B32;
+} xDRI2ConnectReply;
+#define sz_xDRI2ConnectReply	32
+
+typedef struct {
+    CARD8   reqType;
+    CARD8   dri2Reqtype;
+    CARD16  length B16;
+    CARD32  window B32;
+    CARD32  magic B32;
+} xDRI2AuthenticateReq;
+#define sz_xDRI2AuthenticateReq   12
+
+typedef struct {
+    BYTE    type;   /* X_Reply */
+    BYTE    pad1;
+    CARD16  sequenceNumber B16;
+    CARD32  length B32;
+    CARD32  authenticated B32;
+    CARD32  pad2 B32;
+    CARD32  pad3 B32;
+    CARD32  pad4 B32;
+    CARD32  pad5 B32;
+    CARD32  pad6 B32;
+} xDRI2AuthenticateReply;
+#define sz_xDRI2AuthenticateReply	32
+
+typedef struct {
+    CARD8   reqType;
+    CARD8   dri2Reqtype;
+    CARD16  length B16;
+    CARD32  drawable B32;
+} xDRI2CreateDrawableReq;
+#define sz_xDRI2CreateDrawableReq   8
+
+typedef struct {
+    CARD8   reqType;
+    CARD8   dri2Reqtype;
+    CARD16  length B16;
+    CARD32  drawable B32;
+} xDRI2DestroyDrawableReq;
+#define sz_xDRI2DestroyDrawableReq   8
+
+typedef struct {
+    CARD8   reqType;
+    CARD8   dri2Reqtype;
+    CARD16  length B16;
+    CARD32  drawable B32;
+    CARD32  count B32;
+} xDRI2GetBuffersReq;
+#define sz_xDRI2GetBuffersReq   12
+
+typedef struct {
+    BYTE    type;   /* X_Reply */
+    BYTE    pad1;
+    CARD16  sequenceNumber B16;
+    CARD32  length B32;
+    CARD32  width B32;
+    CARD32  height B32;
+    CARD32  count B32;
+    CARD32  pad2 B32;
+    CARD32  pad3 B32;
+    CARD32  pad4 B32;
+} xDRI2GetBuffersReply;
+#define sz_xDRI2GetBuffersReply	32
+
+typedef struct {
+    CARD8   reqType;
+    CARD8   dri2Reqtype;
+    CARD16  length B16;
+    CARD32  drawable B32;
+    CARD32  region B32;
+    CARD32  dest B32;
+    CARD32  src B32;
+} xDRI2CopyRegionReq;
+#define sz_xDRI2CopyRegionReq   20
+
+typedef struct {
+    BYTE    type;   /* X_Reply */
+    BYTE    pad1;
+    CARD16  sequenceNumber B16;
+    CARD32  length B32;
+    CARD32  pad2 B32;
+    CARD32  pad3 B32;
+    CARD32  pad4 B32;
+    CARD32  pad5 B32;
+    CARD32  pad6 B32;
+    CARD32  pad7 B32;
+} xDRI2CopyRegionReply;
+#define sz_xDRI2CopyRegionReply	32
+
+#endif
diff --git a/src/x11/va_dri2tokens.h b/src/x11/va_dri2tokens.h
new file mode 100644
index 0000000..d3c31f3
--- /dev/null
+++ b/src/x11/va_dri2tokens.h
@@ -0,0 +1,66 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/*
+ * Copyright © 2008 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Soft-
+ * ware"), to deal in the Software without restriction, including without
+ * limitation the rights to use, copy, modify, merge, publish, distribute,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, provided that the above copyright
+ * notice(s) and this permission notice appear in all copies of the Soft-
+ * ware and that both the above copyright notice(s) and this permission
+ * notice appear in supporting documentation.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
+ * ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY
+ * RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
+ * THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSE-
+ * QUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFOR-
+ * MANCE OF THIS SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall
+ * not be used in advertising or otherwise to promote the sale, use or
+ * other dealings in this Software without prior written authorization of
+ * the copyright holder.
+ *
+ * Authors:
+ *   Kristian Høgsberg (krh at redhat.com)
+ */
+#ifndef _DRI2_TOKENS_H_
+#define _DRI2_TOKENS_H_
+
+#define DRI2BufferFrontLeft		0
+#define DRI2BufferBackLeft		1
+#define DRI2BufferFrontRight		2
+#define DRI2BufferBackRight		3
+#define DRI2BufferDepth			4
+#define DRI2BufferStencil		5
+#define DRI2BufferAccum			6
+#define DRI2BufferFakeFrontLeft		7
+#define DRI2BufferFakeFrontRight	8
+
+#define DRI2DriverDRI			0
+
+#endif
diff --git a/utests/.gitignore b/utests/.gitignore
new file mode 100644
index 0000000..90f80fc
--- /dev/null
+++ b/utests/.gitignore
@@ -0,0 +1,15 @@
+compiler_box_blur.bmp
+compiler_box_blur_float.bmp
+compiler_clod.bmp
+compiler_julia.bmp
+compiler_julia_no_break.bmp
+compiler_mandelbrot.bmp
+compiler_mandelbrot_alternate.bmp
+compiler_menger_sponge_no_shadow.bmp
+compiler_nautilus.bmp
+compiler_ribbon.bmp
+flat_address_space
+libutests.so
+utest_run
+generated
+utest_generator.pyc
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
new file mode 100644
index 0000000..9c531de
--- /dev/null
+++ b/utests/CMakeLists.txt
@@ -0,0 +1,241 @@
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}
+                    ${CMAKE_CURRENT_SOURCE_DIR}/../include)
+
+##### Math Function Part:
+EXEC_PROGRAM(mkdir ${CMAKE_CURRENT_SOURCE_DIR} ARGS generated -p)
+EXEC_PROGRAM(${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR} ARGS utest_math_gen.py OUTPUT_VARIABLE GEN_MATH_STRING)
+string(REGEX REPLACE " " ";" ADDMATHFUNC ${GEN_MATH_STRING})
+
+string(REGEX REPLACE "generated/([^\ ]*)\\.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../kernels/\\1.cl" KERNEL_MATH_LIST ${GEN_MATH_STRING})
+string(REGEX REPLACE " " ";" KERNEL_MATH_LIST ${KERNEL_MATH_LIST})
+string(REGEX REPLACE "generated/([^\ ]*)\\.cpp" "\\1.cl" KERNEL_GITIGNORE_LIST ${GEN_MATH_STRING})
+set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "generated;${KERNEL_MATH_LIST}")
+
+configure_file (
+  "setenv.sh.in"
+  "setenv.sh"
+  )
+
+#XXX only need GL if required
+link_directories (${LLVM_LIBRARY_DIR} ${OPENGL_LIBDIR} ${DRM_LIBDIR})
+set (utests_sources
+  utest_error.c
+  compiler_basic_arithmetic.cpp
+  compiler_displacement_map_element.cpp
+  compiler_shader_toy.cpp
+  compiler_mandelbrot.cpp
+  compiler_mandelbrot_alternate.cpp
+  compiler_box_blur_float.cpp
+  compiler_box_blur_image.cpp
+  compiler_box_blur.cpp
+  compiler_insert_to_constant.cpp
+  compiler_argument_structure.cpp
+  compiler_arith_shift_right.cpp
+  compiler_mixed_pointer.cpp
+  compiler_array0.cpp
+  compiler_array.cpp
+  compiler_array1.cpp
+  compiler_array2.cpp
+  compiler_array3.cpp
+  compiler_byte_scatter.cpp
+  compiler_ceil.cpp
+  compiler_clz_short.cpp
+  compiler_clz_int.cpp
+  compiler_convert_uchar_sat.cpp
+  compiler_copy_buffer.cpp
+  compiler_copy_image.cpp
+  compiler_copy_image_1d.cpp
+  compiler_copy_image_3d.cpp
+  compiler_copy_buffer_row.cpp
+  compiler_degrees.cpp
+  compiler_step.cpp
+  compiler_fabs.cpp
+  compiler_abs.cpp
+  compiler_abs_diff.cpp
+  compiler_fill_image.cpp
+  compiler_fill_image0.cpp
+  compiler_fill_image_1d.cpp
+  compiler_fill_image_3d.cpp
+  compiler_fill_image_3d_2.cpp
+  compiler_function_argument0.cpp
+  compiler_function_argument1.cpp
+  compiler_function_argument2.cpp
+  compiler_function_argument.cpp
+  compiler_function_constant0.cpp
+  compiler_function_constant1.cpp
+  compiler_function_constant.cpp
+  compiler_global_constant.cpp
+  compiler_global_constant_2.cpp
+  compiler_group_size.cpp
+  compiler_hadd.cpp
+  compiler_if_else.cpp
+  compiler_integer_division.cpp
+  compiler_integer_remainder.cpp
+  compiler_insert_vector.cpp
+  compiler_lower_return0.cpp
+  compiler_lower_return1.cpp
+  compiler_lower_return2.cpp
+  compiler_mad_hi.cpp
+  compiler_mul_hi.cpp
+  compiler_mad24.cpp
+  compiler_mul24.cpp
+  compiler_multiple_kernels.cpp
+  compiler_radians.cpp
+  compiler_rhadd.cpp
+  compiler_rotate.cpp
+  compiler_saturate.cpp
+  compiler_saturate_sub.cpp
+  compiler_shift_right.cpp
+  compiler_short_scatter.cpp
+  compiler_smoothstep.cpp
+  compiler_uint2_copy.cpp
+  compiler_uint3_copy.cpp
+  compiler_uint8_copy.cpp
+  compiler_uint16_copy.cpp
+  compiler_uint3_unaligned_copy.cpp
+  compiler_upsample_int.cpp
+  compiler_upsample_long.cpp
+  compiler_unstructured_branch0.cpp
+  compiler_unstructured_branch1.cpp
+  compiler_unstructured_branch2.cpp
+  compiler_unstructured_branch3.cpp
+  compiler_write_only_bytes.cpp
+  compiler_write_only.cpp
+  compiler_write_only_shorts.cpp
+  compiler_switch.cpp
+  compiler_math.cpp
+  compiler_atomic_functions.cpp
+  compiler_async_copy.cpp
+  compiler_async_stride_copy.cpp
+  compiler_insn_selection_min.cpp
+  compiler_insn_selection_max.cpp
+  compiler_insn_selection_masked_min_max.cpp
+  compiler_load_bool_imm.cpp
+  compiler_global_memory_barrier.cpp
+  compiler_local_memory_two_ptr.cpp
+  compiler_local_memory_barrier.cpp
+  compiler_local_memory_barrier_wg64.cpp
+  compiler_local_memory_barrier_2.cpp
+  compiler_local_slm.cpp
+  compiler_movforphi_undef.cpp
+  compiler_volatile.cpp
+  compiler_copy_image1.cpp
+  compiler_get_image_info.cpp
+  compiler_get_image_info_array.cpp
+  compiler_vect_compare.cpp
+  compiler_vector_load_store.cpp
+  compiler_vector_inc.cpp
+  compiler_cl_finish.cpp
+  get_cl_info.cpp
+  builtin_atan2.cpp
+  builtin_bitselect.cpp
+  builtin_frexp.cpp
+  builtin_mad_sat.cpp
+  builtin_modf.cpp
+  builtin_nextafter.cpp
+  builtin_remquo.cpp
+  builtin_shuffle.cpp
+  builtin_shuffle2.cpp
+  builtin_sign.cpp
+  builtin_lgamma.cpp
+  builtin_lgamma_r.cpp
+  builtin_tgamma.cpp
+  buildin_work_dim.cpp
+  builtin_global_size.cpp
+  builtin_local_size.cpp
+  builtin_global_id.cpp
+  builtin_num_groups.cpp
+  builtin_local_id.cpp
+  builtin_acos_asin.cpp
+  builtin_pow.cpp
+  builtin_exp.cpp
+  builtin_convert_sat.cpp
+  sub_buffer.cpp
+  runtime_createcontext.cpp
+  runtime_null_kernel_arg.cpp
+  runtime_event.cpp
+  runtime_barrier_list.cpp
+  runtime_marker_list.cpp
+  runtime_compile_link.cpp
+  compiler_long.cpp
+  compiler_long_2.cpp
+  compiler_long_convert.cpp
+  compiler_long_shl.cpp
+  compiler_long_shr.cpp
+  compiler_long_asr.cpp
+  compiler_long_mult.cpp
+  compiler_long_cmp.cpp
+  compiler_function_argument3.cpp
+  compiler_function_qualifiers.cpp
+  compiler_bool_cross_basic_block.cpp
+  compiler_private_data_overflow.cpp
+  compiler_getelementptr_bitcast.cpp
+  compiler_simd_any.cpp
+  compiler_simd_all.cpp
+  compiler_double_precision.cpp
+  load_program_from_bin_file.cpp
+  load_program_from_gen_bin.cpp
+  get_arg_info.cpp
+  profiling_exec.cpp
+  enqueue_copy_buf.cpp
+  enqueue_copy_buf_unaligned.cpp
+  test_printf.cpp
+  enqueue_fill_buf.cpp
+  enqueue_built_in_kernels.cpp
+  builtin_kernel_max_global_size.cpp
+  image_1D_buffer.cpp
+  compare_image_2d_and_1d_array.cpp
+  compiler_constant_expr.cpp
+  utest_assert.cpp
+  utest.cpp
+  utest_file_map.cpp
+  utest_helper.cpp)
+
+SET (kernel_bin ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/compiler_ceil)
+
+if(GEN_PCI_ID)
+  ADD_CUSTOM_COMMAND(
+  OUTPUT ${kernel_bin}.bin
+  COMMAND ${GBE_BIN_GENERATER} ${kernel_bin}.cl -o${kernel_bin}.bin -t${GEN_PCI_ID}
+  DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl)
+else(GEN_PCI_ID)
+  ADD_CUSTOM_COMMAND(
+  OUTPUT ${kernel_bin}.bin
+  COMMAND ${GBE_BIN_GENERATER} ${kernel_bin}.cl -o${kernel_bin}.bin
+  DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl)
+endif(GEN_PCI_ID)
+
+ADD_CUSTOM_TARGET(kernel_bin.bin
+    DEPENDS ${kernel_bin}.bin)
+
+add_custom_command(OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/generated
+    COMMAND mkdir ${CMAKE_CURRENT_SOURCE_DIR}/generated -p
+    COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/utest_math_gen.py > /dev/null 2>&1
+    COMMAND echo ${KERNEL_GITIGNORE_LIST} |sed 's/ /\\n/g' > ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/.gitignore
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    )
+add_custom_target(utest_generator
+    DEPENDS generated
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+    )
+
+if (EGL_FOUND AND MESA_SOURCE_FOUND)
+SET(utests_sources ${utests_sources} compiler_fill_gl_image.cpp)
+SET(CMAKE_CXX_FLAGS "-DHAS_EGL ${CMAKE_CXX_FLAGS} ${DEF_OCL_PCH_PCM_PATH}")
+SET(CMAKE_C_FLAGS "-DHAS_EGL ${CMAKE_C_FLAGS} ${DEF_OCL_PCH_PCM_PATH}")
+SET(UTESTS_REQUIRED_EGL_LIB ${EGL_LIBRARIES})
+else()
+SET(UTESTS_REQUIRED_EGL_LIB "")
+endif()
+
+ADD_LIBRARY(utests SHARED ${ADDMATHFUNC} ${utests_sources})
+
+TARGET_LINK_LIBRARIES(utests cl m ${OPENGL_LIBRARIES} ${UTESTS_REQUIRED_EGL_LIB} ${CMAKE_THREAD_LIBS_INIT})
+
+ADD_EXECUTABLE(utest_run utest_run.cpp)
+TARGET_LINK_LIBRARIES(utest_run utests)
+ADD_DEPENDENCIES (utest_run kernel_bin.bin)
+ADD_DEPENDENCIES (utests utest_generator)
+
+ADD_EXECUTABLE(flat_address_space runtime_flat_address_space.cpp)
+TARGET_LINK_LIBRARIES(flat_address_space utests)
diff --git a/utests/buildin_work_dim.cpp b/utests/buildin_work_dim.cpp
new file mode 100644
index 0000000..d678c0f
--- /dev/null
+++ b/utests/buildin_work_dim.cpp
@@ -0,0 +1,37 @@
+#include "utest_helper.hpp"
+
+static void buildin_work_dim(void)
+{
+  // Setup kernel and buffers
+
+  int result, err;
+  OCL_CREATE_KERNEL("buildin_work_dim");
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  globals[0] = 1;
+  globals[1] = 1;
+  globals[2] = 1;
+  locals[0] = 1;
+  locals[1] = 1;
+  locals[2] = 1;
+
+  for( int i=1; i <= 3; i++ )
+  {
+
+    // Run the kernel
+    OCL_NDRANGE(i);
+
+    err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int), &result, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+    {
+       printf("Error: Failed to read output array! %d\n", err);
+       exit(1);
+    }
+
+    OCL_ASSERT( result == i);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(buildin_work_dim);
diff --git a/utests/builtin_acos_asin.cpp b/utests/builtin_acos_asin.cpp
new file mode 100644
index 0000000..0187226
--- /dev/null
+++ b/utests/builtin_acos_asin.cpp
@@ -0,0 +1,87 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+#define udebug 0
+#define printf_c(...) \
+{\
+  printf("\033[1m\033[40;31m");\
+  printf( __VA_ARGS__ );\
+  printf("\033[0m");\
+}
+
+const float input_data[] = {-30, -1, -0.92, -0.5, -0.09, 0, 0.09, 0.5, 0.92, 1, 30};
+const int count_input = sizeof(input_data) / sizeof(input_data[0]);
+const int max_function = 5;
+
+static void cpu_compiler_math(float *dst, const float *src)
+{
+  const float x = *src;
+
+  dst[0] = acos(x);
+  dst[1] = acosh(x);
+  dst[2] = asin(x);
+  dst[3] = asinh(x);
+  dst[4] = x;
+}
+
+static void builtin_acos_asin(void)
+{
+  // Setup kernel and buffers
+  int k, i, index_cur;
+  float gpu_data[max_function * count_input] = {0}, cpu_data[max_function * count_input] = {0};
+
+  OCL_CREATE_KERNEL("builtin_acos_asin");
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, count_input * max_function * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, count_input * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], CL_MEM_READ_WRITE, sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+
+  globals[0] = count_input;
+  locals[0] = 1;
+
+  clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * sizeof(float), input_data, 0, NULL, NULL);
+  clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), &max_function , 0, NULL, NULL);
+
+   // Run the kernel
+  OCL_NDRANGE( 1 );
+
+  clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(float) * max_function * count_input, gpu_data, 0, NULL, NULL);
+
+  for (k = 0; (uint)k < count_input; k++)
+  {
+    cpu_compiler_math( cpu_data + k * max_function, input_data + k);
+
+    for (i = 0; i < max_function; i++)
+    {
+      index_cur = k * max_function + i;
+#if udebug
+      if (isinf(cpu_data[index_cur]) && !isinf(gpu_data[index_cur])){
+        printf_c("%d/%d: %f -> gpu:%f  cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
+      }
+      else if (isnan(cpu_data[index_cur]) && !isnan(gpu_data[index_cur])){
+        printf_c("%d/%d: %f -> gpu:%f  cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
+      }
+      else if(fabs(gpu_data[index_cur] - cpu_data[index_cur]) > 1e-3f){
+        printf_c("%d/%d: %f -> gpu:%f  cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
+      }
+      else
+        printf("%d/%d: %f -> gpu:%f  cpu:%f\n", k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur]);
+#else
+     if (isinf(cpu_data[index_cur]))
+       OCL_ASSERT(isinf(gpu_data[index_cur]));
+     else if (isnan(cpu_data[index_cur]))
+       OCL_ASSERT(isnan(gpu_data[index_cur]));
+     else
+     {
+       OCL_ASSERT(fabs(gpu_data[index_cur] - cpu_data[index_cur]) < 1e-3f);
+     }
+#endif
+    }
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_acos_asin)
diff --git a/utests/builtin_atan2.cpp b/utests/builtin_atan2.cpp
new file mode 100644
index 0000000..29dd7b4
--- /dev/null
+++ b/utests/builtin_atan2.cpp
@@ -0,0 +1,43 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_atan2(void) {
+	const int n = 1024;
+	float y[n], x[n];
+
+	// Setup kernel and buffers
+	OCL_CREATE_KERNEL("builtin_atan2");
+	OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+	OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+	OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+	OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+	OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+	OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+	globals[0] = n;
+	locals[0] = 16;
+
+	OCL_MAP_BUFFER(0);
+	OCL_MAP_BUFFER(1);
+	for (int i = 0; i < n; ++i) {
+		y[i] = ((float*) buf_data[0])[i] = (rand()&255) * 0.01f;
+		x[i] = ((float*) buf_data[1])[i] = (rand()&255) * 0.01f;
+	}
+	OCL_UNMAP_BUFFER(0);
+	OCL_UNMAP_BUFFER(1);
+
+	OCL_NDRANGE(1);
+
+	OCL_MAP_BUFFER(2);
+	float *dst = (float*) buf_data[2];
+	for (int i = 0; i < n; ++i) {
+		float cpu = atan2f(y[i], x[i]);
+		float gpu = dst[i];
+		if (fabsf(cpu - gpu) >= 1e-2) {
+			printf("%f %f %f %f\n", y[i], x[i], cpu, gpu);
+			OCL_ASSERT(0);
+		}
+	}
+	OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION (builtin_atan2);
diff --git a/utests/builtin_bitselect.cpp b/utests/builtin_bitselect.cpp
new file mode 100644
index 0000000..37fb8df
--- /dev/null
+++ b/utests/builtin_bitselect.cpp
@@ -0,0 +1,50 @@
+#include "utest_helper.hpp"
+
+int as_int(float f) {
+  void *p = &f;
+  return *(int *)p;
+}
+
+int cpu(int a, int b, int c) {
+  return (a & ~c) | (b & c);
+}
+
+void builtin_bitselect(void)
+{
+  const int n = 32;
+  float src1[n], src2[n], src3[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_bitselect");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i) {
+    src1[i] = ((float*)buf_data[0])[i] = rand() * 0.1f;
+    src2[i] = ((float*)buf_data[1])[i] = rand() * 0.1f;
+    src3[i] = ((float*)buf_data[2])[i] = rand() * 0.1f;
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(3);
+  for (int i = 0; i < n; ++i)
+    OCL_ASSERT(((int*)buf_data[3])[i] == cpu(as_int(src1[i]), as_int(src2[i]), as_int(src3[i])));
+  OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_bitselect);
diff --git a/utests/builtin_convert_sat.cpp b/utests/builtin_convert_sat.cpp
new file mode 100644
index 0000000..7272057
--- /dev/null
+++ b/utests/builtin_convert_sat.cpp
@@ -0,0 +1,80 @@
+#include <cstdint>
+#include "utest_helper.hpp"
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+
+int64_t my_rand(void) {
+  int64_t x = rand() - RAND_MAX/2;
+  int64_t y = rand() - RAND_MAX/2;
+  return x * y;
+}
+
+#define DEF2(DST_TYPE, SRC_TYPE, DST_MIN, DST_MAX, REAL_SRC_TYPE) \
+void builtin_convert_ ## SRC_TYPE ## _to_ ## DST_TYPE ## _sat(void) \
+{ \
+  const int n = 128; \
+  OCL_CREATE_KERNEL_FROM_FILE("builtin_convert_sat", "builtin_convert_" # SRC_TYPE "_to_" # DST_TYPE "_sat"); \
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(REAL_SRC_TYPE), NULL); \
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(DST_TYPE), NULL); \
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); \
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); \
+  globals[0] = n; \
+  locals[0] = 16; \
+  OCL_MAP_BUFFER(0); \
+  for (int i = 0; i < n; i++) \
+    ((REAL_SRC_TYPE *)buf_data[0])[i] = my_rand(); \
+  OCL_UNMAP_BUFFER(0); \
+  OCL_NDRANGE(1); \
+  OCL_MAP_BUFFER(0); \
+  OCL_MAP_BUFFER(1); \
+  for (int i = 0; i < n; i++) { \
+    REAL_SRC_TYPE src = ((REAL_SRC_TYPE *)buf_data[0])[i]; \
+    DST_TYPE dst; \
+    if ((double)src > (double)DST_MAX) \
+      dst = DST_MAX; \
+    else if ((double)src < (double)DST_MIN) \
+      dst = DST_MIN; \
+    else \
+      dst = src; \
+    OCL_ASSERT(((DST_TYPE *)buf_data[1])[i] == dst); \
+  } \
+  OCL_UNMAP_BUFFER(0); \
+  OCL_UNMAP_BUFFER(1); \
+} \
+MAKE_UTEST_FROM_FUNCTION(builtin_convert_ ## SRC_TYPE ## _to_ ## DST_TYPE ## _sat);
+
+#define DEF(DST_TYPE, SRC_TYPE, DST_MIN, DST_MAX) \
+  DEF2(DST_TYPE, SRC_TYPE, DST_MIN, DST_MAX, SRC_TYPE)
+
+DEF(char, uchar, -128, 127);
+DEF(char, short, -128, 127);
+DEF(char, ushort, -128, 127);
+DEF(char, int, -128, 127);
+DEF(char, uint, -128, 127);
+DEF2(char, long, -128, 127, int64_t);
+DEF(char, float, -128, 127);
+DEF(uchar, char, 0, 255);
+DEF(uchar, short, 0, 255);
+DEF(uchar, ushort, 0, 255);
+DEF(uchar, int, 0, 255);
+DEF(uchar, uint, 0, 255);
+DEF2(uchar, long, 0, 255, int64_t);
+DEF(uchar, float, 0, 255);
+DEF(short, ushort, -32768, 32767);
+DEF(short, int, -32768, 32767);
+DEF(short, uint, -32768, 32767);
+DEF2(short, long, -32768, 32767, int64_t);
+DEF(short, float, -32768, 32767);
+DEF(ushort, short, 0, 65535);
+DEF(ushort, int, 0, 65535);
+DEF(ushort, uint, 0, 65535);
+DEF2(ushort, long, 0, 65535, int64_t);
+DEF(ushort, float, 0, 65535);
+DEF(int, uint, -0x7FFFFFFF-1, 0x7FFFFFFF);
+DEF2(int, long, -0x7FFFFFFF-1, 0x7FFFFFFF, int64_t);
+DEF(int, float, -0x7FFFFFFF-1, 0x7FFFFFFF);
+DEF(uint, int, 0, 0xffffffffu);
+DEF2(uint, long, 0, 0xffffffffu, int64_t);
+DEF(uint, float, 0, 0xffffffffu);
+#undef DEF
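
The DEF2 macro above encodes the reference semantics for convert_<dst>_sat on the CPU side: the source value is widened to double, clamped to [DST_MIN, DST_MAX], and otherwise converted unchanged. A minimal standalone sketch of that rule for one instantiation (char destination, int source); this is an illustration only, not part of the imported sources:

    // Mirrors the reference computation DEF2 performs for convert_char_sat(int).
    #include <cstdint>
    #include <cstdio>

    static int8_t convert_int_to_char_sat(int32_t src) {
      if ((double)src > 127.0)  return 127;    // DST_MAX for char
      if ((double)src < -128.0) return -128;   // DST_MIN for char
      return (int8_t)src;                      // in range: plain conversion
    }

    int main() {
      std::printf("%d %d %d\n",
                  convert_int_to_char_sat(1000),    // -> 127
                  convert_int_to_char_sat(-1000),   // -> -128
                  convert_int_to_char_sat(42));     // -> 42
      return 0;
    }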
diff --git a/utests/builtin_exp.cpp b/utests/builtin_exp.cpp
new file mode 100644
index 0000000..d5288c8
--- /dev/null
+++ b/utests/builtin_exp.cpp
@@ -0,0 +1,102 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+#define udebug 0
+
+#define FLT_MAX 0x1.fffffep127f
+#define FLT_MIN 0x1.0p-126f
+#define FLT_ULP  (1.0e-6f)
+
+#define printf_c(...) \
+{\
+  printf("\033[1m\033[40;31m");\
+  printf( __VA_ARGS__ );\
+  printf("\033[0m");\
+}
+
+const float input_data[] = {FLT_MAX, -FLT_MAX, FLT_MIN, -FLT_MIN, 80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0 };
+const int count_input = sizeof(input_data) / sizeof(input_data[0]);
+const int max_function = 5;
+
+static void cpu_compiler_math(float *dst, const float *src)
+{
+  const float x = *src;
+
+  dst[0] = exp(x);
+  dst[1] = exp2(x);
+  dst[2] = exp10(x);
+  dst[3] = expm1(x);
+  dst[4] = x;
+}
+
+static void builtin_exp(void)
+{
+  // Setup kernel and buffers
+  int k, i, index_cur;
+  float gpu_data[max_function * count_input] = {0}, cpu_data[max_function * count_input] = {0};
+  float diff;
+  char log[256] = {0};
+
+  OCL_CREATE_KERNEL("builtin_exp");
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, count_input * max_function * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, count_input * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], CL_MEM_READ_WRITE, sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+
+  globals[0] = count_input;
+  locals[0] = 1;
+
+  clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * sizeof(float), input_data, 0, NULL, NULL);
+  clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, sizeof(int), &max_function , 0, NULL, NULL);
+
+   // Run the kernel
+  OCL_NDRANGE( 1 );
+
+  clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(float) * max_function * count_input, gpu_data, 0, NULL, NULL);
+
+  for (k = 0; (uint)k < count_input; k++)
+  {
+    cpu_compiler_math( cpu_data + k * max_function, input_data + k);
+
+    for (i = 0; i < max_function; i++)
+    {
+      index_cur = k * max_function + i;
+      diff = fabs(gpu_data[index_cur]-cpu_data[index_cur]);
+      sprintf(log, "%d/%d: %f -> gpu:%f  cpu:%f diff:%f expect:%f\n", \
+         k, i, input_data[k], gpu_data[index_cur], cpu_data[index_cur], \
+         diff/gpu_data[index_cur], 3 * FLT_ULP);
+
+#if udebug
+      if (isinf(cpu_data[index_cur]) && isinf(gpu_data[index_cur])){
+        printf("%s", log);
+      }
+      else if (isnan(cpu_data[index_cur]) && isnan(gpu_data[index_cur])){
+        printf("%s", log);
+      }
+      else if( diff / cpu_data[index_cur] < 3 * FLT_ULP \
+        && ( gpu_data[index_cur] > FLT_ULP  || cpu_data[index_cur] > FLT_ULP )){
+        printf("%s", log);
+      }
+      else if ( gpu_data[index_cur] < FLT_ULP && cpu_data[index_cur] < FLT_ULP)
+        printf("%s", log);
+      else
+        printf_c("%s", log);
+#else
+      if (isinf(cpu_data[index_cur]))
+        OCL_ASSERTM(isinf(gpu_data[index_cur]), log);
+      else if (isnan(cpu_data[index_cur]))
+        OCL_ASSERTM(isnan(gpu_data[index_cur]), log);
+      else if ( gpu_data[index_cur] > FLT_ULP || cpu_data[index_cur] > FLT_ULP)
+        OCL_ASSERTM(fabs( diff / cpu_data[index_cur]) < 3 * FLT_ULP, log);
+      else
+        OCL_ASSERTM(fabs(diff) < 3 * FLT_ULP, log);
+#endif
+    }
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_exp)
diff --git a/utests/builtin_frexp.cpp b/utests/builtin_frexp.cpp
new file mode 100644
index 0000000..75dac3b
--- /dev/null
+++ b/utests/builtin_frexp.cpp
@@ -0,0 +1,50 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_frexp(void)
+{
+  const int n = 32;
+  float src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_frexp");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  src[0] = ((float*)buf_data[0])[0] = 0.f;
+  src[1] = ((float*)buf_data[0])[1] = -0.f;
+  src[2] = ((float*)buf_data[0])[2] = nanf("");
+  src[3] = ((float*)buf_data[0])[3] = INFINITY;
+  src[4] = ((float*)buf_data[0])[4] = -INFINITY;
+  for (int i = 5; i < n; ++i)
+    src[i] = ((float*)buf_data[0])[i] = (rand() & 255) * 0.1f - 12.8f;
+  OCL_UNMAP_BUFFER(0);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  float *dst = (float*)buf_data[1];
+  int *exp = (int*)buf_data[2];
+  int w;
+  OCL_ASSERT(dst[0] == 0.f && exp[0] == 0);
+  OCL_ASSERT(dst[1] == -0.f && exp[1] == 0);
+  OCL_ASSERT(isnanf(dst[2]));
+  OCL_ASSERT(dst[3] == INFINITY);
+  OCL_ASSERT(dst[4] == -INFINITY);
+  for (int i = 5; i < n; ++i) {
+    OCL_ASSERT(fabsf(dst[i] - frexpf(src[i], &w)) < 1e-5);
+    OCL_ASSERT(exp[i] == w);
+  }
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_frexp);
diff --git a/utests/builtin_global_id.cpp b/utests/builtin_global_id.cpp
new file mode 100644
index 0000000..9601cab
--- /dev/null
+++ b/utests/builtin_global_id.cpp
@@ -0,0 +1,77 @@
+/*
+According to the OpenCL v1.1 & v1.2 chapter 6.11.
+Now define global size as following:
+  globals[0] = 3;
+  globals[1] = 4;
+  globals[2] = 5;
+
+Kernel:
+id = get_global_id(0) + get_global_id(1)*3 + get_global_id(2)*3*4
+
+dimension:1
+ 0  1  2
+dimension:2
+ 0  1  2
+ 3  4  5
+ 6  7  8
+ 9 10 11
+dimension:3
+ 0  1  2   12 13 14   24 25 26   36 37 38   48 49 50
+ 3  4  5   15 16 17   27 28 29   39 40 41   51 52 53
+ 6  7  8   18 19 20   30 31 32   42 43 44   54 55 56
+ 9 10 11   21 22 23   33 34 35   45 46 47   57 58 59
+*/
+
+#define udebug 0
+#include "utest_helper.hpp"
+static void builtin_global_id(void)
+{
+
+  // Setup kernel and buffers
+  int dim, global_id[80], err, i, buf_len=1;
+  OCL_CREATE_KERNEL("builtin_global_id");
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int)*80, NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  for( dim=1; dim <= 3; dim++ )
+  {
+    buf_len = 1;
+    for(i=1; i <= dim; i++)
+    {
+      globals[i - 1] = 2 + i;
+      locals[i - 1] = 2 + i;
+      buf_len *= 2 + i;
+    }
+    for(i=dim+1; i <= 3; i++)
+    {
+      globals[i - 1] = 0;
+      locals[i - 1] = 0;
+    }
+
+    // Run the kernel
+    OCL_NDRANGE( dim );
+    clFinish(queue);
+
+    err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int) * buf_len, &global_id, 0, NULL, NULL);
+
+    if (err != CL_SUCCESS)
+    {
+      printf("Error: Failed to read output array! %d\n", err);
+      exit(1);
+    }
+
+#if udebug
+    for(i = 0; i < buf_len; i++)
+    {
+      printf("%2d ", global_id[i]);
+      if ((i + 1) % 3 == 0) printf("\n");
+    }
+#endif
+
+    for( i = 0; i < buf_len; i++)
+      OCL_ASSERT( global_id[i] == i);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_global_id);
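
As a worked instance of the indexing comment at the top of this test (an illustration, not part of the imported sources): with globals = {3, 4, 5}, the kernel's id = get_global_id(0) + get_global_id(1)*3 + get_global_id(2)*12 visits each of the 60 buffer slots exactly once, which is why the test expects slot i to hold the value i:

    #include <cassert>

    int main() {
      const int gx = 3, gy = 4, gz = 5;          // globals[0..2] from the test
      auto idx = [&](int x, int y, int z) { return x + y * gx + z * gx * gy; };
      assert(idx(0, 0, 0) == 0);
      assert(idx(2, 3, 4) == 59);                // last of the 3*4*5 = 60 items
      bool seen[60] = {false};                   // the mapping is a bijection
      for (int z = 0; z < gz; ++z)
        for (int y = 0; y < gy; ++y)
          for (int x = 0; x < gx; ++x)
            seen[idx(x, y, z)] = true;
      for (int i = 0; i < 60; ++i) assert(seen[i]);
      return 0;
    }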
diff --git a/utests/builtin_global_size.cpp b/utests/builtin_global_size.cpp
new file mode 100644
index 0000000..094e019
--- /dev/null
+++ b/utests/builtin_global_size.cpp
@@ -0,0 +1,108 @@
+/*
+According to the OpenCL v1.1 & v1.2 chapter 6.11, the behavior of function get_global_size should be as following:
+
+  globals[0] = 3;
+  globals[1] = 4;
+  globals[2] = 5;
+
+#ifdef CL_VERSION_1_2 | CL_VERSION_1_1:
+get_global_size(-1) = 1 (dimension:1)
+get_global_size(0) = 3 (dimension:1)
+get_global_size(1) = 1 (dimension:1)
+get_global_size(2) = 1 (dimension:1)
+
+get_global_size(-1) = 1 (dimension:2)
+get_global_size(0) = 3 (dimension:2)
+get_global_size(1) = 4 (dimension:2)
+get_global_size(2) = 1 (dimension:2)
+get_global_size(3) = 1 (dimension:2)
+
+get_global_size(-1) = 1 (dimension:3)
+get_global_size(0) = 3 (dimension:3)
+get_global_size(1) = 4 (dimension:3)
+get_global_size(2) = 5 (dimension:3)
+get_global_size(3) = 1 (dimension:3)
+get_global_size(4) = 1 (dimension:3)
+
+#ifdef CL_VERSION_1_0:
+get_global_size(-1) = 0 (dimension:1)
+get_global_size(0) = 3 (dimension:1)
+get_global_size(1) = 0 (dimension:1)
+get_global_size(2) = 0 (dimension:1)
+
+get_global_size(-1) = 0 (dimension:2)
+get_global_size(0) = 3 (dimension:2)
+get_global_size(1) = 4 (dimension:2)
+get_global_size(2) = 0 (dimension:2)
+get_global_size(3) = 1 (dimension:2)
+
+get_global_size(-1) = 0 (dimension:3)
+get_global_size(0) = 3 (dimension:3)
+get_global_size(1) = 4 (dimension:3)
+get_global_size(2) = 5 (dimension:3)
+get_global_size(3) = 0 (dimension:3)
+get_global_size(4) = 0 (dimension:3)
+
+*/
+#include "utest_helper.hpp"
+static void builtin_global_size(void)
+{
+
+  // Setup kernel and buffers
+  int dim, dim_arg_global, global_size, err;
+  OCL_CREATE_KERNEL("builtin_global_size");
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  globals[0] = 3;
+  globals[1] = 4;
+  globals[2] = 5;
+  locals[0] = 1;
+  locals[1] = 1;
+  locals[2] = 1;
+
+  for( dim=1; dim <= 3; dim++ )
+  {
+
+    for( dim_arg_global = -1; dim_arg_global <= dim + 1; dim_arg_global++ )
+    {
+
+      err = clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, sizeof(int), &dim_arg_global, 0, NULL, NULL);
+      if (err != CL_SUCCESS)
+      {
+        printf("Error: Failed to write to source array!\n");
+        exit(1);
+      }
+
+      // Run the kernel
+      OCL_NDRANGE( dim );
+
+      err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int), &global_size, 0, NULL, NULL);
+      if (err != CL_SUCCESS)
+      {
+        printf("Error: Failed to read output array! %d\n", err);
+        exit(1);
+      }
+
+      //printf("get_global_size(%d) = %d (dimension:%d)\n", dim_arg_global, global_size, dim);
+
+      if ( dim_arg_global >= 0 && dim_arg_global < dim)
+        OCL_ASSERT( global_size == dim_arg_global + 3);
+      else
+      {
+      #if defined(CL_VERSION_1_2) || defined(CL_VERSION_1_1)
+        OCL_ASSERT( global_size == 1);
+      #elif defined(CL_VERSION_1_0)
+        OCL_ASSERT( global_size == 0);
+      #else
+        OCL_ASSERT( global_size == 1);
+      #endif
+      }
+    }
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_global_size);
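
A small host-side restatement of what the assertions above expect (an illustration only; it assumes the CL 1.1/1.2 behaviour described in the comment at the top of the file): get_global_size(d) returns globals[d], i.e. d + 3, for a valid dimension index and 1 otherwise:

    #include <cassert>

    // Expected get_global_size(d) for an NDRange with `dim` dimensions and
    // globals = {3, 4, 5}, under OpenCL 1.1/1.2 rules.
    static int expected_global_size(int d, int dim) {
      const int globals[3] = {3, 4, 5};
      return (d >= 0 && d < dim) ? globals[d] : 1;
    }

    int main() {
      assert(expected_global_size(1, 3) == 4);
      assert(expected_global_size(2, 1) == 1);   // index beyond work_dim -> 1
      assert(expected_global_size(-1, 2) == 1);  // negative index -> 1
      return 0;
    }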
diff --git a/utests/builtin_kernel_max_global_size.cpp b/utests/builtin_kernel_max_global_size.cpp
new file mode 100644
index 0000000..c777564
--- /dev/null
+++ b/utests/builtin_kernel_max_global_size.cpp
@@ -0,0 +1,30 @@
+#include "utest_helper.hpp"
+
+void builtin_kernel_max_global_size(void)
+{
+  char* built_in_kernel_names;
+  size_t built_in_kernels_size;
+  cl_int err = CL_SUCCESS;
+  size_t ret_sz;
+
+
+  OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, 0, 0, &built_in_kernels_size);
+  built_in_kernel_names = (char* )malloc(built_in_kernels_size * sizeof(char) );
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, built_in_kernels_size, (void*)built_in_kernel_names, &ret_sz);
+  OCL_ASSERT(ret_sz == built_in_kernels_size);
+  cl_program built_in_prog = clCreateProgramWithBuiltInKernels(ctx, 1, &device, built_in_kernel_names, &err);
+  OCL_ASSERT(built_in_prog != NULL);
+  cl_kernel builtin_kernel_1d = clCreateKernel(built_in_prog, "__cl_copy_region_unalign_src_offset",  &err);
+  OCL_ASSERT(builtin_kernel_1d != NULL);
+  size_t param_value_size;
+  void* param_value;
+  clGetKernelWorkGroupInfo(builtin_kernel_1d, device, CL_KERNEL_GLOBAL_WORK_SIZE, 0, NULL, &param_value_size);
+  param_value = malloc(param_value_size);
+  clGetKernelWorkGroupInfo(builtin_kernel_1d, device, CL_KERNEL_GLOBAL_WORK_SIZE, param_value_size, param_value, 0);
+  OCL_ASSERT(*(size_t*)param_value == 256 * 1024 *1024);
+  clReleaseKernel(builtin_kernel_1d);
+  clReleaseProgram(built_in_prog);
+  free(param_value);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_kernel_max_global_size);
diff --git a/utests/builtin_lgamma.cpp b/utests/builtin_lgamma.cpp
new file mode 100644
index 0000000..876699a
--- /dev/null
+++ b/utests/builtin_lgamma.cpp
@@ -0,0 +1,40 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_lgamma(void) {
+	const int n = 1024;
+	float src[n];
+
+	// Setup kernel and buffers
+	OCL_CREATE_KERNEL("builtin_lgamma");
+	OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+	OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+	OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+	OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+	globals[0] = n;
+	locals[0] = 16;
+
+	for (int j = 0; j < 1024; j++) {
+		OCL_MAP_BUFFER(0);
+		for (int i = 0; i < n; ++i) {
+			src[i] = ((float*) buf_data[0])[i] = (j * n + i + 1) * 0.001f;
+		}
+		OCL_UNMAP_BUFFER(0);
+
+		OCL_NDRANGE(1);
+
+		OCL_MAP_BUFFER(1);
+		float *dst = (float*) buf_data[1];
+		for (int i = 0; i < n; ++i) {
+			float cpu = lgamma(src[i]);
+			float gpu = dst[i];
+			if (fabsf(cpu - gpu) >= 1e-3) {
+				printf("%f %f %f\n", src[i], cpu, gpu);
+				OCL_ASSERT(0);
+			}
+		}
+		OCL_UNMAP_BUFFER(1);
+	}
+}
+
+MAKE_UTEST_FROM_FUNCTION (builtin_lgamma);
diff --git a/utests/builtin_lgamma_r.cpp b/utests/builtin_lgamma_r.cpp
new file mode 100644
index 0000000..b6e5d0e
--- /dev/null
+++ b/utests/builtin_lgamma_r.cpp
@@ -0,0 +1,46 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void builtin_lgamma_r(void) {
+	const int n = 1024;
+	float src[n];
+
+	// Setup kernel and buffers
+	OCL_CREATE_KERNEL("builtin_lgamma_r");
+	OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+	OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+	OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+	OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+	OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+	OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+	globals[0] = n;
+	locals[0] = 16;
+
+	for (int j = 0; j < 1024; j++) {
+		OCL_MAP_BUFFER(0);
+		for (int i = 0; i < n; ++i) {
+			src[i] = ((float*) buf_data[0])[i] = (j * n + i + 1) * 0.001f;
+		}
+		OCL_UNMAP_BUFFER(0);
+
+		OCL_NDRANGE(1);
+
+		OCL_MAP_BUFFER(1);
+		OCL_MAP_BUFFER(2);
+		float *dst = (float*) buf_data[1];
+		for (int i = 0; i < n; ++i) {
+			int cpu_signp;
+			float cpu = lgamma_r(src[i], &cpu_signp);
+			int gpu_signp = ((int*)buf_data[2])[i];
+			float gpu = dst[i];
+			if (cpu_signp != gpu_signp || fabsf(cpu - gpu) >= 1e-3) {
+				printf("%f %f %f\n", src[i], cpu, gpu);
+				OCL_ASSERT(0);
+			}
+		}
+		OCL_UNMAP_BUFFER(1);
+		OCL_UNMAP_BUFFER(2);
+	}
+}
+
+MAKE_UTEST_FROM_FUNCTION (builtin_lgamma_r);
diff --git a/utests/builtin_local_id.cpp b/utests/builtin_local_id.cpp
new file mode 100644
index 0000000..1f07615
--- /dev/null
+++ b/utests/builtin_local_id.cpp
@@ -0,0 +1,81 @@
+/*
+According to the OpenCL v1.1 & v1.2 chapter 6.11.
+Now define local and global size as following:
+  globals[0] = 4;
+  globals[1] = 9;
+  globals[2] = 16;
+  locals[0] = 2;
+  locals[1] = 3;
+  locals[2] = 4;
+
+Kernel:
+int id = get_local_id(0) +  get_group_id(0)*2 + \
+         get_local_id(1) * 4 + get_group_id(1)*12 +\
+         get_local_id(2) *36 + get_group_id(2)*144;
+
+dimension:1
+ 0  1  2  3
+dimension:2
+ 0  1  2  3  4  5  6  7  8  9 10 11
+12 13 14 15 16 17 18 19 20 21 22 23
+24 25 26 27 28 29 30 31 32 33 34 35
+dimension:3
+ 0  1  2  3  4  5  6  7 ... 139 140 141 142 143
+...
+...
+429 430 431 432 433 434 ... 571 572 573 574 575
+*/
+
+#define udebug 0
+#include "utest_helper.hpp"
+static void builtin_local_id(void)
+{
+
+  // Setup kernel and buffers
+  int dim, local_id[576], err, i, buf_len=1;
+  OCL_CREATE_KERNEL("builtin_local_id");
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int)*576, NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  for( dim=1; dim <= 3; dim++ )
+  {
+    buf_len = 1;
+    for(i=1; i <= dim; i++)
+    {
+      locals[i - 1] = i + 1;
+      globals[i - 1] = (i + 1) * (i + 1);
+      buf_len *= ((i + 1) * (i + 1));
+    }
+    for(i = dim+1; i <= 3; i++)
+    {
+      globals[i - 1] = 0;
+      locals[i - 1] = 0;
+    }
+
+    // Run the kernel
+    OCL_NDRANGE( dim );
+    clFinish(queue);
+
+    err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int) * buf_len, &local_id, 0, NULL, NULL);
+
+    if (err != CL_SUCCESS)
+    {
+      printf("Error: Failed to read output array! %d\n", err);
+      exit(1);
+    }
+
+#if udebug
+    for(i = 0; i < buf_len; i++)
+    {
+      printf("%2d ", local_id[i]);
+      if ((i + 1) % 4  == 0) printf("\n");
+    }
+#endif
+
+    for( i = 0; i < buf_len; i++)
+      OCL_ASSERT( local_id[i] == i);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_local_id);
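
As a worked instance of the formula quoted at the top of this test (an illustration, not part of the imported sources): with locals = {2, 3, 4} and globals = {4, 9, 16} there are {2, 3, 4} work-groups per dimension, and the kernel's linear index enumerates all 4*9*16 = 576 work-items, ending at 575:

    #include <cassert>

    int main() {
      // id = lid0 + gid0*2 + lid1*4 + gid1*12 + lid2*36 + gid2*144
      auto idx = [](int l0, int g0, int l1, int g1, int l2, int g2) {
        return l0 + g0 * 2 + l1 * 4 + g1 * 12 + l2 * 36 + g2 * 144;
      };
      assert(idx(0, 0, 0, 0, 0, 0) == 0);        // first work-item
      assert(idx(1, 1, 2, 2, 3, 3) == 575);      // last of the 576 work-items
      return 0;
    }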
diff --git a/utests/builtin_local_size.cpp b/utests/builtin_local_size.cpp
new file mode 100644
index 0000000..a9dac2e
--- /dev/null
+++ b/utests/builtin_local_size.cpp
@@ -0,0 +1,88 @@
+/*
+According to the OpenCL v1.1 & v1.2 chapter 6.11, the behavior of function get_local_size should be as following:
+
+  globals[0] = 3;
+  globals[1] = 4;
+  globals[2] = 5;
+  locals[0] = 3;
+  locals[1] = 4;
+  locals[2] = 5;
+
+get_local_size(-1) = 1 (dimension:1)
+get_local_size(0) = 3 (dimension:1)
+get_local_size(1) = 1 (dimension:1)
+get_local_size(2) = 1 (dimension:1)
+
+get_local_size(-1) = 1 (dimension:2)
+get_local_size(0) = 3 (dimension:2)
+get_local_size(1) = 4 (dimension:2)
+get_local_size(2) = 1 (dimension:2)
+get_local_size(3) = 1 (dimension:2)
+
+get_local_size(-1) = 1 (dimension:3)
+get_local_size(0) = 3 (dimension:3)
+get_local_size(1) = 4 (dimension:3)
+get_local_size(2) = 5 (dimension:3)
+get_local_size(3) = 1 (dimension:3)
+get_local_size(4) = 1 (dimension:3)
+
+*/
+#include "utest_helper.hpp"
+#define udebug 0
+
+static void builtin_local_size(void)
+{
+
+  // Setup kernel and buffers
+  int dim, dim_arg_global, local_size, err;
+  OCL_CREATE_KERNEL("builtin_local_size");
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  globals[0] = 3;
+  globals[1] = 4;
+  globals[2] = 5;
+  locals[0] = 3;
+  locals[1] = 4;
+  locals[2] = 5;
+
+  for( dim=1; dim <= 3; dim++ )
+  {
+
+    for( dim_arg_global = -1; dim_arg_global <= dim + 1; dim_arg_global++ )
+    {
+
+      err = clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, sizeof(int), &dim_arg_global, 0, NULL, NULL);
+      if (err != CL_SUCCESS)
+      {
+        printf("Error: Failed to write to source array!\n");
+        exit(1);
+      }
+
+      // Run the kernel
+      OCL_NDRANGE( dim );
+
+      err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int), &local_size, 0, NULL, NULL);
+      if (err != CL_SUCCESS)
+      {
+        printf("Error: Failed to read output array! %d\n", err);
+        exit(1);
+      }
+
+#if udebug
+      printf("get_local_size(%d) = %d (dimension:%d)\n", dim_arg_global, local_size, dim);
+#endif
+      if ( dim_arg_global >= 0 && dim_arg_global < dim)
+        OCL_ASSERT( local_size == dim_arg_global + 3);
+      else
+      {
+        OCL_ASSERT( local_size == 1);
+      }
+    }
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_local_size);
diff --git a/utests/builtin_mad_sat.cpp b/utests/builtin_mad_sat.cpp
new file mode 100644
index 0000000..ed9a558
--- /dev/null
+++ b/utests/builtin_mad_sat.cpp
@@ -0,0 +1,44 @@
+#include "utest_helper.hpp"
+
+void builtin_mad_sat(void)
+{
+  const int n = 32;
+  short src1[n], src2[n], src3[n];
+  srand(0);
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_mad_sat");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(short), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i) {
+    src1[i] = ((short*)buf_data[0])[i] = rand();
+    src2[i] = ((short*)buf_data[1])[i] = rand();
+    src3[i] = ((short*)buf_data[2])[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(3);
+  for (int i = 0; i < n; ++i) {
+    int a = (int)src1[i] * (int)src2[i] + (int)src3[i];
+    a = a > 0x7FFF ? 0x7FFF : (a < -0x8000 ? -0x8000 : a);
+    OCL_ASSERT(((short*)buf_data[3])[i] == (short)a);
+  }
+  OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_mad_sat);
diff --git a/utests/builtin_modf.cpp b/utests/builtin_modf.cpp
new file mode 100644
index 0000000..057e95e
--- /dev/null
+++ b/utests/builtin_modf.cpp
@@ -0,0 +1,56 @@
+#include <cmath>
+#include <cstring>
+#include "utest_helper.hpp"
+
+void builtin_modf(void)
+{
+  const int n = 32;
+  float src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_modf");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  src[0] = INFINITY;
+  src[1] = -INFINITY;
+  src[2] = nanf("");
+  src[3] = 0;
+  src[4] = 1.5f;
+  src[5] = 2.5f;
+  src[6] = -2.5f;
+  src[7] = 20;
+  src[8] = 21;
+  src[9] = 89.5f;
+
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], src, n * sizeof(float));
+  OCL_UNMAP_BUFFER(0);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  float *dst = (float *)buf_data[1];
+  float *it = (float *)buf_data[2];
+  OCL_ASSERT(dst[0] == 0 && it[0] == INFINITY);
+  OCL_ASSERT(dst[1] == -0.f && it[1] == -INFINITY);
+  OCL_ASSERT(isnanf(dst[2]) && isnanf(it[2]));
+  OCL_ASSERT(dst[3] == 0 && it[3] == 0);
+  OCL_ASSERT(dst[4] == 0.5f && it[4] == 1);
+  OCL_ASSERT(dst[5] == 0.5f && it[5] == 2);
+  OCL_ASSERT(dst[6] == -0.5f && it[6] == -2);
+  OCL_ASSERT(dst[7] == 0 && it[7] == 20);
+  OCL_ASSERT(dst[8] == 0 && it[8] == 21);
+  OCL_ASSERT(dst[9] == 0.5f && it[9] == 89);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_modf);
diff --git a/utests/builtin_nextafter.cpp b/utests/builtin_nextafter.cpp
new file mode 100644
index 0000000..ae95497
--- /dev/null
+++ b/utests/builtin_nextafter.cpp
@@ -0,0 +1,60 @@
+#include <cmath>
+#include <cstring>
+#include "utest_helper.hpp"
+
+static int as_int(float f) {
+  void *p = &f;
+  return *(int *)p;
+}
+
+void builtin_nextafter(void)
+{
+  const int n = 16;
+  float src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_nextafter");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  src1[0] = nanf(""), src2[0] = 1.1f;
+  src1[1] = 2.2f,     src2[1] = nanf("");
+  src1[2] = nanf(""), src2[2] = nanf("");
+  src1[3] = 123.4f,   src2[3] = 123.4f;
+  src1[4] = 0.f,      src2[4] = 1.f;
+  src1[5] = -0.f,     src2[5] = -1.f;
+  for (int i = 6; i < n; ++i) {
+    src1[i] = (rand() & 255) * 0.1f - 12.8f;
+    src2[i] = (rand() & 255) * 0.1f - 12.8f;
+  }
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src1, n * sizeof(float));
+  memcpy(buf_data[1], src2, n * sizeof(float));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(2);
+  float *dest = (float *)buf_data[2];
+  if (0)
+    for (int i = 0; i < n; ++i)
+      printf("%d %x %x %x %x\n", i, as_int(src1[i]), as_int(src2[i]),
+             as_int(dest[i]), as_int(nextafterf(src1[i], src2[i])));
+  OCL_ASSERT(isnanf(dest[0]));
+  OCL_ASSERT(isnanf(dest[1]));
+  OCL_ASSERT(isnanf(dest[2]));
+  for (int i = 3; i < n; ++i)
+    OCL_ASSERT(dest[i] == nextafterf(src1[i], src2[i]));
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_nextafter);
diff --git a/utests/builtin_num_groups.cpp b/utests/builtin_num_groups.cpp
new file mode 100644
index 0000000..bbff435
--- /dev/null
+++ b/utests/builtin_num_groups.cpp
@@ -0,0 +1,85 @@
+/*
+According to the OpenCL v1.1 & v1.2 chapter 6.11, the behavior of function get_num_groups should be as following:
+
+  globals[0] = 1;
+  globals[1] = 4;
+  globals[2] = 9;
+  locals[0] = 1;
+  locals[1] = 2;
+  locals[2] = 3;
+
+#ifdef CL_VERSION_1_2 | CL_VERSION_1_1:
+get_num_groups(-1) = 1 (dimension:1)
+get_num_groups(0) = 1 (dimension:1)
+get_num_groups(1) = 1 (dimension:1)
+
+get_num_groups(-1) = 1 (dimension:2)
+get_num_groups(0) = 1 (dimension:2)
+get_num_groups(1) = 2 (dimension:2)
+get_num_groups(2) = 1 (dimension:2)
+
+get_num_groups(-1) = 1 (dimension:3)
+get_num_groups(0) = 1 (dimension:3)
+get_num_groups(1) = 2 (dimension:3)
+get_num_groups(2) = 3 (dimension:3)
+get_num_groups(3) = 1 (dimension:3)
+*/
+
+#define udebug 0
+#include "utest_helper.hpp"
+static void builtin_num_groups(void)
+{
+
+  // Setup kernel and buffers
+  int dim, dim_arg_global, num_groups, err;
+  OCL_CREATE_KERNEL("builtin_num_groups");
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  globals[0] = 1;
+  globals[1] = 4;
+  globals[2] = 9;
+  locals[0] = 1;
+  locals[1] = 2;
+  locals[2] = 3;
+
+  for( dim=1; dim <= 3; dim++ )
+  {
+
+    for( dim_arg_global = -1; dim_arg_global <= dim + 1; dim_arg_global++ )
+    {
+
+      err = clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, sizeof(int), &dim_arg_global, 0, NULL, NULL);
+      if (err != CL_SUCCESS)
+      {
+        printf("Error: Failed to write to source array!\n");
+        exit(1);
+      }
+
+      // Run the kernel
+      OCL_NDRANGE( dim );
+
+      err = clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(int), &num_groups, 0, NULL, NULL);
+      if (err != CL_SUCCESS)
+      {
+        printf("Error: Failed to read output array! %d\n", err);
+        exit(1);
+      }
+
+#if udebug
+      printf("get_num_groups(%d) = %d (dimension:%d)\n", dim_arg_global, num_groups, dim);
+#endif
+      if ( dim_arg_global >= 0 && dim_arg_global < dim)
+        OCL_ASSERT( num_groups == dim_arg_global + 1 );
+      else
+      {
+        OCL_ASSERT( num_groups == 1);
+      }
+    }
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_num_groups);
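
A small host-side restatement of the expectation checked above (illustration only): the number of work-groups per dimension is globals[d] / locals[d], i.e. {1, 2, 3} for this test, and get_num_groups returns 1 for an out-of-range dimension index under the CL 1.1/1.2 behaviour noted in the comment:

    #include <cassert>

    int main() {
      const int globals[3] = {1, 4, 9}, locals[3] = {1, 2, 3};
      for (int d = 0; d < 3; ++d)
        assert(globals[d] / locals[d] == d + 1);  // matches num_groups == dim_arg_global + 1
      return 0;
    }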
diff --git a/utests/builtin_pow.cpp b/utests/builtin_pow.cpp
new file mode 100644
index 0000000..8ed17ed
--- /dev/null
+++ b/utests/builtin_pow.cpp
@@ -0,0 +1,92 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+#define udebug 0
+#define printf_c(...) \
+{\
+  printf("\033[1m\033[40;31m");\
+  printf( __VA_ARGS__ );\
+  printf("\033[0m");\
+}
+const float ori_data[] = {-20.5, -1, -0.9, -0.01, 0, 0.01, 0.9, 1.0, 20.5};
+const int count_input_ori = sizeof(ori_data) / sizeof(ori_data[0]);
+const int count_input = count_input_ori * count_input_ori;
+
+float input_data1[count_input];
+float input_data2[count_input];
+const int max_function = 1;
+
+static void cpu_compiler_math(const float *src1, const float *src2, float *dst)
+{
+  dst[0] = powf(src1[0], src2[0]);
+//  dst[1] = src1[0];
+}
+
+static void builtin_pow(void)
+{
+  // Setup kernel and buffers
+  int k, i, index_cur;
+  float gpu_data[max_function * count_input] = {0}, cpu_data[max_function * count_input] = {0};
+
+  for(i=0; i<count_input_ori;i++)
+    for(k=0; k<count_input_ori;k++)
+    {
+      input_data1[i*count_input_ori+k] = ori_data[i];
+      input_data2[i*count_input_ori+k] = ori_data[k];
+    }
+
+  OCL_CREATE_KERNEL("builtin_pow");
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, count_input * max_function * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_READ_WRITE, count_input * max_function * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], CL_MEM_READ_WRITE, count_input * max_function * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[3], CL_MEM_READ_WRITE, sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+
+  globals[0] = count_input;
+  locals[0] = 1;
+
+  clEnqueueWriteBuffer( queue, buf[1], CL_TRUE, 0, count_input * sizeof(float), input_data1, 0, NULL, NULL);
+  clEnqueueWriteBuffer( queue, buf[2], CL_TRUE, 0, count_input * sizeof(float), input_data2, 0, NULL, NULL);
+  clEnqueueWriteBuffer( queue, buf[3], CL_TRUE, 0, sizeof(int), &max_function, 0, NULL, NULL);
+
+   // Run the kernel
+  OCL_NDRANGE( 1 );
+
+  clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(float) * max_function * count_input, gpu_data, 0, NULL, NULL);
+
+  for (k = 0; (uint)k < count_input; k++)
+  {
+    cpu_compiler_math( input_data1 + k, input_data2 + k, cpu_data + k * max_function);
+
+    for (i = 0; i < max_function; i++)
+    {
+      index_cur = k * max_function + i;
+#if udebug
+      if ( (isinf(cpu_data[index_cur]) && !isinf(gpu_data[index_cur])) ||
+           (isnan(cpu_data[index_cur]) && !isnan(gpu_data[index_cur])) ||
+           (fabs(gpu_data[index_cur] - cpu_data[index_cur]) > 1e-5f)   )
+      {
+        printf_c("%d/%d: x:%f, y:%f -> gpu:%f  cpu:%f\n", k, i, input_data1[k], input_data2[k], gpu_data[index_cur], cpu_data[index_cur]);
+      }
+      else
+        printf("%d/%d: x:%f, y:%f -> gpu:%f  cpu:%f\n", k, i, input_data1[k], input_data2[k], gpu_data[index_cur], cpu_data[index_cur]);
+#else
+     if (isinf(cpu_data[index_cur]))
+       OCL_ASSERT(isinf(gpu_data[index_cur]));
+     else if (isnan(cpu_data[index_cur]))
+       OCL_ASSERT(isnan(gpu_data[index_cur]));
+     else
+     {
+       OCL_ASSERT(fabs(gpu_data[index_cur] - cpu_data[index_cur]) < 1e-3f);
+     }
+#endif
+    }
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(builtin_pow)
diff --git a/utests/builtin_remquo.cpp b/utests/builtin_remquo.cpp
new file mode 100644
index 0000000..f67be12
--- /dev/null
+++ b/utests/builtin_remquo.cpp
@@ -0,0 +1,65 @@
+#include <cmath>
+#include <cstring>
+#include "utest_helper.hpp"
+
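+// builtin_remquo: feeds remquo() a set of edge cases (zero and negative-zero
+// divisors, infinities, NaN) plus a few finite pairs and checks the returned
+// remainder and quotient against hand-computed values.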
+void builtin_remquo(void)
+{
+  const int n = 16;
+  float src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_remquo");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  src1[0] = 1,         src2[0] = 0;
+  src1[1] = 1,         src2[1] = -0.f;
+  src1[2] = INFINITY,  src2[2] = 1;
+  src1[3] = -INFINITY, src2[3] = 1;
+  src1[4] = nanf(""),  src2[4] = nanf("");
+  src1[5] = 1.625f,    src2[5] = 1;
+  src1[6] = -1.625f,   src2[6] = 1;
+  src1[7] = 1.625f,    src2[7] = -1;
+  src1[8] = -1.625f,   src2[8] = -1;
+  src1[9] = 5,         src2[9] = 2;
+  src1[10] = 3,        src2[10] = 2;
+  src1[11] = -0.f,     src2[11] = 1;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src1, n * sizeof(float));
+  memcpy(buf_data[1], src2, n * sizeof(float));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(2);
+  OCL_MAP_BUFFER(3);
+  float *dest = (float *)buf_data[2];
+  int *quo = (int *)buf_data[3];
+  OCL_ASSERT(isnanf(dest[0]));
+  OCL_ASSERT(isnanf(dest[1]));
+  OCL_ASSERT(isnanf(dest[2]));
+  OCL_ASSERT(isnanf(dest[3]));
+  OCL_ASSERT(isnanf(dest[4]));
+  OCL_ASSERT(dest[5] == -0.375f && quo[5] ==  2);
+  OCL_ASSERT(dest[6] ==  0.375f && quo[6] == -2);
+  OCL_ASSERT(dest[7] == -0.375f && quo[7] == -2);
+  OCL_ASSERT(dest[8] ==  0.375f && quo[8] ==  2);
+  OCL_ASSERT(dest[9] == 1       && quo[9] ==  2);
+  OCL_ASSERT(dest[10] == -1     && quo[10] == 2);
+  OCL_ASSERT(dest[11] == -0.f   && quo[11] == 0);
+  OCL_UNMAP_BUFFER(2);
+  OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_remquo);
diff --git a/utests/builtin_shuffle.cpp b/utests/builtin_shuffle.cpp
new file mode 100644
index 0000000..c7fa86b
--- /dev/null
+++ b/utests/builtin_shuffle.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
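+// builtin_shuffle: after the kernel's shuffle() runs, the host checks that
+// output buffer 3 equals input buffer 0 and output buffer 2 equals input
+// buffer 1, i.e. the two random input vectors come back swapped.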
+void builtin_shuffle(void)
+{
+  const int n = 32;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_shuffle");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; i ++) {
+    ((float *)(buf_data[0]))[i] = rand();
+    ((float *)(buf_data[1]))[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  OCL_MAP_BUFFER(3);
+  for (int i = 0; i < n; i ++) {
+    OCL_ASSERT(((float *)(buf_data[0]))[i] == ((float *)(buf_data[3]))[i]);
+    OCL_ASSERT(((float *)(buf_data[1]))[i] == ((float *)(buf_data[2]))[i]);
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+  OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_shuffle);
diff --git a/utests/builtin_shuffle2.cpp b/utests/builtin_shuffle2.cpp
new file mode 100644
index 0000000..7a9ebd1
--- /dev/null
+++ b/utests/builtin_shuffle2.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
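+// builtin_shuffle2: the host checks that each element of output buffer 3 is
+// twice the corresponding element of input buffer 0, and each element of
+// output buffer 2 is twice the corresponding element of input buffer 1.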
+void builtin_shuffle2(void)
+{
+  const int n = 32;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_shuffle2");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; i ++) {
+    ((float *)(buf_data[0]))[i] = (rand() & 15) * 0.1f;
+    ((float *)(buf_data[1]))[i] = (rand() & 15) * 0.1f;
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  OCL_MAP_BUFFER(3);
+  for (int i = 0; i < n; i ++) {
+    OCL_ASSERT(2 * ((float *)(buf_data[0]))[i] == ((float *)(buf_data[3]))[i]);
+    OCL_ASSERT(2 * ((float *)(buf_data[1]))[i] == ((float *)(buf_data[2]))[i]);
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+  OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_shuffle2);
diff --git a/utests/builtin_sign.cpp b/utests/builtin_sign.cpp
new file mode 100644
index 0000000..426de36
--- /dev/null
+++ b/utests/builtin_sign.cpp
@@ -0,0 +1,47 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
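+// builtin_sign: checks sign() semantics on the device: sign(NaN) == 0,
+// sign(+inf) == 1, sign(+/-0) == +/-0, and +/-1 for other finite values.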
+void builtin_sign(void)
+{
+  const int n = 32;
+  float src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_sign");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  src[0] = ((float*)buf_data[0])[0] = nanf("");
+  src[1] = ((float*)buf_data[0])[1] = INFINITY;
+  src[2] = ((float*)buf_data[0])[2] = 0.f;
+  src[3] = ((float*)buf_data[0])[3] = -0.f;
+  for (int i = 4; i < n; ++i) {
+    src[i] = ((float*)buf_data[0])[i] = (rand() & 15) * 0.1 - 0.75;
+  }
+  OCL_UNMAP_BUFFER(0);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(1);
+  float *dst = (float*)buf_data[1];
+  OCL_ASSERT(dst[0] == 0);
+  OCL_ASSERT(dst[1] == 1.f);
+  OCL_ASSERT(dst[2] == 0.f);
+  OCL_ASSERT(dst[3] == -0.f);
+  for (int i = 4; i < n; ++i) {
+    if (src[i] == 0.f)
+      OCL_ASSERT(dst[i] == 0.f);
+    else if (src[i] == -0.f)
+      OCL_ASSERT(dst[i] == -0.f);
+    else
+      OCL_ASSERT(dst[i] == (src[i] > 0 ? 1 : -1));
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_sign);
diff --git a/utests/builtin_sinpi.cpp b/utests/builtin_sinpi.cpp
new file mode 100644
index 0000000..0e11a0d
--- /dev/null
+++ b/utests/builtin_sinpi.cpp
@@ -0,0 +1,104 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+static int as_int(float x) {
+  union {float f; int i;} u;
+  u.f = x;
+  return u.i;
+}
+
+static float sinpi(float x) {
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  float y, z;
+  int n = 0, ix;
+  const float pi = 3.1415927410e+00f;
+
+  ix = as_int(x) & 0x7fffffff;
+
+  if (ix < 0x3e800000)
+    return sinf(pi * x);
+  y = -x;
+  z = floorf(y);
+  if (z != y) {
+    y *= 0.5f;
+    y = 2.f * (y - floorf(y));
+    n = y * 4.f;
+  } else {
+    if (ix >= 0x4b800000) {
+      y = 0;
+      n = 0;
+    } else {
+      if (ix < 0x4b000000)
+        z = y + 8.3886080000e+06f;
+      int n = as_int(z);
+      n &= 1;
+      y = n;
+      n <<= 2;
+    }
+  }
+  switch (n) {
+  case 0:
+    y = sinf(pi * y);
+    break;
+  case 1:
+  case 2:
+    y = cosf(pi * ((float) 0.5 - y));
+    break;
+  case 3:
+  case 4:
+    y = sinf(pi * (1.f - y));
+    break;
+  case 5:
+  case 6:
+    y = -cosf(pi * (y - (float) 1.5));
+    break;
+  default:
+    y = sinf(pi * (y - (float) 2.0));
+    break;
+  }
+  return -y;
+}
+
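+// builtin_sinpi: compares the device sinpi() against the host reference above
+// over a sweep of non-negative inputs, allowing a 1e-4 absolute error.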
+void builtin_sinpi(void)
+{
+  const int n = 1024;
+  float src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_sinpi");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int j = 0; j < 1000; j ++) {
+    OCL_MAP_BUFFER(0);
+    for (int i = 0; i < n; ++i) {
+      src[i] = ((float*)buf_data[0])[i] = (j*n + i) * 0.01f;
+    }
+    OCL_UNMAP_BUFFER(0);
+
+    OCL_NDRANGE(1);
+
+    OCL_MAP_BUFFER(1);
+    float *dst = (float*)buf_data[1];
+    for (int i = 0; i < n; ++i) {
+      float cpu = sinpi(src[i]);
+      OCL_ASSERT (fabsf(cpu - dst[i]) < 1e-4);
+    }
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_sinpi);
diff --git a/utests/builtin_tgamma.cpp b/utests/builtin_tgamma.cpp
new file mode 100644
index 0000000..4c824d0
--- /dev/null
+++ b/utests/builtin_tgamma.cpp
@@ -0,0 +1,42 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
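+// builtin_tgamma: compares the device tgamma() against the host gammaf()
+// reference; infinities must match, finite values must agree within 1e-3.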
+void builtin_tgamma(void)
+{
+  const int n = 1024;
+  float src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("builtin_tgamma");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  for (int j = 0; j < 1024; j ++) {
+    OCL_MAP_BUFFER(0);
+    for (int i = 0; i < n; ++i) {
+      src[i] = ((float*)buf_data[0])[i] = (j*n+i+1) * 0.001f;
+    }
+    OCL_UNMAP_BUFFER(0);
+
+    OCL_NDRANGE(1);
+
+    OCL_MAP_BUFFER(1);
+    float *dst = (float*)buf_data[1];
+    for (int i = 0; i < n; ++i) {
+      float cpu = gammaf(src[i]);
+      if (isinf(cpu)) {
+        OCL_ASSERT(isinf(dst[i]));
+      } else if (fabsf(cpu - dst[i]) >= 1e-3) {
+        printf("%f %f %f\n", src[i], cpu, dst[i]);
+        OCL_ASSERT(0);
+      }
+    }
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_tgamma);
diff --git a/utests/cl_create_kernel.cpp b/utests/cl_create_kernel.cpp
new file mode 100644
index 0000000..36a7c38
--- /dev/null
+++ b/utests/cl_create_kernel.cpp
@@ -0,0 +1,16 @@
+#include "utest_helper.hpp"
+
+static void test_create_kernel(void)
+{
+  cl_ulong max_mem_size;
+  cl_int status;
+
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_mem_size), &max_mem_size, NULL);
+  OCL_ASSERT(max_mem_size < (cl_ulong)-1);
+  // Increment the size so that the following clCreateBuffer() call fails with CL_INVALID_BUFFER_SIZE.
+  ++max_mem_size;
+  buf[0] = clCreateBuffer(ctx, 0, max_mem_size, NULL, &status);
+  OCL_ASSERT(status == CL_INVALID_BUFFER_SIZE);
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_create_kernel);
diff --git a/utests/compare_image_2d_and_1d_array.cpp b/utests/compare_image_2d_and_1d_array.cpp
new file mode 100644
index 0000000..f2c828e
--- /dev/null
+++ b/utests/compare_image_2d_and_1d_array.cpp
@@ -0,0 +1,79 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compare_image_2d_and_1d_array(void)
+{
+  const int w = 64;
+  const int h = 32;
+  cl_image_format format;
+  cl_image_desc desc;
+  cl_sampler sampler;
+
+  // Create the 1D array buffer.
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  uint32_t* image_data1 = (uint32_t *)malloc(w * h * sizeof(uint32_t));
+  uint32_t* image_data2 = (uint32_t *)malloc(w * h * sizeof(uint32_t));
+  for (int j = 0; j < h; j++) {
+    for (int i = 0; i < w; i++) {
+      char a = 0;
+      if (j % 2 == 0)
+        a = (j + 3) & 0x3f;
+
+      image_data2[w * j + i] = image_data1[w * j + i] = a << 24 | a << 16 | a << 8 | a;
+    }
+  }
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = w * sizeof(uint32_t);
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, image_data1);
+
+  // Create the 2D array buffer.
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY;
+  desc.image_width = w;
+  desc.image_array_size = h;
+  desc.image_row_pitch = w * sizeof(uint32_t);
+  OCL_CREATE_IMAGE(buf[1], CL_MEM_COPY_HOST_PTR, &format, &desc, image_data2);
+
+  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_LINEAR);
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("compare_image_2d_and_1d_array");
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_sampler), &sampler);
+  globals[0] = 32;
+  globals[1] = 16;
+  locals[0] = 32;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  OCL_MAP_BUFFER_GTT(0);
+  OCL_MAP_BUFFER_GTT(1);
+  for (int j = 0; j < h; ++j) {
+    for (int i = 0; i < w; i++) {
+      // Because the array index does not take part in the sampling calculation,
+      // the 2D image and the 1D image array can give different results; only the
+      // even rows, which must match, are checked here.
+      if (j % 2 == 0)
+        OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == ((uint32_t*)buf_data[1])[j * w + i]);
+    }
+  }
+  OCL_UNMAP_BUFFER_GTT(0);
+  OCL_UNMAP_BUFFER_GTT(1);
+
+  OCL_CALL(clReleaseSampler, sampler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compare_image_2d_and_1d_array);
diff --git a/utests/compiler_abs.cpp b/utests/compiler_abs.cpp
new file mode 100644
index 0000000..3f477a8
--- /dev/null
+++ b/utests/compiler_abs.cpp
@@ -0,0 +1,254 @@
+#include "utest_helper.hpp"
+#include "string.h"
+
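+// compiler_abs: runs the abs() kernel over random scalar and vector operands
+// of every integer type and compares the result with the host reference below.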
+template <typename T, int N>
+struct cl_vec {
+    T ptr[((N+1)/2)*2]; //align to 2 elements.
+
+    typedef cl_vec<T, N> vec_type;
+
+    cl_vec(void) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+    }
+    cl_vec(vec_type & other) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
+    }
+
+    vec_type& operator= (vec_type & other) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
+        return *this;
+    }
+
+    template <typename U> vec_type& operator= (cl_vec<U, N> & other) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
+        return *this;
+    }
+
+    bool operator== (vec_type & other) {
+        return !memcmp (this->ptr, other.ptr, sizeof(T) * N);
+    }
+
+    void abs(void) {
+        int i = 0;
+        for (; i < N; i++) {
+            T f = ptr[i];
+            f = f < 0 ? -f : f;
+            ptr[i] = f;
+        }
+    }
+};
+
+template <typename T, typename U, int N> static void cpu (int global_id,
+        cl_vec<T, N> *src, cl_vec<U, N> *dst)
+{
+    cl_vec<T, N> v  = src[global_id];
+    v.abs();
+    dst[global_id] = v;
+}
+
+template <typename T, typename U> static void cpu(int global_id, T *src, U *dst)
+{
+    T f = src[global_id];
+    f = f < 0 ? -f : f;
+    dst[global_id] = (U)f;
+}
+
+template <typename T, int N> static void gen_rand_val (cl_vec<T, N>& vect)
+{
+    int i = 0;
+
+    memset(vect.ptr, 0, sizeof(T) * ((N+1)/2)*2);
+    for (; i < N; i++) {
+        vect.ptr[i] = static_cast<T>((rand() & 63) - 32);
+    }
+}
+
+template <typename T> static void gen_rand_val (T & val)
+{
+    val = static_cast<T>((rand() & 63) - 32);
+}
+
+template <typename T>
+inline static void print_data (T& val)
+{
+    if (std::is_unsigned<T>::value)
+        printf(" %u", val);
+    else
+        printf(" %d", val);
+}
+
+template <typename T, typename U, int N> static void dump_data (cl_vec<T, N>* src,
+        cl_vec<U, N>* dst, int n)
+{
+    U* val = reinterpret_cast<U *>(dst);
+
+    n = n*((N+1)/2)*2;
+
+    printf("\nRaw: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[0])[i]);
+    }
+
+    printf("\nCPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(val[i]);
+    }
+    printf("\nGPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((U *)buf_data[1])[i]);
+    }
+}
+
+template <typename T, typename U> static void dump_data (T* src, U* dst, int n)
+{
+    printf("\nRaw: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[0])[i]);
+    }
+
+    printf("\nCPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(dst[i]);
+    }
+    printf("\nGPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((U *)buf_data[1])[i]);
+    }
+}
+
+template <typename T, typename U> static void compiler_abs_with_type(void)
+{
+    const size_t n = 16;
+    U cpu_dst[16];
+    T cpu_src[16];
+
+    // Setup buffers
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+    globals[0] = 16;
+    locals[0] = 16;
+
+    // Run random tests
+    for (uint32_t pass = 0; pass < 8; ++pass) {
+        OCL_MAP_BUFFER(0);
+
+        /* Clear the dst buffer to avoid random data. */
+        OCL_MAP_BUFFER(1);
+        memset(buf_data[1], 0, sizeof(U) * n);
+        OCL_UNMAP_BUFFER(1);
+
+        for (int32_t i = 0; i < (int32_t) n; ++i) {
+            gen_rand_val(cpu_src[i]);
+        }
+
+        memcpy(buf_data[0], cpu_src, sizeof(T) * n);
+
+        // Run the kernel on GPU
+        OCL_NDRANGE(1);
+
+        // Run on CPU
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            cpu(i, cpu_src, cpu_dst);
+
+        // Compare
+        OCL_MAP_BUFFER(1);
+
+//      dump_data(cpu_src, cpu_dst, n);
+
+        OCL_ASSERT(!memcmp(buf_data[1], cpu_dst, sizeof(T) * n));
+        OCL_UNMAP_BUFFER(1);
+        OCL_UNMAP_BUFFER(0);
+    }
+}
+
+#define ABS_TEST_TYPE_1(TYPE, UTYPE, KEEP_PROGRAM) \
+	static void compiler_abs_##TYPE (void) \
+        { \
+           OCL_CALL (cl_kernel_init, "compiler_abs.cl", "compiler_abs_"#TYPE, SOURCE, NULL);  \
+           compiler_abs_with_type<TYPE, UTYPE>(); \
+        } \
+	MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_abs_##TYPE, KEEP_PROGRAM);
+
+#define ABS_TEST_TYPE(TYPE, UTYPE) ABS_TEST_TYPE_1(TYPE, UTYPE, true)
+#define ABS_TEST_TYPE_END(TYPE, UTYPE) ABS_TEST_TYPE_1(TYPE, UTYPE, false)
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+ABS_TEST_TYPE(int, uint)
+ABS_TEST_TYPE(short, ushort)
+ABS_TEST_TYPE(char, uchar)
+ABS_TEST_TYPE(uint, uint)
+ABS_TEST_TYPE(ushort, ushort)
+ABS_TEST_TYPE(uchar, uchar)
+
+
+typedef cl_vec<int, 2> int2;
+typedef cl_vec<int, 3> int3;
+typedef cl_vec<int, 4> int4;
+typedef cl_vec<int, 8> int8;
+typedef cl_vec<int, 16> int16;
+typedef cl_vec<unsigned int, 2> uint2;
+typedef cl_vec<unsigned int, 3> uint3;
+typedef cl_vec<unsigned int, 4> uint4;
+typedef cl_vec<unsigned int, 8> uint8;
+typedef cl_vec<unsigned int, 16> uint16;
+ABS_TEST_TYPE(int2, uint2)
+ABS_TEST_TYPE(int3, uint3)
+ABS_TEST_TYPE(int4, uint4)
+ABS_TEST_TYPE(int8, uint8)
+ABS_TEST_TYPE(int16, uint16)
+ABS_TEST_TYPE(uint2, uint2)
+ABS_TEST_TYPE(uint3, uint3)
+ABS_TEST_TYPE(uint4, uint4)
+ABS_TEST_TYPE(uint8, uint8)
+ABS_TEST_TYPE(uint16, uint16)
+
+
+typedef cl_vec<char, 2> char2;
+typedef cl_vec<char, 3> char3;
+typedef cl_vec<char, 4> char4;
+typedef cl_vec<char, 8> char8;
+typedef cl_vec<char, 16> char16;
+typedef cl_vec<unsigned char, 2> uchar2;
+typedef cl_vec<unsigned char, 3> uchar3;
+typedef cl_vec<unsigned char, 4> uchar4;
+typedef cl_vec<unsigned char, 8> uchar8;
+typedef cl_vec<unsigned char, 16> uchar16;
+ABS_TEST_TYPE(char2, uchar2)
+ABS_TEST_TYPE(char3, uchar3)
+ABS_TEST_TYPE(char4, uchar4)
+ABS_TEST_TYPE(char8, uchar8)
+ABS_TEST_TYPE(char16, uchar16)
+ABS_TEST_TYPE(uchar2, uchar2)
+ABS_TEST_TYPE(uchar3, uchar3)
+ABS_TEST_TYPE(uchar4, uchar4)
+ABS_TEST_TYPE(uchar8, uchar8)
+ABS_TEST_TYPE(uchar16, uchar16)
+
+
+typedef cl_vec<short, 2> short2;
+typedef cl_vec<short, 3> short3;
+typedef cl_vec<short, 4> short4;
+typedef cl_vec<short, 8> short8;
+typedef cl_vec<short, 16> short16;
+typedef cl_vec<unsigned short, 2> ushort2;
+typedef cl_vec<unsigned short, 3> ushort3;
+typedef cl_vec<unsigned short, 4> ushort4;
+typedef cl_vec<unsigned short, 8> ushort8;
+typedef cl_vec<unsigned short, 16> ushort16;
+ABS_TEST_TYPE(short2, ushort2)
+ABS_TEST_TYPE(short3, ushort3)
+ABS_TEST_TYPE(short4, ushort4)
+ABS_TEST_TYPE(short8, ushort8)
+ABS_TEST_TYPE(short16, ushort16)
+ABS_TEST_TYPE(ushort2, ushort2)
+ABS_TEST_TYPE(ushort3, ushort3)
+ABS_TEST_TYPE(ushort4, ushort4)
+ABS_TEST_TYPE(ushort8, ushort8)
+ABS_TEST_TYPE_END(ushort16, ushort16)
diff --git a/utests/compiler_abs_diff.cpp b/utests/compiler_abs_diff.cpp
new file mode 100644
index 0000000..15a1f90
--- /dev/null
+++ b/utests/compiler_abs_diff.cpp
@@ -0,0 +1,295 @@
+#include "utest_helper.hpp"
+#include "string.h"
+
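+// compiler_abs_diff: runs the abs_diff() kernel over random scalar and vector
+// operands of every integer type and compares the result with the host
+// reference below.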
+template <typename T, int N>
+struct cl_vec {
+    T ptr[((N+1)/2)*2]; //align to 2 elements.
+
+    typedef cl_vec<T, N> vec_type;
+
+    cl_vec(void) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+    }
+    cl_vec(vec_type & other) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
+    }
+
+    vec_type& operator= (vec_type & other) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
+        return *this;
+    }
+
+    template <typename U> vec_type& operator= (cl_vec<U, N> & other) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
+        return *this;
+    }
+
+    bool operator== (vec_type & other) {
+        return !memcmp (this->ptr, other.ptr, sizeof(T) * N);
+    }
+
+    void abs_diff(vec_type & other) {
+        int i = 0;
+        for (; i < N; i++) {
+            T a = ptr[i];
+            T b = other.ptr[i];
+            T f = a > b ? (a - b) : (b - a);
+            ptr[i] = f;
+        }
+    }
+};
+
+template <typename T, typename U, int N> static void cpu (int global_id,
+        cl_vec<T, N> *x, cl_vec<T, N> *y, cl_vec<U, N> *diff)
+{
+    cl_vec<T, N> v  = x[global_id];
+    v.abs_diff(y[global_id]);
+    diff[global_id] = v;
+}
+
+template <typename T, typename U> static void cpu(int global_id, T *x, T *y, U *diff)
+{
+    T a = x[global_id];
+    T b = y[global_id];
+    U f = a > b ? (a - b) : (b - a);
+    diff[global_id] = f;
+}
+
+template <typename T, int N> static void gen_rand_val (cl_vec<T, N>& vect)
+{
+    int i = 0;
+    for (; i < N; i++) {
+        vect.ptr[i] = static_cast<T>((rand() & 63) - 32);
+    }
+}
+
+template <typename T> static void gen_rand_val (T & val)
+{
+    val = static_cast<T>((rand() & 63) - 32);
+}
+
+template <typename T>
+inline static void print_data (T& val)
+{
+    if (std::is_unsigned<T>::value)
+        printf(" %u", val);
+    else
+        printf(" %d", val);
+}
+
+template <typename T, typename U, int N> static void dump_data (cl_vec<T, N>* x,
+        cl_vec<T, N>* y, cl_vec<U, N>* diff, int n)
+{
+    U* val = reinterpret_cast<U *>(diff);
+
+    n = n*((N+1)/2)*2;
+
+    printf("\nRaw x: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[0])[i]);
+    }
+    printf("\nRaw y: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[1])[i]);
+    }
+
+    printf("\nCPU diff: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(val[i]);
+    }
+    printf("\nGPU diff: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((U *)buf_data[2])[i]);
+    }
+}
+
+template <typename T, typename U> static void dump_data (T* x, T* y, U* diff, int n)
+{
+    printf("\nRaw x: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[0])[i]);
+    }
+    printf("\nRaw y: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[1])[i]);
+    }
+
+    printf("\nCPU diff: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(diff[i]);
+    }
+    printf("\nGPU diff: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((U *)buf_data[2])[i]);
+    }
+}
+
+template <typename T, typename U> static void compiler_abs_diff_with_type(void)
+{
+    const size_t n = 16;
+    U cpu_diff[16];
+    T cpu_x[16];
+    T cpu_y[16];
+
+    // Setup buffers
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+    OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(U), NULL);
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+    OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+    globals[0] = 16;
+    locals[0] = 16;
+
+    // Run random tests
+    for (uint32_t pass = 0; pass < 8; ++pass) {
+        OCL_MAP_BUFFER(0);
+        OCL_MAP_BUFFER(1);
+
+        /* Clear the dst buffer to avoid random data. */
+        OCL_MAP_BUFFER(2);
+        memset(buf_data[2], 0, sizeof(U) * n);
+        OCL_UNMAP_BUFFER(2);
+
+        for (int32_t i = 0; i < (int32_t) n; ++i) {
+            gen_rand_val(cpu_x[i]);
+            gen_rand_val(cpu_y[i]);
+        }
+
+        memcpy(buf_data[0], cpu_x, sizeof(T) * n);
+        memcpy(buf_data[1], cpu_y, sizeof(T) * n);
+
+        // Run the kernel on GPU
+        OCL_NDRANGE(1);
+
+        // Run on CPU
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            cpu(i, cpu_x, cpu_y, cpu_diff);
+
+        // Compare
+        OCL_MAP_BUFFER(2);
+
+//      dump_data(cpu_x, cpu_y, cpu_diff, n);
+
+        OCL_ASSERT(!memcmp(buf_data[2], cpu_diff, sizeof(T) * n));
+
+        OCL_UNMAP_BUFFER(0);
+        OCL_UNMAP_BUFFER(1);
+        OCL_UNMAP_BUFFER(2);
+    }
+}
+
+
+#define ABS_TEST_DIFF_TYPE_2(TYPE, CLTYPE, UTYPE, KEEP_PROGRAM) \
+	static void compiler_abs_diff_##CLTYPE (void) \
+        { \
+           OCL_CALL (cl_kernel_init, "compiler_abs_diff.cl", "compiler_abs_diff_"#CLTYPE, SOURCE, NULL);  \
+           compiler_abs_diff_with_type<TYPE, UTYPE>(); \
+        } \
+	MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_abs_diff_##CLTYPE, KEEP_PROGRAM);
+
+#define ABS_TEST_DIFF_TYPE(TYPE, UTYPE) ABS_TEST_DIFF_TYPE_2(TYPE, TYPE, UTYPE, true)
+
+#define ABS_TEST_DIFF_TYPE_END(TYPE, UTYPE) ABS_TEST_DIFF_TYPE_2(TYPE, TYPE, UTYPE, false)
+
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+typedef uint64_t ulong64;
+ABS_TEST_DIFF_TYPE(int, uint)
+ABS_TEST_DIFF_TYPE_2(int64_t, long, ulong64, true)
+ABS_TEST_DIFF_TYPE(short, ushort)
+ABS_TEST_DIFF_TYPE(char, uchar)
+ABS_TEST_DIFF_TYPE(uint, uint)
+ABS_TEST_DIFF_TYPE_2(ulong64, ulong, ulong64, true)
+ABS_TEST_DIFF_TYPE(ushort, ushort)
+ABS_TEST_DIFF_TYPE(uchar, uchar)
+
+typedef cl_vec<int, 2> int2;
+typedef cl_vec<int, 3> int3;
+typedef cl_vec<int, 4> int4;
+typedef cl_vec<int, 8> int8;
+typedef cl_vec<int, 16> int16;
+typedef cl_vec<unsigned int, 2> uint2;
+typedef cl_vec<unsigned int, 3> uint3;
+typedef cl_vec<unsigned int, 4> uint4;
+typedef cl_vec<unsigned int, 8> uint8;
+typedef cl_vec<unsigned int, 16> uint16;
+ABS_TEST_DIFF_TYPE(int2, uint2)
+ABS_TEST_DIFF_TYPE(int3, uint3)
+ABS_TEST_DIFF_TYPE(int4, uint4)
+ABS_TEST_DIFF_TYPE(int8, uint8)
+ABS_TEST_DIFF_TYPE(int16, uint16)
+ABS_TEST_DIFF_TYPE(uint2, uint2)
+ABS_TEST_DIFF_TYPE(uint3, uint3)
+ABS_TEST_DIFF_TYPE(uint4, uint4)
+ABS_TEST_DIFF_TYPE(uint8, uint8)
+ABS_TEST_DIFF_TYPE(uint16, uint16)
+
+typedef cl_vec<int64_t, 2> long2;
+typedef cl_vec<int64_t, 3> long3;
+typedef cl_vec<int64_t, 4> long4;
+typedef cl_vec<int64_t, 8> long8;
+typedef cl_vec<int64_t, 16> long16;
+typedef cl_vec<uint64_t, 2> ulong2;
+typedef cl_vec<uint64_t, 3> ulong3;
+typedef cl_vec<uint64_t, 4> ulong4;
+typedef cl_vec<uint64_t, 8> ulong8;
+typedef cl_vec<uint64_t, 16> ulong16;
+ABS_TEST_DIFF_TYPE(long2, ulong2)
+ABS_TEST_DIFF_TYPE(long3, ulong3)
+ABS_TEST_DIFF_TYPE(long4, ulong4)
+ABS_TEST_DIFF_TYPE(long8, ulong8)
+ABS_TEST_DIFF_TYPE(long16, ulong16)
+ABS_TEST_DIFF_TYPE(ulong2, ulong2)
+ABS_TEST_DIFF_TYPE(ulong3, ulong3)
+ABS_TEST_DIFF_TYPE(ulong4, ulong4)
+ABS_TEST_DIFF_TYPE(ulong8, ulong8)
+ABS_TEST_DIFF_TYPE(ulong16, ulong16)
+
+typedef cl_vec<char, 2> char2;
+typedef cl_vec<char, 3> char3;
+typedef cl_vec<char, 4> char4;
+typedef cl_vec<char, 8> char8;
+typedef cl_vec<char, 16> char16;
+typedef cl_vec<unsigned char, 2> uchar2;
+typedef cl_vec<unsigned char, 3> uchar3;
+typedef cl_vec<unsigned char, 4> uchar4;
+typedef cl_vec<unsigned char, 8> uchar8;
+typedef cl_vec<unsigned char, 16> uchar16;
+ABS_TEST_DIFF_TYPE(char2, uchar2)
+ABS_TEST_DIFF_TYPE(char3, uchar3)
+ABS_TEST_DIFF_TYPE(char4, uchar4)
+ABS_TEST_DIFF_TYPE(char8, uchar8)
+ABS_TEST_DIFF_TYPE(char16, uchar16)
+ABS_TEST_DIFF_TYPE(uchar2, uchar2)
+ABS_TEST_DIFF_TYPE(uchar3, uchar3)
+ABS_TEST_DIFF_TYPE(uchar4, uchar4)
+ABS_TEST_DIFF_TYPE(uchar8, uchar8)
+ABS_TEST_DIFF_TYPE(uchar16, uchar16)
+
+
+typedef cl_vec<short, 2> short2;
+typedef cl_vec<short, 3> short3;
+typedef cl_vec<short, 4> short4;
+typedef cl_vec<short, 8> short8;
+typedef cl_vec<short, 16> short16;
+typedef cl_vec<unsigned short, 2> ushort2;
+typedef cl_vec<unsigned short, 3> ushort3;
+typedef cl_vec<unsigned short, 4> ushort4;
+typedef cl_vec<unsigned short, 8> ushort8;
+typedef cl_vec<unsigned short, 16> ushort16;
+ABS_TEST_DIFF_TYPE(short2, ushort2)
+ABS_TEST_DIFF_TYPE(short3, ushort3)
+ABS_TEST_DIFF_TYPE(short4, ushort4)
+ABS_TEST_DIFF_TYPE(short8, ushort8)
+ABS_TEST_DIFF_TYPE(short16, ushort16)
+ABS_TEST_DIFF_TYPE(ushort2, ushort2)
+ABS_TEST_DIFF_TYPE(ushort3, ushort3)
+ABS_TEST_DIFF_TYPE(ushort4, ushort4)
+ABS_TEST_DIFF_TYPE(ushort8, ushort8)
+ABS_TEST_DIFF_TYPE_END(ushort16, ushort16)
diff --git a/utests/compiler_address_space.cpp b/utests/compiler_address_space.cpp
new file mode 100644
index 0000000..89c7a38
--- /dev/null
+++ b/utests/compiler_address_space.cpp
@@ -0,0 +1,10 @@
+#include "utest_helper.hpp"
+
+void compiler_address_space(void)
+{
+  OCL_CREATE_KERNEL("compiler_address_space");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_address_space);
+
+
diff --git a/utests/compiler_argument_structure.cpp b/utests/compiler_argument_structure.cpp
new file mode 100644
index 0000000..22464a5
--- /dev/null
+++ b/utests/compiler_argument_structure.cpp
@@ -0,0 +1,28 @@
+#include "utest_helper.hpp"
+
+struct hop { int x, y; };
+
+void compiler_argument_structure(void)
+{
+  const size_t n = 2048;
+  hop h = {3, 4};
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_argument_structure");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(hop), &h);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == 7);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_argument_structure);
+
diff --git a/utests/compiler_argument_structure_indirect.cpp b/utests/compiler_argument_structure_indirect.cpp
new file mode 100644
index 0000000..a4584d5
--- /dev/null
+++ b/utests/compiler_argument_structure_indirect.cpp
@@ -0,0 +1,29 @@
+#include "utest_helper.hpp"
+
+struct hop { int x[16]; };
+
+void compiler_argument_structure_indirect(void)
+{
+  const size_t n = 2048;
+  hop h;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_argument_structure_indirect");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  for (int i = 0; i < 16; ++i) h.x[i] = i;
+  OCL_SET_ARG(1, sizeof(hop), &h);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == 7);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_argument_structure_indirect);
+
diff --git a/utests/compiler_arith_shift_right.cpp b/utests/compiler_arith_shift_right.cpp
new file mode 100644
index 0000000..6485571
--- /dev/null
+++ b/utests/compiler_arith_shift_right.cpp
@@ -0,0 +1,43 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst) {
+  dst[global_id] = src[global_id] >> 24;
+}
+
+void compiler_arith_shift_right(void)
+{
+  const size_t n = 16;
+  int cpu_src[16];
+  int cpu_dst[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_arith_shift_right");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 8; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((int*)buf_data[0])[i] = 0x80000000 | rand();
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      OCL_ASSERT(((int *)buf_data[1])[i] == cpu_dst[i]);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_arith_shift_right);
diff --git a/utests/compiler_array.cpp b/utests/compiler_array.cpp
new file mode 100644
index 0000000..8806c99
--- /dev/null
+++ b/utests/compiler_array.cpp
@@ -0,0 +1,28 @@
+#include "utest_helper.hpp"
+
+void compiler_array(void)
+{
+  const size_t n = 16;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_array");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  // First control flow
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
+  OCL_UNMAP_BUFFER(0);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 16; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 3);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_array);
+
diff --git a/utests/compiler_array0.cpp b/utests/compiler_array0.cpp
new file mode 100644
index 0000000..7cf2bbb
--- /dev/null
+++ b/utests/compiler_array0.cpp
@@ -0,0 +1,54 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst) {
+  int i;
+  int final[16];
+  for (i = 0; i < 16; ++i) {
+    int array[16], j;
+    for (j = 0; j < 16; ++j)
+      array[j] = global_id;
+    for (j = 0; j < src[0]; ++j)
+      array[j] = 1+src[j];
+    final[i] = array[i];
+  }
+  dst[global_id] = final[global_id];
+}
+
+void compiler_array0(void)
+{
+  const size_t n = 16;
+  int cpu_dst[16], cpu_src[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_array0");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 8; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((int32_t*)buf_data[0])[i] = rand() % 16;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < 11; ++i)
+      OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst[i]);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_array0);
+
+
diff --git a/utests/compiler_array1.cpp b/utests/compiler_array1.cpp
new file mode 100644
index 0000000..fe1ecec
--- /dev/null
+++ b/utests/compiler_array1.cpp
@@ -0,0 +1,52 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst) {
+  int final[16];
+  for (int i = 0; i < 16; ++i) {
+    int array[16];
+    for (int j = 0; j < src[0]; ++j)
+      array[j] = 1+src[0];
+    for (int j = src[0]; j < 16; ++j)
+      array[j] = global_id;
+    final[i] = array[i];
+  }
+  dst[global_id] = final[global_id];
+}
+
+void compiler_array1(void)
+{
+  const size_t n = 16;
+  int cpu_dst[16], cpu_src[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_array1");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 8; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((int32_t*)buf_data[0])[i] = rand() % 16;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < 11; ++i)
+      OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst[i]);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_array1);
+
diff --git a/utests/compiler_array2.cpp b/utests/compiler_array2.cpp
new file mode 100644
index 0000000..61ca9da
--- /dev/null
+++ b/utests/compiler_array2.cpp
@@ -0,0 +1,50 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst) {
+  int final[16];
+  int array[16];
+  for (int j = 0; j < 16; ++j) array[j] = j;
+  for (int j = 0; j < 16; ++j) final[j] = j+1;
+  if (global_id == 15)
+    dst[global_id] = final[global_id];
+  else
+    dst[global_id] = array[15 - global_id];
+}
+
+void compiler_array2(void)
+{
+  const size_t n = 16;
+  int cpu_dst[16], cpu_src[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_array2");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 8; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((int32_t*)buf_data[0])[i] = rand() % 16;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < 11; ++i)
+      OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst[i]);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_array2);
+
diff --git a/utests/compiler_array3.cpp b/utests/compiler_array3.cpp
new file mode 100644
index 0000000..865b1e5
--- /dev/null
+++ b/utests/compiler_array3.cpp
@@ -0,0 +1,51 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst) {
+  int tmp[32];
+  for (int i = 0; i < 16; ++i) {
+    for (int j = 0; j < 16; ++j)
+      tmp[j] = global_id;
+    for (int j = 0; j < src[0]; ++j)
+      tmp[j] = 1+src[j];
+    tmp[16+i] = tmp[i];
+  }
+  dst[global_id] = tmp[16+global_id];
+}
+
+void compiler_array3(void)
+{
+  const size_t n = 16;
+  int cpu_dst[16], cpu_src[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_array3");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 8; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((int32_t*)buf_data[0])[i] = rand() % 16;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < 11; ++i)
+      OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst[i]);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_array3);
+
diff --git a/utests/compiler_async_copy.cpp b/utests/compiler_async_copy.cpp
new file mode 100644
index 0000000..ad661c0
--- /dev/null
+++ b/utests/compiler_async_copy.cpp
@@ -0,0 +1,55 @@
+#include "utest_helper.hpp"
+#include <stdint.h>
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+
+#define DEF(TYPE, KER_TYPE, VEC_SIZE) \
+static void compiler_async_copy_##KER_TYPE##VEC_SIZE(void) \
+{ \
+  const size_t n = 1024; \
+  const size_t local_size = 32; \
+  const int copiesPerWorkItem = 5; \
+\
+  /* Setup kernel and buffers */\
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_async_copy", "compiler_async_copy_" # KER_TYPE # VEC_SIZE); \
+  OCL_CREATE_BUFFER(buf[0], 0, n * copiesPerWorkItem * sizeof(TYPE) * VEC_SIZE, NULL); \
+  OCL_CREATE_BUFFER(buf[1], 0, n * copiesPerWorkItem * sizeof(TYPE) * VEC_SIZE, NULL); \
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); \
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); \
+  OCL_SET_ARG(2, local_size*copiesPerWorkItem*sizeof(TYPE)*VEC_SIZE, NULL); \
+  OCL_SET_ARG(3, sizeof(int), &copiesPerWorkItem); \
+\
+  OCL_MAP_BUFFER(1); \
+  for (uint32_t i = 0; i < n * copiesPerWorkItem * VEC_SIZE; ++i) \
+      ((TYPE*)buf_data[1])[i] = rand(); \
+  OCL_UNMAP_BUFFER(1); \
+\
+  /* Run the kernel */\
+  globals[0] = n; \
+  locals[0] = local_size; \
+  OCL_NDRANGE(1); \
+  OCL_MAP_BUFFER(0); \
+  OCL_MAP_BUFFER(1); \
+\
+  /* Check results */\
+  TYPE *dst = (TYPE*)buf_data[0]; \
+  TYPE *src = (TYPE*)buf_data[1]; \
+  for (uint32_t i = 0; i < n * copiesPerWorkItem * VEC_SIZE; i++) \
+    OCL_ASSERT(dst[i] == src[i]); \
+  OCL_UNMAP_BUFFER(0); \
+  OCL_UNMAP_BUFFER(1); \
+} \
+\
+MAKE_UTEST_FROM_FUNCTION(compiler_async_copy_##KER_TYPE##VEC_SIZE);
+
+DEF(char, char, 2);
+DEF(uchar, uchar, 2);
+DEF(short, short, 2);
+DEF(ushort, ushort, 2);
+DEF(int, int, 2);
+DEF(uint, uint, 2);
+DEF(int64_t, long, 2);
+DEF(uint64_t, ulong, 2);
+DEF(float, float, 2);
+//DEF(double, double, 2);
diff --git a/utests/compiler_async_copy_and_prefetch.cpp b/utests/compiler_async_copy_and_prefetch.cpp
new file mode 100644
index 0000000..323faf9
--- /dev/null
+++ b/utests/compiler_async_copy_and_prefetch.cpp
@@ -0,0 +1,10 @@
+#include "utest_helper.hpp"
+
+void compiler_async_copy_and_prefetch(void)
+{
+  OCL_CREATE_KERNEL("compiler_async_copy_and_prefetch");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_async_copy_and_prefetch);
+
+
diff --git a/utests/compiler_async_stride_copy.cpp b/utests/compiler_async_stride_copy.cpp
new file mode 100644
index 0000000..2e9eaeb
--- /dev/null
+++ b/utests/compiler_async_stride_copy.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+static void compiler_async_stride_copy(void)
+{
+  const size_t n = 1024;
+  const size_t local_size = 128;
+  const int copiesPerWorkItem = 5;
+  const int stride = 3;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_async_stride_copy");
+  OCL_CREATE_BUFFER(buf[0], 0, n * copiesPerWorkItem * sizeof(char) * 4 * stride, NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * copiesPerWorkItem * sizeof(char) * 4 * stride, NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, local_size*copiesPerWorkItem*sizeof(char)*4, NULL);
+  OCL_SET_ARG(3, sizeof(int), &copiesPerWorkItem);
+  OCL_SET_ARG(4, sizeof(int), &stride);
+
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < n * copiesPerWorkItem * 4 * stride; ++i)
+      ((char*)buf_data[1])[i] = rand() & 0xff;
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = local_size;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+
+  // Check results
+  char *dst = (char*)buf_data[0];
+  char *src = (char*)buf_data[1];
+  for (uint32_t i = 0; i < n * copiesPerWorkItem; i += stride * 4) {
+    OCL_ASSERT(dst[i + 0] == (char)(src[i + 0] + 3));
+    OCL_ASSERT(dst[i + 1] == (char)(src[i + 1] + 3));
+    OCL_ASSERT(dst[i + 2] == (char)(src[i + 2] + 3));
+    OCL_ASSERT(dst[i + 3] == (char)(src[i + 3] + 3));
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_async_stride_copy);
diff --git a/utests/compiler_atomic_functions.cpp b/utests/compiler_atomic_functions.cpp
new file mode 100644
index 0000000..65f1c5a
--- /dev/null
+++ b/utests/compiler_atomic_functions.cpp
@@ -0,0 +1,97 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+#include <string.h>
+
+#define GROUP_NUM 16
+#define LOCAL_SIZE 256
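+// Host reference for the atomic kernel: each simulated work item applies one
+// of twelve operations (inc, dec, add, sub, and, or, xor, min, max, umin,
+// umax, xchg), chosen by its index modulo 12, to a local and a global counter.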
+static void cpu_compiler_atomic(int *dst, int *src)
+{
+  dst[4] = 0xffffffff;
+  int tmp[16] = { 0 };
+  tmp[4] = -1;
+  for(int j=0; j<LOCAL_SIZE; j++) {
+    int i = j % 12;
+
+    switch(i) {
+      case 0: tmp[i] += 1; break;
+      case 1: tmp[i] -= 1; break;
+      case 2: tmp[i] += src[j]; break;
+      case 3: tmp[i] -= src[j]; break;
+      case 4: tmp[i] &= ~(src[j]<<(j>>4)); break;
+      case 5: tmp[i] |= src[j]<<(j>>4); break;
+      case 6: tmp[i] ^= src[j]; break;
+      case 7: tmp[i] = tmp[i] < -src[j] ? tmp[i] : -src[j]; break;
+      case 8: tmp[i] = tmp[i] > src[j] ? tmp[i] : src[j]; break;
+      case 9: tmp[i] = (unsigned int)tmp[i] < (unsigned int)(-src[j]) ? tmp[i] : -src[j]; break;
+      case 10: tmp[i] = (unsigned int)tmp[i] > (unsigned int)(src[j]) ? tmp[i] : src[j]; break;
+      case 11:  tmp[i] = src[10]; break;
+      default:  break;
+    }
+  }
+
+  for(int k=0; k<GROUP_NUM; k++) {
+    for(int j=0; j<LOCAL_SIZE; j++) {
+      int i = j % 12;
+
+      switch(i) {
+        case 0: dst[i] += 1; break;
+        case 1: dst[i] -= 1; break;
+        case 2: dst[i] += src[j]; break;
+        case 3: dst[i] -= src[j]; break;
+        case 4: dst[i] &= ~(src[j]<<(j>>4)); break;
+        case 5: dst[i] |= src[j]<<(j>>4); break;
+        case 6: dst[i] ^= src[j]; break;
+        case 7: dst[i] = dst[i] < -src[j] ? dst[i] : -src[j]; break;
+        case 8: dst[i] = dst[i] > src[j] ? dst[i] : src[j]; break;
+        case 9: dst[i] = (unsigned int)dst[i] < (unsigned int)(-src[j]) ? dst[i] : -src[j]; break;
+        case 10: dst[i] = (unsigned int)dst[i] > (unsigned int)(src[j]) ? dst[i] : src[j]; break;
+        case 11:  dst[i] = src[10]; break;
+        default:  break;
+      }
+    }
+  }
+
+  for(int i=0; i<12; i++)
+    dst[i+12] = tmp[i];
+}
+
+static void compiler_atomic_functions(void)
+{
+  const size_t n = GROUP_NUM * LOCAL_SIZE;
+  int cpu_dst[24] = {0}, cpu_src[256];
+
+  globals[0] = n;
+  locals[0] = LOCAL_SIZE;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_atomic_functions");
+  OCL_CREATE_BUFFER(buf[0], 0, 24 * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, locals[0] * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, 16 * sizeof(int), NULL);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
+
+  OCL_MAP_BUFFER(0);
+  memset(buf_data[0], 0, 24 * sizeof(int));
+  ((int *)buf_data[0])[4] = -1;
+  OCL_UNMAP_BUFFER(0);
+
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < locals[0]; ++i)
+      cpu_src[i] = ((int*)buf_data[1])[i] = rand() & 0xff;
+  cpu_compiler_atomic(cpu_dst, cpu_src);
+  OCL_UNMAP_BUFFER(1);
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  for(int i=0; i<24; i++) {
+    //printf("The dst(%d) gpu(0x%x) cpu(0x%x)\n", i, ((uint32_t *)buf_data[0])[i], cpu_dst[i]);
+    OCL_ASSERT(((int *)buf_data[0])[i] == cpu_dst[i]);
+  }
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_atomic_functions)
diff --git a/utests/compiler_basic_arithmetic.cpp b/utests/compiler_basic_arithmetic.cpp
new file mode 100644
index 0000000..ba05de0
--- /dev/null
+++ b/utests/compiler_basic_arithmetic.cpp
@@ -0,0 +1,115 @@
+#include "utest_helper.hpp"
+
+enum eTestOP {
+  TEST_OP_ADD =0,
+  TEST_OP_SUB,
+  TEST_OP_MUL,
+  TEST_OP_DIV,
+  TEST_OP_REM
+};
+
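+// test_exec: fills two random operand buffers (divisors forced non-zero for
+// DIV/REM), runs the requested arithmetic kernel, and checks every element
+// against the corresponding host computation.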
+template <typename T, eTestOP op>
+static void test_exec(const char* kernel_name)
+{
+  const size_t n = 160;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_basic_arithmetic", kernel_name);
+  buf_data[0] = (T*) malloc(sizeof(T) * n);
+  buf_data[1] = (T*) malloc(sizeof(T) * n);
+  for (uint32_t i = 0; i < n; ++i) ((T*)buf_data[0])[i] = (T) rand();
+  for (uint32_t i = 0; i < n; ++i) ((T*)buf_data[1])[i] = (T) rand();
+  if(op == TEST_OP_DIV || op == TEST_OP_REM) {
+    for (uint32_t i = 0; i < n; ++i) {
+      if(((T*)buf_data[1])[i] == 0)
+       ((T*)buf_data[1])[i] = (T) 1;
+    }
+  }
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(T), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, n * sizeof(T), buf_data[1]);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(T), NULL);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(2);
+  if(op == TEST_OP_SUB) {
+    for (uint32_t i = 0; i < n; ++i)
+      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] - ((T*)buf_data[1])[i]));
+  } else if(op == TEST_OP_ADD) {
+    for (uint32_t i = 0; i < n; ++i)
+      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] + ((T*)buf_data[1])[i]));
+  } else if(op == TEST_OP_MUL) {
+    for (uint32_t i = 0; i < n; ++i)
+      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] * ((T*)buf_data[1])[i]));
+  } else if(op == TEST_OP_DIV) {
+    for (uint32_t i = 0; i < n; ++i)
+      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] / ((T*)buf_data[1])[i]));
+  } else {
+    for (uint32_t i = 0; i < n; ++i)
+      OCL_ASSERT(((T*)buf_data[2])[i] == (T)(((T*)buf_data[0])[i] % ((T*)buf_data[1])[i]));
+  }
+  free(buf_data[0]);
+  free(buf_data[1]);
+  buf_data[0] = buf_data[1] = NULL;
+}
+
+#define DECL_TEST_SUB(type, alias, keep_program) \
+static void compiler_sub_ ##alias(void)\
+{\
+  test_exec<type, TEST_OP_SUB>("compiler_sub_" # alias);\
+}\
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_sub_ ## alias, keep_program)
+
+#define DECL_TEST_ADD(type, alias, keep_program) \
+static void compiler_add_ ##alias(void)\
+{\
+  test_exec<type, TEST_OP_ADD>("compiler_add_" # alias);\
+}\
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_add_ ## alias, keep_program)
+
+#define DECL_TEST_MUL(type, alias, keep_program) \
+static void compiler_mul_ ##alias(void)\
+{\
+  test_exec<type, TEST_OP_MUL>("compiler_mul_" # alias);\
+}\
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_mul_ ## alias, keep_program)
+
+#define DECL_TEST_DIV(type, alias, keep_program) \
+static void compiler_div_ ##alias(void)\
+{\
+  test_exec<type, TEST_OP_DIV>("compiler_div_" # alias);\
+}\
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_div_ ## alias, keep_program)
+
+#define DECL_TEST_REM(type, alias, keep_program) \
+static void compiler_rem_ ##alias(void)\
+{\
+  test_exec<type, TEST_OP_REM>("compiler_rem_" # alias);\
+}\
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_rem_ ## alias, keep_program)
+
+#define _DECL_TEST_FOR_ALL_TYPE(op, keep_program) \
+DECL_TEST_##op(int8_t, char, true) \
+DECL_TEST_##op(uint8_t, uchar, true) \
+DECL_TEST_##op(int16_t, short, true) \
+DECL_TEST_##op(uint16_t, ushort, true) \
+DECL_TEST_##op(int32_t, int, true) \
+DECL_TEST_##op(uint32_t, uint, keep_program)
+
+#define DECL_TEST_FOR_ALL_TYPE(op) _DECL_TEST_FOR_ALL_TYPE(op, true)
+
+#define DECL_TEST_FOR_ALL_TYPE_END(op) _DECL_TEST_FOR_ALL_TYPE(op, false)
+
+DECL_TEST_FOR_ALL_TYPE(SUB)
+DECL_TEST_FOR_ALL_TYPE(ADD)
+DECL_TEST_FOR_ALL_TYPE(MUL)
+DECL_TEST_FOR_ALL_TYPE(DIV)
+DECL_TEST_FOR_ALL_TYPE_END(REM)
+#undef DECL_TEST_FOR_ALL_TYPE
diff --git a/utests/compiler_bool_cross_basic_block.cpp b/utests/compiler_bool_cross_basic_block.cpp
new file mode 100644
index 0000000..908edc0
--- /dev/null
+++ b/utests/compiler_bool_cross_basic_block.cpp
@@ -0,0 +1,55 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst, int scale) {
+  bool isRedRow = false;
+  bool isRed;
+  int val = src[global_id];
+  for (int i=0; i<scale; i++, isRedRow = !isRedRow) {
+    if (isRedRow) {
+      isRed= false;
+      for (int j=0; j < scale; j++, isRed=!isRed) {
+        if (isRed) {
+          val++;
+        }
+      }
+    }
+  }
+  dst[global_id] = val;
+}
+
+void compiler_bool_cross_basic_block(void){
+  const size_t n = 16;
+  int cpu_dst[16], cpu_src[16];
+  int scale = 4;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_bool_cross_basic_block");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(int), &scale);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    cpu_src[i] = ((int*)buf_data[0])[i] = i;
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Run on CPU
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    cpu(i, cpu_src, cpu_dst, scale);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    OCL_ASSERT(((int *)buf_data[1])[i] == cpu_dst[i]);
+  OCL_UNMAP_BUFFER(1);
+
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_bool_cross_basic_block)
diff --git a/utests/compiler_box_blur.cpp b/utests/compiler_box_blur.cpp
new file mode 100644
index 0000000..e4e053e
--- /dev/null
+++ b/utests/compiler_box_blur.cpp
@@ -0,0 +1,43 @@
+#include "utest_helper.hpp"
+#include <cmath>
+
+static int w = 0;
+static int h = 0;
+static int sz = 0;
+static const size_t chunk = 64;
+static int *src = NULL, *dst = NULL;
+
+static void compiler_box_blur()
+{
+  OCL_CREATE_KERNEL("compiler_box_blur");
+
+  /* Load the picture */
+  src = cl_read_bmp("lenna128x128.bmp", &w, &h);
+  sz = w * h * sizeof(int);
+
+  /* Run the kernel */
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, sz, src);
+  OCL_CREATE_BUFFER(buf[1], 0, sz, NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(int), &w);
+  OCL_SET_ARG(3, sizeof(int), &h);
+  OCL_SET_ARG(4, sizeof(int), &chunk);
+  globals[0] = size_t(w/4);
+  globals[1] = h/chunk + ((h%chunk)?1:0);
+  locals[0] = 16;
+  locals[1] = 1;
+  free(src);
+  OCL_NDRANGE(2);
+  OCL_MAP_BUFFER(1);
+  dst = (int*) buf_data[1];
+
+  /* Save the image (for debug purpose) */
+  cl_write_bmp(dst, w, h, "compiler_box_blur.bmp");
+
+  /* Compare with the golden image */
+  OCL_CHECK_IMAGE(dst, w, h, "compiler_box_blur_ref.bmp");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_box_blur);
+
diff --git a/utests/compiler_box_blur_float.cpp b/utests/compiler_box_blur_float.cpp
new file mode 100644
index 0000000..a3c97bc
--- /dev/null
+++ b/utests/compiler_box_blur_float.cpp
@@ -0,0 +1,65 @@
+#include "utest_helper.hpp"
+#include <cmath>
+
+static int *tmp = NULL;
+static struct float4 {float x,y,z,w;} *src = NULL, *dst = NULL;
+static int w = 0;
+static int h = 0;
+static int sz = 0;
+static const size_t chunk = 64;
+
+static void compiler_box_blur_float()
+{
+  OCL_CREATE_KERNEL("compiler_box_blur_float");
+
+  /* Load the picture */
+  tmp = cl_read_bmp("lenna128x128.bmp", &w, &h);
+  sz = w * h * sizeof(float[4]);
+  src = (float4*)malloc(sz);
+
+  /* RGBA -> float4 conversion */
+  const int n = w*h;
+  for (int i = 0; i < n; ++i) {
+    src[i].x = (float) (tmp[i] & 0xff);
+    src[i].y = (float) ((tmp[i] >> 8) & 0xff);
+    src[i].z = (float) ((tmp[i] >> 16) & 0xff);
+    src[i].w = 0.f;
+  }
+  free(tmp);
+
+  /* Run the kernel */
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, sz, src);
+  OCL_CREATE_BUFFER(buf[1], 0, sz, NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(int), &w);
+  OCL_SET_ARG(3, sizeof(int), &h);
+  OCL_SET_ARG(4, sizeof(int), &chunk);
+  globals[0] = size_t(w);
+  globals[1] = h/chunk + ((h%chunk)?1:0);
+  locals[0] = 16;
+  locals[1] = 1;
+  free(src);
+  OCL_NDRANGE(2);
+  OCL_MAP_BUFFER(1);
+  dst = (float4*) buf_data[1];
+
+  /* Convert back to RGBA and save */
+  int *tmp = (int*) malloc(n*sizeof(int));
+  for (int i = 0; i < n; ++i) {
+    int to = int(std::min(dst[i].x, 255.f));
+    to |= int(std::min(dst[i].y, 255.f)) << 8;
+    to |= int(std::min(dst[i].z, 255.f)) << 16;
+    tmp[i] = to;
+  }
+
+  /* Save the image (for debugging purposes) */
+  cl_write_bmp(tmp, w, h, "compiler_box_blur_float.bmp");
+
+  /* Compare with the golden image */
+  OCL_CHECK_IMAGE(tmp, w, h, "compiler_box_blur_float_ref.bmp");
+  free(tmp);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_box_blur_float);
+
diff --git a/utests/compiler_box_blur_image.cpp b/utests/compiler_box_blur_image.cpp
new file mode 100644
index 0000000..d94a97c
--- /dev/null
+++ b/utests/compiler_box_blur_image.cpp
@@ -0,0 +1,52 @@
+#include "utest_helper.hpp"
+
+static void compiler_box_blur_image()
+{
+  int w, h;
+  cl_image_format format = { };
+  cl_image_desc desc = { };
+  size_t origin[3] = { };
+  size_t region[3];
+  int *src, *dst;
+
+  OCL_CREATE_KERNEL("compiler_box_blur_image");
+
+  /* Load the picture */
+  src = cl_read_bmp("lenna128x128.bmp", &w, &h);
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNORM_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_depth = 1;
+  desc.image_row_pitch = w*sizeof(uint32_t);
+
+  /* Run the kernel */
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, src);
+  free(src);
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+  dst = (int*)malloc(w*h*sizeof(uint32_t));
+  region[0] = w;
+  region[1] = h;
+  region[2] = 1;
+  OCL_READ_IMAGE(buf[1], origin, region, dst);
+
+  /* Save the image (for debugging purposes) */
+  cl_write_bmp(dst, w, h, "compiler_box_blur_image.bmp");
+
+  /* Compare with the golden image */
+  OCL_CHECK_IMAGE(dst, w, h, "compiler_box_blur_ref.bmp");
+
+  free(dst);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_box_blur_image);
diff --git a/utests/compiler_byte_scatter.cpp b/utests/compiler_byte_scatter.cpp
new file mode 100644
index 0000000..11300da
--- /dev/null
+++ b/utests/compiler_byte_scatter.cpp
@@ -0,0 +1,24 @@
+#include "utest_helper.hpp"
+
+static void compiler_byte_scatter(void)
+{
+  const size_t n = 128;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_byte_scatter");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int8_t), NULL);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    OCL_ASSERT(((int8_t*)buf_data[0])[i] == (int8_t) i);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_byte_scatter);
+
diff --git a/utests/compiler_ceil.cpp b/utests/compiler_ceil.cpp
new file mode 100644
index 0000000..29c7551
--- /dev/null
+++ b/utests/compiler_ceil.cpp
@@ -0,0 +1,43 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, float *src, float *dst) {
+  dst[global_id] = ceilf(src[global_id]);
+}
+
+void compiler_ceil(void)
+{
+  const size_t n = 16;
+  float cpu_dst[16], cpu_src[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_ceil");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 8; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_ceil);
diff --git a/utests/compiler_cl_finish.cpp b/utests/compiler_cl_finish.cpp
new file mode 100644
index 0000000..7c7dee3
--- /dev/null
+++ b/utests/compiler_cl_finish.cpp
@@ -0,0 +1,50 @@
+#include "utest_helper.hpp"
+#include <sys/time.h>
+
+#define T_GET(t)        gettimeofday(&t, NULL);
+#define T_LAPSE(t1, t2) \
+  ((t2.tv_sec+t2.tv_usec*0.000001) - (t1.tv_sec+t1.tv_usec*0.000001))
+
+static void compiler_cl_finish(void)
+{
+  const size_t n = 16*1024*1024;
+  struct timeval t1, t2;
+  float t_fin, t_map_w_fin,t_map_wo_fin;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("test_cl_finish");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+
+  // Run the kernel
+  locals[0]  = 64;
+  globals[0] = 32 * locals[0];
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(int), &n);
+  OCL_SET_ARG(3, sizeof(int), &globals[0]);
+
+  // 1st time map after clFinish
+  OCL_NDRANGE(1);
+  T_GET(t1);
+  OCL_FINISH();
+  T_GET(t2);
+  t_fin = T_LAPSE(t1, t2);
+
+  T_GET(t1);
+  OCL_MAP_BUFFER(0);
+  T_GET(t2);
+  t_map_w_fin = T_LAPSE(t1, t2);
+
+  // 2nd time map without clFinish
+  OCL_NDRANGE(1);
+  T_GET(t1);
+  OCL_MAP_BUFFER(0);
+  T_GET(t2);
+  t_map_wo_fin = T_LAPSE(t1, t2);
+
+  OCL_ASSERT(t_fin > t_map_w_fin && t_map_wo_fin > t_map_w_fin);
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_cl_finish);
diff --git a/utests/compiler_clz_int.cpp b/utests/compiler_clz_int.cpp
new file mode 100644
index 0000000..c12cfc6
--- /dev/null
+++ b/utests/compiler_clz_int.cpp
@@ -0,0 +1,31 @@
+#include "utest_helper.hpp"
+
+void compiler_clz_int(void)
+{
+  const int n = 32;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_clz_int");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  ((int*)buf_data[0])[0] = 0;
+  for (int32_t i = 1; i < (int32_t) n; ++i)
+    ((int*)buf_data[0])[i] = 0xffffffffu >> i;
+  OCL_UNMAP_BUFFER(0);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(1);
+  OCL_ASSERT(((int*)buf_data[1])[0] == 32);
+  for (int i = 1; i < n; ++i)
+    OCL_ASSERT(((int*)buf_data[1])[i] == i);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_clz_int);
diff --git a/utests/compiler_clz_short.cpp b/utests/compiler_clz_short.cpp
new file mode 100644
index 0000000..eb3a370
--- /dev/null
+++ b/utests/compiler_clz_short.cpp
@@ -0,0 +1,31 @@
+#include "utest_helper.hpp"
+
+void compiler_clz_short(void)
+{
+  const size_t n = 16;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_clz_short");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(short), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  ((short*)buf_data[0])[0] = 0;
+  for (int32_t i = 1; i < (int32_t) n; ++i)
+    ((short*)buf_data[0])[i] = 0xffffu >> i;
+  OCL_UNMAP_BUFFER(0);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(1);
+  OCL_ASSERT(((short*)buf_data[1])[0] == 16);
+  for (unsigned i = 1; i < (unsigned) n; ++i)
+    OCL_ASSERT(((short*)buf_data[1])[i] == (short)i);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_clz_short);
diff --git a/utests/compiler_constant_expr.cpp b/utests/compiler_constant_expr.cpp
new file mode 100644
index 0000000..8bed03b
--- /dev/null
+++ b/utests/compiler_constant_expr.cpp
@@ -0,0 +1,35 @@
+#include "utest_helper.hpp"
+#include <math.h>
+
+static void compiler_constant_expr(void)
+{
+  const size_t n = 48;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_constant_expr");
+  buf_data[0] = (uint32_t*) malloc(sizeof(float) * n);
+  for (uint32_t i = 0; i < n; ++i) ((float*)buf_data[0])[i] = i;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(float), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < n; ++i) {
+    float expect = pow(((float*)buf_data[0])[i], (i % 3) + 1);
+    float err = fabs(((float*)buf_data[1])[i] - expect);
+    OCL_ASSERT(err <= 100 * cl_FLT_ULP(expect));
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_constant_expr);
+
diff --git a/utests/compiler_convert_uchar_sat.cpp b/utests/compiler_convert_uchar_sat.cpp
new file mode 100644
index 0000000..da00041
--- /dev/null
+++ b/utests/compiler_convert_uchar_sat.cpp
@@ -0,0 +1,44 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, float *src, int *dst) {
+  float f = src[global_id];
+  dst[global_id] = f > 255 ? 255 : f < 0 ? 0 : f;
+}
+
+void compiler_convert_uchar_sat(void)
+{
+  const size_t n = 16;
+  float cpu_src[16];
+  int cpu_dst[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_convert_uchar_sat");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 8; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((float*)buf_data[0])[i] = (rand() & 1023) / 2;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      OCL_ASSERT(((int *)buf_data[1])[i] == cpu_dst[i]);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_convert_uchar_sat);
diff --git a/utests/compiler_copy_buffer.cpp b/utests/compiler_copy_buffer.cpp
new file mode 100644
index 0000000..8066efe
--- /dev/null
+++ b/utests/compiler_copy_buffer.cpp
@@ -0,0 +1,32 @@
+#include "utest_helper.hpp"
+
+static void compiler_copy_buffer(void)
+{
+  const size_t n = 8192 * 4;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("test_copy_buffer");
+  //OCL_CREATE_KERNEL("compiler_array");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+  for (uint32_t i = 0; i < n; ++i) ((uint32_t*)buf_data[0])[i] = i;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == ((uint32_t*)buf_data[1])[i]);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_buffer);
+
diff --git a/utests/compiler_copy_buffer_row.cpp b/utests/compiler_copy_buffer_row.cpp
new file mode 100644
index 0000000..12c0592
--- /dev/null
+++ b/utests/compiler_copy_buffer_row.cpp
@@ -0,0 +1,40 @@
+#include "utest_helper.hpp"
+
+static void compiler_copy_buffer_row(void)
+{
+  uint32_t *src_buffer = NULL;
+  int *data_buffer = NULL;
+  const int row = 8192;
+  const int row_n = 2;
+  const int n =  row * row_n;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("test_copy_buffer_row");
+  src_buffer = (uint32_t *) malloc(sizeof(uint32_t) * n);
+  for (int32_t i = 0; i < n; ++i) src_buffer[i] = i;
+  data_buffer = (int *) malloc(sizeof(int) * 2);
+  data_buffer[0] = row;
+  data_buffer[1] = n;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), src_buffer);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], CL_MEM_COPY_HOST_PTR, 2 * sizeof(uint32_t), data_buffer);
+  free(src_buffer);
+  free(data_buffer);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check results
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == ((uint32_t*)buf_data[1])[i]);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_buffer_row);
+
diff --git a/utests/compiler_copy_image.cpp b/utests/compiler_copy_image.cpp
new file mode 100644
index 0000000..150fd8a
--- /dev/null
+++ b/utests/compiler_copy_image.cpp
@@ -0,0 +1,58 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_copy_image(void)
+{
+  const size_t w = 512;
+  const size_t h = 512;
+  cl_image_format format;
+  cl_image_desc desc;
+  cl_sampler sampler;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_copy_image");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h);
+  for (uint32_t j = 0; j < h; ++j)
+    for (uint32_t i = 0; i < w; i++)
+      ((uint32_t*)buf_data[0])[j * w + i] = j * w + i;
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = w * sizeof(uint32_t);
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
+
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(sampler), &sampler);
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t j = 0; j < h; ++j)
+    for (uint32_t i = 0; i < w; i++)
+      OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == ((uint32_t*)buf_data[1])[j * w + i]);
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_CALL(clReleaseSampler, sampler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_image);
diff --git a/utests/compiler_copy_image1.cpp b/utests/compiler_copy_image1.cpp
new file mode 100644
index 0000000..659dddc
--- /dev/null
+++ b/utests/compiler_copy_image1.cpp
@@ -0,0 +1,83 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_copy_image1(void)
+{
+  const size_t w = 512;
+  const size_t h = 512;
+  cl_image_format format;
+  cl_image_desc desc;
+  cl_sampler sampler;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_copy_image1");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h);
+  for (uint32_t j = 0; j < h; ++j)
+    for (uint32_t i = 0; i < w; i++)
+      ((uint32_t*)buf_data[0])[j * w + i] = j * w + i;
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = w * sizeof(uint32_t);
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
+  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+  OCL_CREATE_IMAGE(buf[2], 0, &format, &desc, NULL);
+  OCL_CREATE_IMAGE(buf[3], 0, &format, &desc, NULL);
+  OCL_CREATE_IMAGE(buf[4], 0, &format, &desc, NULL);
+  OCL_CREATE_IMAGE(buf[5], 0, &format, &desc, NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(sampler), &sampler);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(4, sizeof(cl_mem), &buf[3]);
+  OCL_SET_ARG(5, sizeof(cl_mem), &buf[4]);
+  OCL_SET_ARG(6, sizeof(cl_mem), &buf[5]);
+  float w_inv = 1.0/w;
+  float h_inv = 1.0/h;
+  OCL_SET_ARG(7, sizeof(float), &w_inv);
+  OCL_SET_ARG(8, sizeof(float), &h_inv);
+
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  OCL_MAP_BUFFER(3);
+  OCL_MAP_BUFFER(4);
+  OCL_MAP_BUFFER(5);
+
+  for(uint32_t k = 0; k < 5; k++)
+  {
+    for (uint32_t j = 0; j < h; ++j)
+      for (uint32_t i = 0; i < w; i++)
+        OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == ((uint32_t*)buf_data[1 + k])[j * w + i]);
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+  OCL_UNMAP_BUFFER(3);
+  OCL_UNMAP_BUFFER(4);
+  OCL_UNMAP_BUFFER(5);
+
+  OCL_CALL(clReleaseSampler, sampler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_image1);
diff --git a/utests/compiler_copy_image_1d.cpp b/utests/compiler_copy_image_1d.cpp
new file mode 100644
index 0000000..5af6a77
--- /dev/null
+++ b/utests/compiler_copy_image_1d.cpp
@@ -0,0 +1,52 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_copy_image_1d(void)
+{
+  const size_t w = 512;
+  cl_image_format format;
+  cl_image_desc desc;
+  cl_sampler sampler;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_copy_image_1d");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w);
+  for (uint32_t i = 0; i < w; i++)
+      ((uint32_t*)buf_data[0])[i] = i;
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE1D;
+  desc.image_width = w;
+  desc.image_row_pitch = w * sizeof(uint32_t);
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
+
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(sampler), &sampler);
+  globals[0] = w;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < w; i++) {
+      //printf (" %x", ((uint32_t*)buf_data[1])[i]);
+      OCL_ASSERT(((uint32_t*)buf_data[0])[i] == ((uint32_t*)buf_data[1])[i]);
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_image_1d);
diff --git a/utests/compiler_copy_image_3d.cpp b/utests/compiler_copy_image_3d.cpp
new file mode 100644
index 0000000..de7cd45
--- /dev/null
+++ b/utests/compiler_copy_image_3d.cpp
@@ -0,0 +1,77 @@
+#include "utest_helper.hpp"
+#include "string.h"
+
+static void compiler_copy_image_3d(void)
+{
+  const size_t w = 512;
+  const size_t h = 512;
+  const size_t depth = 4;
+  cl_image_format format;
+  cl_image_desc desc;
+  cl_sampler sampler;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_copy_image_3d");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h * depth);
+  for (uint32_t k = 0; k < depth; k++)
+    for (uint32_t j = 0; j < h; j++)
+      for (uint32_t i = 0; i < w; i++)
+        ((float*)buf_data[0])[k*w*h + j*w + i] = (k << 10) + (j << 10) + i;
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNORM_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_depth = depth;
+  desc.image_row_pitch = 0;
+  desc.image_slice_pitch = 0;
+
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
+  OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+  memset(&desc, 0, sizeof(desc));
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_depth = 1;
+  for(uint32_t i = 0; i < depth; i++)
+   OCL_CREATE_IMAGE(buf[2 + i], 0, &format, &desc, NULL);
+
+  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(sampler), &sampler);
+  for(uint32_t i = 0; i < depth; i++)
+    OCL_SET_ARG(3 + i, sizeof(cl_mem), &buf[2 + i]);
+  globals[0] = w;
+  globals[1] = h;
+  globals[2] = depth;
+  locals[0] = 64;
+  locals[1] = 1;
+  locals[2] = 1;
+  OCL_NDRANGE(3);
+
+  // Check result
+  for(uint32_t i = 0; i < depth + 2; i++)
+    OCL_MAP_BUFFER_GTT(i);
+  for (uint32_t k = 0; k < depth; k++)
+    for (uint32_t j = 0; j < h; ++j)
+      for (uint32_t i = 0; i < w; i++) {
+        OCL_ASSERT(((float*)buf_data[0])[k*w*((h+1)&-2LL) + j*w + i] == ((float*)buf_data[1])[k*w*((h+1)&-2LL) + j*w + i]);
+        OCL_ASSERT(((float*)buf_data[0])[k*w*((h+1)&-2LL) + j*w + i] == ((float*)buf_data[k + 2])[j * w + i]);
+      }
+
+  for(uint32_t i = 0; i < depth + 2; i++)
+    OCL_UNMAP_BUFFER_GTT(i);
+
+  OCL_CALL(clReleaseSampler, sampler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_image_3d);
diff --git a/utests/compiler_data_types.cpp b/utests/compiler_data_types.cpp
new file mode 100644
index 0000000..c686cc7
--- /dev/null
+++ b/utests/compiler_data_types.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_data_types(void)
+{
+  OCL_CREATE_KERNEL("compiler_data_types");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_data_types);
+
diff --git a/utests/compiler_degrees.cpp b/utests/compiler_degrees.cpp
new file mode 100644
index 0000000..7a17ca7
--- /dev/null
+++ b/utests/compiler_degrees.cpp
@@ -0,0 +1,32 @@
+#include "utest_helper.hpp"
+
+void compiler_degrees(void)
+{
+  const int n = 32;
+  float src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_degrees");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  for (int i = 0; i < n; ++i) {
+    src[i] = ((float *)buf_data[0])[i] = rand() * 0.01f;
+  }
+  OCL_UNMAP_BUFFER(0);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; ++i) {
+    OCL_ASSERT(((float *)buf_data[1])[i] == src[i] * (180 / 3.141592653589793F));
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_degrees);
diff --git a/utests/compiler_displacement_map_element.cpp b/utests/compiler_displacement_map_element.cpp
new file mode 100644
index 0000000..98041ec
--- /dev/null
+++ b/utests/compiler_displacement_map_element.cpp
@@ -0,0 +1,64 @@
+#include "utest_helper.hpp"
+
+typedef unsigned int uint;
+constexpr int W = 16, H = 16;
+constexpr int SIZE = W * H;
+uint in_1[SIZE];
+uint disp_map[SIZE];
+uint out_1[SIZE];
+
+uint cpu(const int cx, const int cy, const uint *in, const uint *disp_map, int w, int h) {
+  uint c = disp_map[cy * w + cx];
+  int x_pos = cx + c;
+  int y_pos = cy + c;
+  if(0 <= x_pos && x_pos < w && 0 <= y_pos && y_pos < h)
+    return in[y_pos * w + x_pos];
+  else
+    return 0;
+}
+
+void test() {
+  OCL_MAP_BUFFER(2);
+  for(int y=0; y<H; y++)
+    for(int x=0; x<W; x++) {
+      uint out = ((uint*)buf_data[2])[y * W + x];
+      uint wish = cpu(x, y, in_1, disp_map, W, H);
+      if(out != wish)
+        printf("XXX %d %d %x %x\n", x, y, out, wish);
+      OCL_ASSERT(out == wish);
+    }
+  OCL_UNMAP_BUFFER(2);
+}
+
+void displacement_map_element(void) {
+  int i, pass;
+
+  OCL_CREATE_KERNEL("compiler_displacement_map_element");
+  OCL_CREATE_BUFFER(buf[0], 0, SIZE * sizeof(uint), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, SIZE * sizeof(uint), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, SIZE * sizeof(uint), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(W), &W);
+  OCL_SET_ARG(3, sizeof(H), &H);
+  OCL_SET_ARG(4, sizeof(cl_mem), &buf[2]);
+  globals[0] = W;
+  globals[1] = H;
+  locals[0] = 16;
+  locals[1] = 16;
+
+  for (pass = 0; pass < 8; pass ++) {
+    OCL_MAP_BUFFER(0);
+    OCL_MAP_BUFFER(1);
+    for (i = 0; i < SIZE; i ++) {
+      in_1[i] = ((uint*)buf_data[0])[i] = ((rand() & 0xFFFF) << 16) | (rand() & 0xFFFF);
+      disp_map[i] = ((uint*)buf_data[1])[i] = rand() & 3;
+    }
+    OCL_UNMAP_BUFFER(0);
+    OCL_UNMAP_BUFFER(1);
+    OCL_NDRANGE(2);
+    test();
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(displacement_map_element);
diff --git a/utests/compiler_double.cpp b/utests/compiler_double.cpp
new file mode 100644
index 0000000..7c54ddf
--- /dev/null
+++ b/utests/compiler_double.cpp
@@ -0,0 +1,46 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, double *src, double *dst) {
+  double f = src[global_id];
+  double d = 1.234567890123456789;
+  dst[global_id] = global_id < 14 ? (d * (f + d)) : 14;
+}
+
+void compiler_double(void)
+{
+  const size_t n = 16;
+  double cpu_dst[n], cpu_src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_double");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(double), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(double), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 1; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((double*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      OCL_ASSERT(fabs(((double*)buf_data[1])[i] - cpu_dst[i]) < 1e-4);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double);
diff --git a/utests/compiler_double_2.cpp b/utests/compiler_double_2.cpp
new file mode 100644
index 0000000..7e3ae4b
--- /dev/null
+++ b/utests/compiler_double_2.cpp
@@ -0,0 +1,47 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, float *src, double *dst) {
+  float f = src[global_id];
+  float d = 1.234567890123456789;
+  dst[global_id] = global_id < 14 ? d * (d + f) : 14;
+}
+
+void compiler_double_2(void)
+{
+  const size_t n = 16;
+  float cpu_src[n];
+  double cpu_dst[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_double_2");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(double), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 1; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      OCL_ASSERT(fabs(((double*)buf_data[1])[i] - cpu_dst[i]) < 1e-4);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double_2);
diff --git a/utests/compiler_double_3.cpp b/utests/compiler_double_3.cpp
new file mode 100644
index 0000000..294950d
--- /dev/null
+++ b/utests/compiler_double_3.cpp
@@ -0,0 +1,46 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, float *src, double *dst) {
+  float d = 1.234567890123456789;
+  dst[global_id] = global_id < 14 ? d : 14;
+}
+
+void compiler_double_3(void)
+{
+  const size_t n = 16;
+  float cpu_src[n];
+  double cpu_dst[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_double_3");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(double), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 1; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      OCL_ASSERT(fabs(((double*)buf_data[1])[i] - cpu_dst[i]) < 1e-4);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double_3);
diff --git a/utests/compiler_double_4.cpp b/utests/compiler_double_4.cpp
new file mode 100644
index 0000000..cb25bd4
--- /dev/null
+++ b/utests/compiler_double_4.cpp
@@ -0,0 +1,40 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+void compiler_double_4(void)
+{
+  const size_t n = 16;
+  double cpu_src1[n], cpu_src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_double_4");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(double), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(double), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(double), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    cpu_src1[i] = ((double*)buf_data[0])[i] = rand() * 1e-2;
+    cpu_src2[i] = ((double*)buf_data[1])[i] = rand() * 1e-2;
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    OCL_ASSERT(fabs(((double*)buf_data[2])[i] - cpu_src1[i] - cpu_src2[i]) < 1e-4);
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_double_4);
diff --git a/utests/compiler_double_precision.cpp b/utests/compiler_double_precision.cpp
new file mode 100644
index 0000000..217fd18
--- /dev/null
+++ b/utests/compiler_double_precision.cpp
@@ -0,0 +1,43 @@
+#include "utest_helper.hpp"
+#include <math.h>
+
+static void double_precision_check(void)
+{
+  const size_t n = 16; //8192 * 4;
+
+  double d0 = 0.12345678912345678;
+  double d1 = 0.12355678922345678;
+  float cpu_result = d1 - d0;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("double_precision_check");
+  //OCL_CREATE_KERNEL("compiler_array");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+  for (uint32_t i = 0; i < n; ++i) ((float*)buf_data[0])[i] = 0;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  bool precisionOK = true;
+  for (uint32_t i = 0; i < n; ++i) {
+    float error = ((float*)buf_data[1])[i] - cpu_result;
+    if (error != 0)
+      precisionOK = false;
+    OCL_ASSERT((fabs(error) < 1e-4));
+  }
+  if (!precisionOK)
+    printf("\n  - WARN: GPU doesn't have correct double precision. Got %.7G, expected %.7G\n", ((float*)buf_data[1])[0], cpu_result);
+}
+
+MAKE_UTEST_FROM_FUNCTION(double_precision_check);
diff --git a/utests/compiler_fabs.cpp b/utests/compiler_fabs.cpp
new file mode 100644
index 0000000..b14f486
--- /dev/null
+++ b/utests/compiler_fabs.cpp
@@ -0,0 +1,44 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, float *src, float *dst) {
+  float f = src[global_id];
+  f = f < 0 ? -f : f;
+  dst[global_id] = f;
+}
+
+void compiler_fabs(void)
+{
+  const size_t n = 16;
+  float cpu_dst[16], cpu_src[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_fabs");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 8; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fabs);
diff --git a/utests/compiler_fill_gl_image.cpp b/utests/compiler_fill_gl_image.cpp
new file mode 100644
index 0000000..87d2fcd
--- /dev/null
+++ b/utests/compiler_fill_gl_image.cpp
@@ -0,0 +1,76 @@
+#include "utest_helper.hpp"
+
+static void read_back(int tex, int width, int height, uint32_t * resultColor)
+{
+  float vertices[8] = {-1, 1, 1, 1, 1, -1, -1, -1};
+  float tex_coords[8] = {0, 0, 1, 0, 1, 1, 0, 1};
+
+  glBindTexture(GL_TEXTURE_2D, tex);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+  glEnable(GL_TEXTURE_2D);
+  glDisable(GL_BLEND);
+  glVertexPointer(2, GL_FLOAT, sizeof(float) * 2, vertices);
+  glEnableClientState(GL_VERTEX_ARRAY);
+  glClientActiveTexture(GL_TEXTURE0);
+  glTexCoordPointer(2, GL_FLOAT, sizeof(float) * 2, tex_coords);
+  glEnableClientState(GL_TEXTURE_COORD_ARRAY);
+  glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
+  glFlush();
+  OCL_SWAP_EGL_BUFFERS();
+
+  glReadPixels(0, 0, width, height, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, resultColor);
+}
+
+
+static void compiler_fill_gl_image(void)
+{
+  const size_t w = EGL_WINDOW_WIDTH;
+  const size_t h = EGL_WINDOW_HEIGHT;
+  uint32_t color = 0x123456FF;
+  uint32_t *resultColor;
+  GLuint tex;
+
+  if (eglContext == EGL_NO_CONTEXT) {
+    fprintf(stderr, "There is no valid egl context. Ignore this case.\n");
+    return;
+  }
+  // Setup kernel and images
+  glGenTextures(1, &tex);
+  glBindTexture(GL_TEXTURE_2D, tex);
+  // Must set all the filters to GL_NEAREST!
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+  glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, w, h, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, NULL);
+
+  OCL_CREATE_KERNEL("test_fill_gl_image");
+  OCL_CREATE_GL_IMAGE(buf[0], 0, GL_TEXTURE_2D, 0, tex);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(color), &color);
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+  glFinish();
+  OCL_ENQUEUE_ACQUIRE_GL_OBJECTS(0);
+  OCL_NDRANGE(2);
+  OCL_FLUSH();
+
+  // Check result
+  resultColor = new uint32_t[w * h * 4];
+  if (resultColor == NULL)
+    assert(0);
+
+  read_back(tex, w, h, resultColor);
+  for (uint32_t j = 0; j < h; ++j)
+    for (uint32_t i = 0; i < w; i++)
+      OCL_ASSERT(resultColor[j * w + i] == color);
+  OCL_UNMAP_BUFFER(0);
+  delete[] resultColor;
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_gl_image);
diff --git a/utests/compiler_fill_image.cpp b/utests/compiler_fill_image.cpp
new file mode 100644
index 0000000..5a38b8c
--- /dev/null
+++ b/utests/compiler_fill_image.cpp
@@ -0,0 +1,44 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_fill_image(void)
+{
+  const size_t w = 512;
+  const size_t h = 512;
+  uint32_t color = 0x12345678;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_fill_image");
+
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(color), &color);
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  for (uint32_t j = 0; j < h; ++j)
+    for (uint32_t i = 0; i < w; i++)
+      OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == 0x78563412);
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_image);
diff --git a/utests/compiler_fill_image0.cpp b/utests/compiler_fill_image0.cpp
new file mode 100644
index 0000000..e6e0b1d
--- /dev/null
+++ b/utests/compiler_fill_image0.cpp
@@ -0,0 +1,42 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_fill_image0(void)
+{
+  const size_t w = 512;
+  const size_t h = 512;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_fill_image0");
+
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  // Check result
+  OCL_MAP_BUFFER_GTT(0);
+  for (uint32_t j = 0; j < h; ++j)
+    for (uint32_t i = 0; i < w; i++)
+      OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == (i << 16 | j));
+  OCL_UNMAP_BUFFER_GTT(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_image0);
diff --git a/utests/compiler_fill_image_1d.cpp b/utests/compiler_fill_image_1d.cpp
new file mode 100644
index 0000000..e644c5f
--- /dev/null
+++ b/utests/compiler_fill_image_1d.cpp
@@ -0,0 +1,50 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_fill_image_1d(void)
+{
+  const size_t w = 2048;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE1D;
+  desc.image_width = w;
+  desc.image_row_pitch = 0;
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_fill_image_1d");
+
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+
+  OCL_MAP_BUFFER_GTT(0);
+  for (uint32_t i = 0; i < w; i++) {
+      ((uint32_t*)buf_data[0])[i] = 0;
+  }
+  OCL_UNMAP_BUFFER_GTT(0);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  globals[0] = w/2;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER_GTT(0);
+  //printf("------ The image result is: -------\n");
+  for (uint32_t i = 0; i < w/2; i++) {
+      //printf(" %2x", ((uint32_t *)buf_data[0])[i]);
+      OCL_ASSERT(((uint32_t*)buf_data[0])[i] == 0x03020100);
+  }
+  for (uint32_t i = w/2; i < w; i++) {
+      //printf(" %2x", ((uint32_t *)buf_data[0])[i]);
+      OCL_ASSERT(((uint32_t*)buf_data[0])[i] == 0);
+  }
+  OCL_UNMAP_BUFFER_GTT(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_image_1d);
diff --git a/utests/compiler_fill_image_3d.cpp b/utests/compiler_fill_image_3d.cpp
new file mode 100644
index 0000000..ec96e80
--- /dev/null
+++ b/utests/compiler_fill_image_3d.cpp
@@ -0,0 +1,50 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_fill_image_3d(void)
+{
+  const size_t w = 512;
+  const size_t h = 512;
+  const size_t depth = 5;
+  uint32_t color = 0x12345678;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_depth = depth;
+  desc.image_row_pitch = 0;
+  desc.image_slice_pitch = 0;
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_fill_image_3d");
+
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(color), &color);
+  globals[0] = w;
+  globals[1] = h;
+  globals[2] = depth;
+  locals[0] = 16;
+  locals[1] = 16;
+  locals[2] = 1;
+  OCL_NDRANGE(3);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  for (uint32_t k = 0; k < depth; k++)
+    for (uint32_t j = 0; j < h; ++j)
+      for (uint32_t i = 0; i < w; i++)
+        OCL_ASSERT(((uint32_t*)buf_data[0])[k*w*h + j*w + i] == 0x78563412);
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_image_3d);
diff --git a/utests/compiler_fill_image_3d_2.cpp b/utests/compiler_fill_image_3d_2.cpp
new file mode 100644
index 0000000..410ace8
--- /dev/null
+++ b/utests/compiler_fill_image_3d_2.cpp
@@ -0,0 +1,48 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_fill_image_3d_2(void)
+{
+  const size_t w = 512;
+  const size_t h = 512;
+  const size_t depth = 5;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_depth = depth;
+  desc.image_row_pitch = 0;
+  desc.image_slice_pitch = 0;
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_fill_image_3d_2");
+
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  globals[0] = w;
+  globals[1] = h;
+  globals[2] = depth;
+  locals[0] = 16;
+  locals[1] = 16;
+  locals[2] = 1;
+  OCL_NDRANGE(3);
+
+  // Check result
+  OCL_MAP_BUFFER_GTT(0);
+  for (uint32_t k = 0; k < depth; k++)
+    for (uint32_t j = 0; j < h; ++j)
+      for (uint32_t i = 0; i < w; i++)
+        OCL_ASSERT(((uint32_t*)buf_data[0])[k*w*h + j*w + i] == 0x78563412);
+  OCL_UNMAP_BUFFER_GTT(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_image_3d_2);
diff --git a/utests/compiler_function_argument.cpp b/utests/compiler_function_argument.cpp
new file mode 100644
index 0000000..a39523b
--- /dev/null
+++ b/utests/compiler_function_argument.cpp
@@ -0,0 +1,27 @@
+#include "utest_helper.hpp"
+
+void compiler_function_argument(void)
+{
+  const size_t n = 2048;
+  const int value = 34;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_function_argument");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(int), &value);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((int*)buf_data[0])[i] == value);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_argument);
+
+
diff --git a/utests/compiler_function_argument0.cpp b/utests/compiler_function_argument0.cpp
new file mode 100644
index 0000000..2e4227e
--- /dev/null
+++ b/utests/compiler_function_argument0.cpp
@@ -0,0 +1,26 @@
+#include "utest_helper.hpp"
+
+void compiler_function_argument0(void)
+{
+  const size_t n = 2048;
+  const short value = 34;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_function_argument0");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(short), &value);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((int*)buf_data[0])[i] == value);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_argument0);
+
diff --git a/utests/compiler_function_argument1.cpp b/utests/compiler_function_argument1.cpp
new file mode 100644
index 0000000..48a7677
--- /dev/null
+++ b/utests/compiler_function_argument1.cpp
@@ -0,0 +1,31 @@
+#include "utest_helper.hpp"
+
+void compiler_function_argument1(void)
+{
+  const size_t n = 2048;
+  const char value = 34;
+  const short value0 = 31;
+  const int value1 = 3;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_function_argument1");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(char), &value);
+  OCL_SET_ARG(2, sizeof(short), &value0);
+  OCL_SET_ARG(3, sizeof(int), &value1);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((int*)buf_data[0])[i] == value + value0 + value1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_argument1);
+
+
diff --git a/utests/compiler_function_argument2.cpp b/utests/compiler_function_argument2.cpp
new file mode 100644
index 0000000..c352a9e
--- /dev/null
+++ b/utests/compiler_function_argument2.cpp
@@ -0,0 +1,57 @@
+#include "utest_helper.hpp"
+
+#define VECSIZE 8
+void compiler_function_argument2(void)
+{
+  char arg0[8] = { 0 };
+  unsigned char arg1[8] = { 0 };
+  short arg2[8] = { 0 };
+  unsigned short arg3[8] = { 0 };
+  int arg4[8] = { 0 };
+  unsigned int arg5[8] = { 0 };
+  float arg6[8] = { 0 };
+
+  for (uint32_t i = 0; i < 8; ++i) {
+      arg0[i] = rand();
+      arg1[i] = rand();
+      arg2[i] = rand();
+      arg3[i] = rand();
+      arg4[i] = rand();
+      arg5[i] = rand();
+      arg6[i] = rand();
+  }
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_function_argument2");
+  OCL_CREATE_BUFFER(buf[0], 0, sizeof(float) * 8 * 8, NULL);
+  OCL_SET_ARG(0, sizeof(arg0), arg0);
+  OCL_SET_ARG(1, sizeof(arg1), arg1);
+  OCL_SET_ARG(2, sizeof(arg2), arg2);
+  OCL_SET_ARG(3, sizeof(arg3), arg3);
+  OCL_SET_ARG(4, sizeof(arg4), arg4);
+  OCL_SET_ARG(5, sizeof(arg5), arg5);
+  OCL_SET_ARG(6, sizeof(arg6), arg6);
+  OCL_SET_ARG(7, sizeof(cl_mem), &buf[0]);
+
+  // Run the kernel
+  globals[0] = 1;
+  locals[0] = 1;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  /* Check results */
+  float *dst = (float*)buf_data[0];
+
+  for (uint32_t i = 0; i < 8; ++i) {
+      OCL_ASSERT((float)arg0[i] == dst[0*8 + i]);
+      OCL_ASSERT((float)arg1[i] == dst[1*8 + i]);
+      OCL_ASSERT((float)arg2[i] == dst[2*8 + i]);
+      OCL_ASSERT((float)arg3[i] == dst[3*8 + i]);
+      OCL_ASSERT((float)arg4[i] == dst[4*8 + i]);
+      OCL_ASSERT((float)arg5[i] == dst[5*8 + i]);
+      OCL_ASSERT((float)arg6[i] == dst[6*8 + i]);
+  }
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_argument2);
diff --git a/utests/compiler_function_argument3.cpp b/utests/compiler_function_argument3.cpp
new file mode 100644
index 0000000..e9f5e80
--- /dev/null
+++ b/utests/compiler_function_argument3.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+struct sfloat8 {
+    float a;
+    float b;
+    float c;
+    float d;
+    float e;
+    float f;
+    float g;
+    float h;
+};
+
+void compiler_function_argument3(void)
+{
+  sfloat8 arg6;
+
+  arg6.a = 3.0f;
+  arg6.h = 4.0f;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_function_argument3");
+  OCL_CREATE_BUFFER(buf[0], 0, sizeof(struct sfloat8) * 8, NULL);
+
+  OCL_SET_ARG(0, sizeof(arg6), &arg6);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[0]);
+
+  // Run the kernel
+  globals[0] = 1;
+  locals[0] = 1;
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(0);
+
+  /* Check results */
+  sfloat8 *dst = (sfloat8*)buf_data[0];
+
+  OCL_ASSERT(dst[0].a == 3.0f);
+  OCL_ASSERT(dst[0].b == 12.0f);
+  OCL_ASSERT(dst[0].h == 7.0f);
+
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_argument3);
diff --git a/utests/compiler_function_constant.cpp b/utests/compiler_function_constant.cpp
new file mode 100644
index 0000000..20f0ece
--- /dev/null
+++ b/utests/compiler_function_constant.cpp
@@ -0,0 +1,34 @@
+#include "utest_helper.hpp"
+
+void compiler_function_constant(void)
+{
+  const size_t n = 2048;
+  const uint32_t value = 34;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_function_constant");
+  OCL_CREATE_BUFFER(buf[0], 0, 75 * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(uint32_t), &value);
+
+  OCL_MAP_BUFFER(0);
+  for(uint32_t i = 0; i < 69; ++i)
+    ((short *)buf_data[0])[i] = i;
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(1);
+
+  // Check results
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t *)buf_data[1])[i] == (value + i%69));
+
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_constant);
diff --git a/utests/compiler_function_constant0.cpp b/utests/compiler_function_constant0.cpp
new file mode 100644
index 0000000..6fbbd30
--- /dev/null
+++ b/utests/compiler_function_constant0.cpp
@@ -0,0 +1,40 @@
+#include "utest_helper.hpp"
+
+void compiler_function_constant0(void)
+{
+  const size_t n = 2048;
+  const uint32_t value = 34;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_function_constant0");
+  OCL_CREATE_BUFFER(buf[0], 0, 75 * sizeof(int32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, 1 * sizeof(char), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(uint32_t), &value);
+
+  OCL_MAP_BUFFER(0);
+  for(uint32_t i = 0; i < 69; ++i)
+    ((int32_t *)buf_data[0])[i] = i;
+  OCL_UNMAP_BUFFER(0);
+
+  OCL_MAP_BUFFER(1);
+  ((char *)buf_data[1])[0] = 15;
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(2);
+
+  // Check results
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t *)buf_data[2])[i] == (value + 15 + i%69));
+
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_constant0);
diff --git a/utests/compiler_function_constant1.cpp b/utests/compiler_function_constant1.cpp
new file mode 100644
index 0000000..b92e6ca
--- /dev/null
+++ b/utests/compiler_function_constant1.cpp
@@ -0,0 +1,47 @@
+#include "utest_helper.hpp"
+
+void compiler_function_constant1(void)
+{
+  const size_t n = 2048;
+  const uint32_t value = 34;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_function_constant");
+  OCL_CREATE_BUFFER(buf[0], 0, 75 * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(uint32_t), &value);
+
+  OCL_MAP_BUFFER(0);
+  for(uint32_t i = 0; i < 69; ++i)
+    ((short *)buf_data[0])[i] = i;
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  OCL_CREATE_BUFFER(buf[2], 0, 101 * sizeof(short), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[2]);
+  OCL_MAP_BUFFER(2);
+  for(uint32_t i = 0; i < 69; ++i)
+    ((short *)buf_data[2])[i] = 2*i;
+  OCL_UNMAP_BUFFER(2);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(1);
+
+  // Check results
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t *)buf_data[1])[i] == (value + (i%69)*2));
+
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_constant1);
diff --git a/utests/compiler_function_qualifiers.cpp b/utests/compiler_function_qualifiers.cpp
new file mode 100644
index 0000000..622313c
--- /dev/null
+++ b/utests/compiler_function_qualifiers.cpp
@@ -0,0 +1,20 @@
+#include "utest_helper.hpp"
+
+void compiler_function_qualifiers(void)
+{
+  OCL_CREATE_KERNEL("compiler_function_qualifiers");
+
+  size_t param_value_size;
+  void* param_value;
+  cl_int err;
+
+  err = clGetKernelInfo(kernel, CL_KERNEL_ATTRIBUTES, 0, NULL, &param_value_size);
+  OCL_ASSERT(err == CL_SUCCESS);
+  param_value = malloc(param_value_size);
+  err = clGetKernelInfo(kernel, CL_KERNEL_ATTRIBUTES, param_value_size, param_value, NULL);
+  OCL_ASSERT(err == CL_SUCCESS);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_function_qualifiers);
+
+
diff --git a/utests/compiler_geometric_builtin.cpp b/utests/compiler_geometric_builtin.cpp
new file mode 100644
index 0000000..a9ccc2c
--- /dev/null
+++ b/utests/compiler_geometric_builtin.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_geometric_builtin(void)
+{
+  OCL_CREATE_KERNEL("compiler_geometric_builtin");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_geometric_builtin);
+
diff --git a/utests/compiler_get_image_info.cpp b/utests/compiler_get_image_info.cpp
new file mode 100644
index 0000000..3b9d132
--- /dev/null
+++ b/utests/compiler_get_image_info.cpp
@@ -0,0 +1,50 @@
+#include "utest_helper.hpp"
+
+static void compiler_get_image_info(void)
+{
+  const size_t w = 256;
+  const size_t h = 512;
+  const size_t depth = 3;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_depth = depth;
+  desc.image_row_pitch = 0;
+  desc.image_slice_pitch = 0;
+  desc.num_mip_levels = 0;
+  desc.num_samples = 0;
+  desc.buffer = NULL;
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_get_image_info");
+
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, 32 * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, 32 * sizeof(int), NULL);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = 32;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  for (uint32_t i = 0; i < 32; i++)
+  {
+    OCL_ASSERT(((uint32_t*)buf_data[1])[i] == ((w << 20) | (h << 8) | depth));
+    OCL_ASSERT(((uint32_t*)buf_data[2])[i] == ((CL_UNSIGNED_INT8 << 16) | CL_RGBA));
+  }
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_get_image_info);
diff --git a/utests/compiler_get_image_info_array.cpp b/utests/compiler_get_image_info_array.cpp
new file mode 100644
index 0000000..970877d
--- /dev/null
+++ b/utests/compiler_get_image_info_array.cpp
@@ -0,0 +1,64 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_get_image_info_array(void)
+{
+  const int w = 256;
+  const int h = 512;
+  const int array_size1 = 10;
+  const int array_size2 = 3;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  // Create the 1D array buffer.
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY;
+  desc.image_width = w;
+  desc.image_array_size = array_size1;
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+
+  // Create the 2D array buffer.
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_array_size = array_size2;
+  OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_get_image_info_array");
+
+  OCL_CREATE_BUFFER(buf[2], 0, 32 * sizeof(int), NULL);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = 32;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
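+  // dst[0..3]: width, array size, channel data type and order of the 1D array image;
+  // dst[4..8]: width, height, array size, channel data type and order of the 2D array image.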
+  OCL_MAP_BUFFER(2);
+  OCL_ASSERT(((int*)buf_data[2])[0] == w);
+  OCL_ASSERT(((int*)buf_data[2])[1] == array_size1);
+  OCL_ASSERT(((int*)buf_data[2])[2] == CL_UNSIGNED_INT8);
+  OCL_ASSERT(((int*)buf_data[2])[3] == CL_RGBA);
+
+  OCL_ASSERT(((int*)buf_data[2])[4] == w);
+  OCL_ASSERT(((int*)buf_data[2])[5] == h);
+  OCL_ASSERT(((int*)buf_data[2])[6] == array_size2);
+  OCL_ASSERT(((int*)buf_data[2])[7] == CL_UNSIGNED_INT8);
+  OCL_ASSERT(((int*)buf_data[2])[8] == CL_RGBA);
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_get_image_info_array);
diff --git a/utests/compiler_getelementptr_bitcast.cpp b/utests/compiler_getelementptr_bitcast.cpp
new file mode 100644
index 0000000..a57ff36
--- /dev/null
+++ b/utests/compiler_getelementptr_bitcast.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+void compiler_getelementptr_bitcast(void)
+{
+  const size_t n = 16;
+  float cpu_dst[16], cpu_src[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_getelementptr_bitcast");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+
+  // locals[0] must be 1 for this test to pass; the kernel's special usage requires it.
+  locals[0] = 1;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 8; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
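+    // Reference: reinterpret each float's storage as bytes and keep byte 2,
+    // which is what the kernel's bitcast + getelementptr access is expected to return.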
+    for (int32_t i = 0; i < (int32_t) n; ++i){
+      unsigned char* c = (unsigned char*)&cpu_src[i];
+      cpu_dst[i] = c[2];
+    }
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < (int32_t) n; ++i){
+      //printf("src:%f, gpu_dst: %f, cpu_dst: %f\n", cpu_src[i], ((float *)buf_data[1])[i], cpu_dst[i]);
+      OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
+    }
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_getelementptr_bitcast);
diff --git a/utests/compiler_global_constant.cpp b/utests/compiler_global_constant.cpp
new file mode 100644
index 0000000..88f9852
--- /dev/null
+++ b/utests/compiler_global_constant.cpp
@@ -0,0 +1,104 @@
+#include "utest_helper.hpp"
+
+void compiler_global_constant(void)
+{
+  const size_t n = 2048;
+  const uint32_t e = 34, r = 77;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_global_constant");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(uint32_t), &e);
+  OCL_SET_ARG(2, sizeof(uint32_t), &r);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  unsigned int m[3] = {71,72,73};
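+  // m mirrors the constant table the kernel is expected to add to e and r.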
+
+  // Check results
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i)
+//    printf("%d result %d reference %d\n", i, ((uint32_t *)buf_data[0])[i], m[i%3] + e + r);
+    OCL_ASSERT(((uint32_t *)buf_data[0])[i] == m[i%3] + e + r);
+  OCL_UNMAP_BUFFER(0);
+}
+
+void compiler_global_constant1(void)
+{
+  const size_t n = 32;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_global_constant", "compiler_global_constant1");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  uint32_t data1[] = {1, 4, 7};
+  uint32_t data2[]= {3, 7, 11};
+
+  // Check results
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i)
+//    printf("%d result %d reference %d\n", i, ((uint32_t *)buf_data[0])[i], data1[i%3] + data2[i%3]);
+    OCL_ASSERT(((uint32_t *)buf_data[0])[i] == data1[i%3] + data2[i%3]);
+  OCL_UNMAP_BUFFER(0);
+}
+
+void compiler_global_constant2(void)
+{
+  const size_t n = 32;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_global_constant", "compiler_global_constant2");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check results
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i)
+//    printf("%d result %d reference %d\n", i, ((uint32_t *)buf_data[0])[i], 6);
+    OCL_ASSERT(((uint32_t *)buf_data[0])[i] == 6);
+  OCL_UNMAP_BUFFER(0);
+}
+
+void compiler_global_constant3(void)
+{
+  const size_t n = 32;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_global_constant", "compiler_global_constant3");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  uint32_t data1[] = {3, 6, 9};
+  char data2[]= {'c', 'f', 'j'};
+  // Check results
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i)
+//    printf("%d result %d reference %d\n", i, ((uint32_t *)buf_data[0])[i], data1[i%3] + (int)data2[i%3]);
+    OCL_ASSERT(((uint32_t *)buf_data[0])[i] == data1[i%3] + (uint32_t)data2[i%3]);
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_global_constant, true);
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_global_constant1, true);
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_global_constant2, true);
+MAKE_UTEST_FROM_FUNCTION(compiler_global_constant3);
diff --git a/utests/compiler_global_constant_2.cpp b/utests/compiler_global_constant_2.cpp
new file mode 100644
index 0000000..cbe63ae
--- /dev/null
+++ b/utests/compiler_global_constant_2.cpp
@@ -0,0 +1,59 @@
+#include "utest_helper.hpp"
+
+void compiler_global_constant_2(void)
+{
+  const size_t n = 2048;
+  const uint32_t e = 34, r = 77;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_global_constant_2");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(uint32_t), &e);
+  OCL_SET_ARG(2, sizeof(uint32_t), &r);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  unsigned int m[3] = {0x15b,0x25b,0x35b};
+  unsigned int t[5] = {0x45b,0x55b,0x65b,0x75b,0x85b};
+
+  // Check results
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i)
+//    std::cout << ((uint32_t *)buf_data[0])[i] << std::endl;
+    OCL_ASSERT(((uint32_t *)buf_data[0])[i] == m[i%3] + t[i%5] + e + r);
+  OCL_UNMAP_BUFFER(0);
+}
+
+void compiler_global_constant_2_long(void)
+{
+  const size_t n = 2048;
+  const uint32_t e = 34, r = 77;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_global_constant_2", "compiler_global_constant_2_long");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(uint32_t), &e);
+  OCL_SET_ARG(2, sizeof(uint32_t), &r);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  uint64_t m[3] = {0x15b,0x25b,0xFFFFFFFFF};
+
+  // Check results
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i)
+//    std::cout << ((uint64_t *)buf_data[0])[i] << std::endl;
+    OCL_ASSERT(((uint64_t *)buf_data[0])[i] == m[i%3] + e + r);
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_global_constant_2);
+MAKE_UTEST_FROM_FUNCTION(compiler_global_constant_2_long);
diff --git a/utests/compiler_global_memory_barrier.cpp b/utests/compiler_global_memory_barrier.cpp
new file mode 100644
index 0000000..ea84e72
--- /dev/null
+++ b/utests/compiler_global_memory_barrier.cpp
@@ -0,0 +1,28 @@
+#include "utest_helper.hpp"
+
+static void compiler_global_memory_barrier(void)
+{
+  const size_t n = 16*1024;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_global_memory_barrier");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  // Run the kernel
+  globals[0] = n/2;
+  locals[0] = 256;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
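+  // Each work group is expected to write its block of locals[0] elements in reverse order.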
+  uint32_t *dst = (uint32_t*)buf_data[0];
+  for (uint32_t i = 0; i < n; i+=locals[0])
+    for (uint32_t j = 0; j < locals[0]; ++j)
+        OCL_ASSERT(dst[i+j] == locals[0] - 1 -j);
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_global_memory_barrier);
diff --git a/utests/compiler_group_size.cpp b/utests/compiler_group_size.cpp
new file mode 100644
index 0000000..8ad83f0
--- /dev/null
+++ b/utests/compiler_group_size.cpp
@@ -0,0 +1,141 @@
+#include "utest_helper.hpp"
+#include <string.h>
+
+struct xyz{
+  unsigned short b;
+  unsigned short e;
+  unsigned int o;
+};
+
+void compiler_group_size1(void)
+{
+  const size_t n = 7*32*17;
+
+  int group_size[] = {7, 17, 32};
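+  // Deliberately non-power-of-two group sizes; n (7*32*17) is a multiple of each, so the NDRange divides evenly.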
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_group_size");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  for(int i = 0; i < 3; i++) {
+    // Run the kernel
+    globals[0] = n;
+    locals[0] = group_size[i];
+    OCL_NDRANGE(1);
+    OCL_MAP_BUFFER(0);
+
+    // Check results
+    for (uint32_t i = 0; i < n; ++i)
+      OCL_ASSERT(((uint32_t*)buf_data[0])[i] == i);
+    OCL_UNMAP_BUFFER(0);
+  }
+}
+
+void compiler_group_size2(void)
+{
+  const uint32_t n = 4*17*8;
+  int size_x[] = {2, 4, 17};
+  int size_y[] = {2, 4, 4};
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_group_size");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  for(int i = 0; i < 3; i++) {
+    // Run the kernel
+    globals[0] = 4*17;
+    globals[1] = 8;
+    locals[0] = size_x[i];
+    locals[1] = size_y[i];
+    OCL_NDRANGE(2);
+    OCL_MAP_BUFFER(0);
+
+    // Check results
+    for (uint32_t i = 0; i < n; ++i)
+      OCL_ASSERT(((uint32_t*)buf_data[0])[i] == i);
+    OCL_UNMAP_BUFFER(0);
+  }
+}
+
+void compiler_group_size3(void)
+{
+  const uint32_t n = 4*17*8*4;
+  int size_x[] = {2, 4, 17};
+  int size_y[] = {2, 4, 4};
+  int size_z[] = {2, 1, 2};
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_group_size");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  for(int i = 0; i < 3; i++) {
+    // Run the kernel
+    globals[0] = 4*17;
+    globals[1] = 8;
+    globals[2] = 4;
+    locals[0] = size_x[i];
+    locals[1] = size_y[i];
+    locals[2] = size_z[i];
+    OCL_NDRANGE(3);
+    OCL_MAP_BUFFER(0);
+
+    // Check results
+    for (uint32_t i = 0; i < n; ++i)
+      OCL_ASSERT(((uint32_t*)buf_data[0])[i] == i);
+    OCL_UNMAP_BUFFER(0);
+  }
+}
+
+void compiler_group_size4(void)
+{
+  const size_t n = 16;
+  uint32_t color = 2;
+  uint32_t num = 1;
+  int group_size[] = {1};
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_group_size", "compiler_group_size4");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(struct xyz), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+
+  for(uint32_t i = 0; i < num; i++) {
+    // Run the kernel
+    OCL_MAP_BUFFER(0);
+    ((struct xyz*)buf_data[0])[0].b = 0;
+    ((struct xyz*)buf_data[0])[0].e = 2;
+    ((struct xyz*)buf_data[0])[0].o = 0;
+    OCL_UNMAP_BUFFER(0);
+
+    OCL_MAP_BUFFER(1);
+    memset(((uint32_t*)buf_data[1]), 0x0, sizeof(uint32_t)*n);
+    OCL_UNMAP_BUFFER(1);
+
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+    OCL_SET_ARG(2, sizeof(cl_int), &group_size[i]);
+    OCL_SET_ARG(3, sizeof(cl_int), &color);
+
+    globals[0] = group_size[i];
+    locals[0] = group_size[i];
+    OCL_NDRANGE(1);
+    OCL_MAP_BUFFER(1);
+
+    // Check results
+    for (uint32_t j = 0; j < n; ++j) {
+//      std::cout <<((uint32_t*)buf_data[1])[j] << "  ";
+      if (j >= i && j <= i + 2) {
+        OCL_ASSERT(((uint32_t*)buf_data[1])[j] == color);
+      } else {
+        OCL_ASSERT(((uint32_t*)buf_data[1])[j] == 0);
+      }
+    }
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_group_size1, true);
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_group_size2, true);
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_group_size3, true);
+MAKE_UTEST_FROM_FUNCTION(compiler_group_size4);
+
diff --git a/utests/compiler_hadd.cpp b/utests/compiler_hadd.cpp
new file mode 100644
index 0000000..9723702
--- /dev/null
+++ b/utests/compiler_hadd.cpp
@@ -0,0 +1,40 @@
+#include "utest_helper.hpp"
+
+void compiler_hadd(void)
+{
+  const int n = 32;
+  int src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_hadd");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; ++i) {
+    src1[i] = ((int*)buf_data[0])[i] = rand();
+    src2[i] = ((int*)buf_data[1])[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(2);
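+  // hadd reference: compute (src1 + src2) >> 1 in 64-bit so the intermediate sum cannot overflow.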
+  for (int i = 0; i < n; ++i) {
+    long long a = src1[i];
+    a += src2[i];
+    a >>= 1;
+    OCL_ASSERT(((int*)buf_data[2])[i] == (int)a);
+  }
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_hadd);
diff --git a/utests/compiler_if_else.cpp b/utests/compiler_if_else.cpp
new file mode 100644
index 0000000..e38b23f
--- /dev/null
+++ b/utests/compiler_if_else.cpp
@@ -0,0 +1,64 @@
+#include "utest_helper.hpp"
+
+static void compiler_if_else(void)
+{
+  const size_t n = 17;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_if_else");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+  for (uint32_t i = 0; i < n; ++i) ((uint32_t*)buf_data[0])[i] = 2;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // First control flow
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 16; ++i) {
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+    OCL_ASSERT(((int32_t*)buf_data[0])[i] == 1);
+  }
+
+  // Second control flow
+  for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -1;
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 16; ++i) {
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == -2);
+    OCL_ASSERT(((int32_t*)buf_data[0])[i] == 2);
+  }
+
+  // Third control flow
+  for (uint32_t i = 0; i < 4; ++i) ((int32_t*)buf_data[0])[i] = 2;
+  for (uint32_t i = 4; i < n; ++i) ((int32_t*)buf_data[0])[i] = -1;
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 3; ++i) {
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+    OCL_ASSERT(((int32_t*)buf_data[0])[i] == 1);
+  }
+  OCL_ASSERT(((int32_t*)buf_data[1])[3] == -1);
+  OCL_ASSERT(((int32_t*)buf_data[0])[3] == 1);
+  for (uint32_t i = 4; i < 16; ++i) {
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == -2);
+    OCL_ASSERT(((int32_t*)buf_data[0])[i] == 2);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_if_else);
+
diff --git a/utests/compiler_insert_to_constant.cpp b/utests/compiler_insert_to_constant.cpp
new file mode 100644
index 0000000..c4f737f
--- /dev/null
+++ b/utests/compiler_insert_to_constant.cpp
@@ -0,0 +1,30 @@
+#include "utest_helper.hpp"
+
+void compiler_insert_to_constant(void)
+{
+  const size_t n = 32;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_insert_to_constant");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t[4]), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  uint32_t *data = (uint32_t*) buf_data[0];
+  for (uint32_t i = 0; i < n; ++i) {
+    OCL_ASSERT(data[4*i+0] == 0);
+    OCL_ASSERT(data[4*i+1] == 1);
+    OCL_ASSERT(data[4*i+2] == i);
+    OCL_ASSERT(data[4*i+3] == 3);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_insert_to_constant);
+
+
diff --git a/utests/compiler_insert_vector.cpp b/utests/compiler_insert_vector.cpp
new file mode 100644
index 0000000..c7c239f
--- /dev/null
+++ b/utests/compiler_insert_vector.cpp
@@ -0,0 +1,18 @@
+#include "utest_helper.hpp"
+
+void compiler_insert_vector(void)
+{
+  const size_t n = 2048;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_insert_vector");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int) * 4, NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
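+  // Smoke test only: the kernel runs, but no results are checked.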
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_insert_vector);
diff --git a/utests/compiler_insn_selection_masked_min_max.cpp b/utests/compiler_insn_selection_masked_min_max.cpp
new file mode 100644
index 0000000..6a2edcc
--- /dev/null
+++ b/utests/compiler_insn_selection_masked_min_max.cpp
@@ -0,0 +1,42 @@
+#include "utest_helper.hpp"
+#include <algorithm>
+
+static void compiler_insn_selection_masked_min_max(void)
+{
+  const size_t n = 256;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_insn_selection_masked_min_max");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+  for (uint32_t i = 0; i < n; ++i)
+    ((float*)buf_data[0])[i] = float(i);
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  float *dst = (float*)buf_data[1];
+  float *src = (float*)buf_data[0];
+  for (uint32_t i = 0; i < n; ++i) {
+    float cpu_dst;
+    if (i % 16 > 5)
+      cpu_dst = std::max(src[i], src[7]);
+    else
+      cpu_dst = std::min(src[i], src[10]);
+    OCL_ASSERT(dst[i] == cpu_dst);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_insn_selection_masked_min_max)
+
+
diff --git a/utests/compiler_insn_selection_max.cpp b/utests/compiler_insn_selection_max.cpp
new file mode 100644
index 0000000..8552b9f
--- /dev/null
+++ b/utests/compiler_insn_selection_max.cpp
@@ -0,0 +1,37 @@
+#include "utest_helper.hpp"
+#include <algorithm>
+
+static void compiler_insn_selection_max(void)
+{
+  const size_t n = 8192 * 4;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_insn_selection_max");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+  for (uint32_t i = 0; i < n; ++i)
+    ((float*)buf_data[0])[i] = float(i);
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  float *dst = (float*)buf_data[1];
+  float *src = (float*)buf_data[0];
+  for (uint32_t i = 0; i < n; ++i) {
+    OCL_ASSERT(dst[i] == std::max(src[i], src[0]));
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_insn_selection_max)
+
+
diff --git a/utests/compiler_insn_selection_min.cpp b/utests/compiler_insn_selection_min.cpp
new file mode 100644
index 0000000..f5f9d18
--- /dev/null
+++ b/utests/compiler_insn_selection_min.cpp
@@ -0,0 +1,36 @@
+#include "utest_helper.hpp"
+#include <algorithm>
+
+static void compiler_insn_selection_min(void)
+{
+  const size_t n = 8192 * 4;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_insn_selection_min");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+  for (uint32_t i = 0; i < n; ++i)
+    ((float*)buf_data[0])[i] = float(i);
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  float *dst = (float*)buf_data[1];
+  float *src = (float*)buf_data[0];
+  for (uint32_t i = 0; i < n; ++i) {
+    OCL_ASSERT(dst[i] == std::min(src[i], src[0]));
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_insn_selection_min)
+
diff --git a/utests/compiler_integer_builtin.cpp b/utests/compiler_integer_builtin.cpp
new file mode 100644
index 0000000..98ad51b
--- /dev/null
+++ b/utests/compiler_integer_builtin.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_integer_builtin(void)
+{
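+  // Compile-only check, like compiler_geometric_builtin: no results are verified.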
+  OCL_CREATE_KERNEL("compiler_integer_builtin");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_integer_builtin);
+
diff --git a/utests/compiler_integer_division.cpp b/utests/compiler_integer_division.cpp
new file mode 100644
index 0000000..3898ae1
--- /dev/null
+++ b/utests/compiler_integer_division.cpp
@@ -0,0 +1,44 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst, int x) {
+  dst[global_id] = src[global_id] / x;
+}
+
+void compiler_integer_division(void)
+{
+  const size_t n = 16;
+  int cpu_dst[16], cpu_src[16];
+  const int x = 7;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_integer_division");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(x), &x);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 8; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((int32_t*)buf_data[0])[i] = rand() % 1000;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_dst, x);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < 11; ++i)
+      OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst[i]);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_integer_division);
diff --git a/utests/compiler_integer_remainder.cpp b/utests/compiler_integer_remainder.cpp
new file mode 100644
index 0000000..100f464
--- /dev/null
+++ b/utests/compiler_integer_remainder.cpp
@@ -0,0 +1,44 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst, int x) {
+  dst[global_id] = src[global_id] % x;
+}
+
+void compiler_integer_remainder(void)
+{
+  const size_t n = 16;
+  int cpu_dst[16], cpu_src[16];
+  const int x = 7;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_integer_remainder");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(x), &x);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 8; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((int32_t*)buf_data[0])[i] = rand() % 16;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_dst, x);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < 11; ++i)
+      OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst[i]);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_integer_remainder);
diff --git a/utests/compiler_load_bool_imm.cpp b/utests/compiler_load_bool_imm.cpp
new file mode 100644
index 0000000..d060daf
--- /dev/null
+++ b/utests/compiler_load_bool_imm.cpp
@@ -0,0 +1,29 @@
+#include "utest_helper.hpp"
+
+static void compiler_load_bool_imm(void)
+{
+  const size_t n = 1024;
+  const size_t local_size = 16;
+  const int copiesPerWorkItem = 5;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_load_bool_imm");
+  OCL_CREATE_BUFFER(buf[0], 0, n * copiesPerWorkItem * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, local_size*copiesPerWorkItem*sizeof(int), NULL); // local scratch: local_size * copiesPerWorkItem ints
+  OCL_SET_ARG(2, sizeof(int), &copiesPerWorkItem);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = local_size;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  int *dst = (int*)buf_data[0];
+  for (uint32_t i = 0; i < n * copiesPerWorkItem; i++)
+    OCL_ASSERT(dst[i] == copiesPerWorkItem);
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_load_bool_imm);
diff --git a/utests/compiler_local_memory_barrier.cpp b/utests/compiler_local_memory_barrier.cpp
new file mode 100644
index 0000000..6c9c98e
--- /dev/null
+++ b/utests/compiler_local_memory_barrier.cpp
@@ -0,0 +1,46 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_helper.hpp"
+
+static void compiler_local_memory_barrier(void)
+{
+  const size_t n = 1024;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_local_memory_barrier");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, 64, NULL); // 16 x int
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
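+  // Each 16-wide work group is expected to write its block in reverse order through local memory.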
+  uint32_t *dst = (uint32_t*)buf_data[0];
+  for (uint32_t i = 0; i < n; i+=16)
+  for (uint32_t j = 0; j < 16; ++j)
+    OCL_ASSERT(dst[i+j] == 15-j);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_local_memory_barrier);
+
diff --git a/utests/compiler_local_memory_barrier_2.cpp b/utests/compiler_local_memory_barrier_2.cpp
new file mode 100644
index 0000000..4fa090b
--- /dev/null
+++ b/utests/compiler_local_memory_barrier_2.cpp
@@ -0,0 +1,29 @@
+#include "utest_helper.hpp"
+
+static void compiler_local_memory_barrier_2(void)
+{
+  const size_t n = 16*1024;
+
+  globals[0] = n/2;
+  locals[0] = 256;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_local_memory_barrier_2");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  //OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, locals[0] * 2 * sizeof(uint32_t), NULL);
+
+  // Run the kernel
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  uint32_t *dst = (uint32_t*)buf_data[0];
+  for (uint32_t i = 0; i < n; i+=locals[0])
+    for (uint32_t j = 0; j < locals[0]; ++j)
+        OCL_ASSERT(dst[i+j] == locals[0] - 1 -j);
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_local_memory_barrier_2);
diff --git a/utests/compiler_local_memory_barrier_wg64.cpp b/utests/compiler_local_memory_barrier_wg64.cpp
new file mode 100644
index 0000000..0cb69f5
--- /dev/null
+++ b/utests/compiler_local_memory_barrier_wg64.cpp
@@ -0,0 +1,46 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_helper.hpp"
+
+static void compiler_local_memory_barrier_wg64(void)
+{
+  const size_t n = 1024;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_local_memory_barrier_wg64");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, 256, NULL); // 64 x int
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 64;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  uint32_t *dst = (uint32_t*)buf_data[0];
+  for (uint32_t i = 0; i < n; i+=64)
+  for (uint32_t j = 0; j < 64; ++j)
+    OCL_ASSERT(dst[i+j] == 63-j);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_local_memory_barrier_wg64);
+
diff --git a/utests/compiler_local_memory_two_ptr.cpp b/utests/compiler_local_memory_two_ptr.cpp
new file mode 100644
index 0000000..fde5533
--- /dev/null
+++ b/utests/compiler_local_memory_two_ptr.cpp
@@ -0,0 +1,50 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_helper.hpp"
+
+static void compiler_local_memory_two_ptr(void)
+{
+  const size_t n = 1024;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_local_memory_two_ptr");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, 64, NULL); // 16 x int
+  OCL_SET_ARG(2, 64, NULL); // 16 x int
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
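+  // Both local buffers are expected to hold the reversed lane id, so dst = group_base + 2 * (15 - tid).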
+  int32_t *dst = (int32_t*)buf_data[0];
+  for (int32_t i = 0; i < (int) n; i+=16)
+  for (int32_t j = 0; j < 16; ++j) {
+    const int gid = i + j;
+    const int tid = j;
+    OCL_ASSERT(dst[i+j] == (gid&~0xf) + 15-tid + 15-tid);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_local_memory_two_ptr);
+
diff --git a/utests/compiler_local_slm.cpp b/utests/compiler_local_slm.cpp
new file mode 100644
index 0000000..3a0c1ed
--- /dev/null
+++ b/utests/compiler_local_slm.cpp
@@ -0,0 +1,33 @@
+#include "utest_helper.hpp"
+
+void compiler_local_slm(void)
+{
+  const size_t n = 32;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_local_slm", "compiler_local_slm");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == (i%16 + 2 + 1 + i/16));
+  OCL_UNMAP_BUFFER(0);
+}
+
+void compiler_local_slm1(void)
+{
+  const size_t n = 2;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_local_slm", "compiler_local_slm1");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  globals[0] = 1;
+  locals[0] = 1;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  uint64_t * ptr = (uint64_t*)buf_data[0];
+  OCL_ASSERT((ptr[1] - ptr[0]) == 4);
+  OCL_UNMAP_BUFFER(0);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_local_slm);
+MAKE_UTEST_FROM_FUNCTION(compiler_local_slm1);
diff --git a/utests/compiler_long.cpp b/utests/compiler_long.cpp
new file mode 100644
index 0000000..b525694
--- /dev/null
+++ b/utests/compiler_long.cpp
@@ -0,0 +1,60 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long(void)
+{
+  const size_t n = 16;
+  int64_t src1[n], src2[n];
+
+  int64_t zero = 0;
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_long");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_long), &zero);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Inputs: a few signed-overflow and boundary corner cases, then random 64-bit values
+  src1[0] = -1L,                  src2[0] = -1L;
+  src1[1] = 0x8000000000000000UL, src2[1] = 0x8000000000000000UL;
+  src1[2] = 0x7FFFFFFFFFFFFFFFL,  src2[2] = 1L;
+  src1[3] = 0xFFFFFFFEL,          src2[3] = 1L;
+  src1[4] = 0x7FFFFFFFL,          src2[4] = 0x80000000L;
+  src1[5] = 0,                    src2[5] = 0;
+  src1[6] = 0,                    src2[6] = 1;
+  src1[7] = -2L,                  src2[7] = -1L;
+  src1[8] = 0,                    src2[8] = 0x8000000000000000UL;
+  for (int32_t i = 9; i < (int32_t) n; ++i) {
+    src1[i] = ((int64_t)rand() << 32) + rand();
+    src2[i] = ((int64_t)rand() << 32) + rand();
+  }
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src1, sizeof(src1));
+  memcpy(buf_data[1], src2, sizeof(src2));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%lx\n", ((int64_t *)buf_data[2])[i]);
+    if (i < 5)
+      OCL_ASSERT(src1[i] + src2[i] == ((int64_t *)buf_data[2])[i]);
+    if (i > 5)
+      OCL_ASSERT(src1[i] - src2[i] == ((int64_t *)buf_data[2])[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long);
diff --git a/utests/compiler_long_2.cpp b/utests/compiler_long_2.cpp
new file mode 100644
index 0000000..6c5da4b
--- /dev/null
+++ b/utests/compiler_long_2.cpp
@@ -0,0 +1,51 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_2(void)
+{
+  const size_t n = 16;
+  int64_t src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_long_2");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    src1[i] = ((int64_t)rand() << 32) + rand();
+    src2[i] = ((int64_t)rand() << 32) + rand();
+  }
+  src1[4] = 1;
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src1, sizeof(src1));
+  memcpy(buf_data[1], src2, sizeof(src2));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(2);
+  int64_t *dest = ((int64_t *)buf_data[2]);
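+  // dest[0] and dest[4] are checked against fixed constants the kernel is expected to produce;
+  // lanes 1..3 check bitwise and, or and xor of the two sources.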
+  //for (int32_t i = 0; i < (int32_t) n; ++i)
+  //  printf("%lx\n", dest[i]);
+  OCL_ASSERT(0xFEDCBA9876543210UL == (uint64_t)dest[0]);
+  OCL_ASSERT((src1[1] & src2[1]) == dest[1]);
+  OCL_ASSERT((src1[2] | src2[2]) == dest[2]);
+  OCL_ASSERT((src1[3] ^ src2[3]) == dest[3]);
+  OCL_ASSERT(0x1122334455667788L == dest[4]);
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_2);
diff --git a/utests/compiler_long_asr.cpp b/utests/compiler_long_asr.cpp
new file mode 100644
index 0000000..0a70a23
--- /dev/null
+++ b/utests/compiler_long_asr.cpp
@@ -0,0 +1,41 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_asr(void)
+{
+  const size_t n = 64;
+  int64_t src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_long_asr");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Fill the source with the sign bit set; the checks below expect an arithmetic shift right by i for i > 7
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    src[i] = (int64_t)1 << 63;
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], src, sizeof(src));
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  int64_t *dest = ((int64_t *)buf_data[1]);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    if (i > 7)
+      OCL_ASSERT(dest[i] == src[i] >> i);
+    else
+      OCL_ASSERT(dest[i] == src[i] + 1);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_asr);
diff --git a/utests/compiler_long_cmp.cpp b/utests/compiler_long_cmp.cpp
new file mode 100644
index 0000000..35d4c4f
--- /dev/null
+++ b/utests/compiler_long_cmp.cpp
@@ -0,0 +1,122 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_cmp(void)
+{
+  const size_t n = 16;
+  int64_t src1[n], src2[n];
+
+  src1[0] = (int64_t)1 << 63, src2[0] = 0x7FFFFFFFFFFFFFFFll;
+  src1[1] = (int64_t)1 << 63, src2[1] = ((int64_t)1 << 63) | 1;
+  src1[2] = -1ll, src2[2] = 0;
+  src1[3] = ((int64_t)123 << 32) | 0x7FFFFFFF, src2[3] = ((int64_t)123 << 32) | 0x80000000;
+  src1[4] = 0x7FFFFFFFFFFFFFFFll, src2[4] = (int64_t)1 << 63;
+  src1[5] = ((int64_t)1 << 63) | 1, src2[5] = (int64_t)1 << 63;
+  src1[6] = 0, src2[6] = -1ll;
+  src1[7] = ((int64_t)123 << 32) | 0x80000000, src2[7] = ((int64_t)123 << 32) | 0x7FFFFFFF;
+  for(size_t i=8; i<n; i++) {
+    src1[i] = i;
+    src2[i] = i;
+  }
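+  // The pairs above cover sign-bit and 32-bit boundary cases, followed by equal values,
+  // so each comparison kernel below sees <, > and == inputs.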
+
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src1, sizeof(src1));
+  memcpy(buf_data[1], src2, sizeof(src2));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_l");
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    int64_t *dest = (int64_t *)buf_data[2];
+    int64_t x = (src1[i] < src2[i]) ? 3 : 4;
+    OCL_ASSERT(x == dest[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+  OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
+
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_le");
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    int64_t *dest = (int64_t *)buf_data[2];
+    int64_t x = (src1[i] <= src2[i]) ? 3 : 4;
+    OCL_ASSERT(x == dest[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+  OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
+
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_g");
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    int64_t *dest = (int64_t *)buf_data[2];
+    int64_t x = (src1[i] > src2[i]) ? 3 : 4;
+    OCL_ASSERT(x == dest[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+  OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
+
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_ge");
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    int64_t *dest = (int64_t *)buf_data[2];
+    int64_t x = (src1[i] >= src2[i]) ? 3 : 4;
+    OCL_ASSERT(x == dest[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+  OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
+
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_eq");
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    int64_t *dest = (int64_t *)buf_data[2];
+    int64_t x = (src1[i] == src2[i]) ? 3 : 4;
+    OCL_ASSERT(x == dest[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+  OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
+
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_cmp", "compiler_long_cmp_neq");
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    int64_t *dest = (int64_t *)buf_data[2];
+    int64_t x = (src1[i] != src2[i]) ? 3 : 4;
+    OCL_ASSERT(x == dest[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_cmp);
diff --git a/utests/compiler_long_convert.cpp b/utests/compiler_long_convert.cpp
new file mode 100644
index 0000000..ada6926
--- /dev/null
+++ b/utests/compiler_long_convert.cpp
@@ -0,0 +1,158 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+// convert shorter integer to 64-bit integer
+void compiler_long_convert(void)
+{
+  const size_t n = 16;
+  char src1[n];
+  short src2[n];
+  int src3[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_long_convert");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(char), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[4], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[5], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  OCL_SET_ARG(4, sizeof(cl_mem), &buf[4]);
+  OCL_SET_ARG(5, sizeof(cl_mem), &buf[5]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Fill the char, short and int sources with -i to check sign extension to 64 bits
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    src1[i] = -i;
+    src2[i] = -i;
+    src3[i] = -i;
+  }
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  memcpy(buf_data[0], src1, sizeof(src1));
+  memcpy(buf_data[1], src2, sizeof(src2));
+  memcpy(buf_data[2], src3, sizeof(src3));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(3);
+  OCL_MAP_BUFFER(4);
+  OCL_MAP_BUFFER(5);
+  int64_t *dst1 = ((int64_t *)buf_data[3]);
+  int64_t *dst2 = ((int64_t *)buf_data[4]);
+  int64_t *dst3 = ((int64_t *)buf_data[5]);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%lx %lx %lx\n", dst1[i], dst2[i], dst3[i]);
+    OCL_ASSERT(dst1[i] == -(int64_t)i);
+    OCL_ASSERT(dst2[i] == -(int64_t)i);
+    OCL_ASSERT(dst3[i] == -(int64_t)i);
+  }
+  OCL_UNMAP_BUFFER(3);
+  OCL_UNMAP_BUFFER(4);
+  OCL_UNMAP_BUFFER(5);
+}
+
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_long_convert, true);
+
+// convert 64-bit integer to shorter integer
+void compiler_long_convert_2(void)
+{
+  const size_t n = 16;
+  int64_t src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_convert", "compiler_long_convert_2");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(char), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Fill the 64-bit source with -i before narrowing to char, short and int
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    src[i] = -i;
+  }
+  OCL_MAP_BUFFER(3);
+  memcpy(buf_data[3], src, sizeof(src));
+  OCL_UNMAP_BUFFER(3);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  char *dst1 = ((char *)buf_data[0]);
+  short *dst2 = ((short *)buf_data[1]);
+  int *dst3 = ((int *)buf_data[2]);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%x %x %x\n", dst1[i], dst2[i], dst3[i]);
+    OCL_ASSERT(dst1[i] == -i);
+    OCL_ASSERT(dst2[i] == -i);
+    OCL_ASSERT(dst3[i] == -i);
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_long_convert_2, true);
+
+// convert 64-bit integer to 32-bit float
+void compiler_long_convert_to_float(void)
+{
+  const size_t n = 16;
+  int64_t src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_convert", "compiler_long_convert_to_float");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Fill the 64-bit source with -i before converting to float
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    src[i] = -(int64_t)i;
+  }
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[1], src, sizeof(src));
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  float *dst = ((float *)buf_data[0]);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%f\n", dst[i]);
+    OCL_ASSERT(dst[i] == src[i]);
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_convert_to_float);
diff --git a/utests/compiler_long_mult.cpp b/utests/compiler_long_mult.cpp
new file mode 100644
index 0000000..06070f7
--- /dev/null
+++ b/utests/compiler_long_mult.cpp
@@ -0,0 +1,49 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_mult(void)
+{
+  const size_t n = 16;
+  int64_t src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_long_mult");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Run random tests
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    src1[i] = 0x77665544FFEEDDCCLL;
+    src2[i] = ((int64_t)rand() << 32) + rand();
+  }
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src1, sizeof(src1));
+  memcpy(buf_data[1], src2, sizeof(src2));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(2);
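+  // The first three lanes are expected to add the operands; the rest check the full 64-bit multiply.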
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%lx\n", ((int64_t *)buf_data[2])[i]);
+    if (i < 3)
+      OCL_ASSERT(src1[i] + src2[i] == ((int64_t *)buf_data[2])[i]);
+    else
+      OCL_ASSERT(src1[i] * src2[i] == ((int64_t *)buf_data[2])[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_mult);
diff --git a/utests/compiler_long_shl.cpp b/utests/compiler_long_shl.cpp
new file mode 100644
index 0000000..c8e4624
--- /dev/null
+++ b/utests/compiler_long_shl.cpp
@@ -0,0 +1,41 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_shl(void)
+{
+  const size_t n = 64;
+  int64_t src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_long_shl");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Fill the source with 1; the checks below expect 1 << i for i > 7 and src + 1 otherwise
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    src[i] = 1;
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], src, sizeof(src));
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  int64_t *dest = ((int64_t *)buf_data[1]);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    if (i > 7)
+      OCL_ASSERT(dest[i] == ((int64_t)1) << i);
+    else
+      OCL_ASSERT(dest[i] == src[i] + 1);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_shl);
diff --git a/utests/compiler_long_shr.cpp b/utests/compiler_long_shr.cpp
new file mode 100644
index 0000000..e9fea6a
--- /dev/null
+++ b/utests/compiler_long_shr.cpp
@@ -0,0 +1,41 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_long_shr(void)
+{
+  const size_t n = 64;
+  uint64_t src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_long_shr");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  // Fill the source with the top bit set; the checks below expect a logical shift right by i for i > 7
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    src[i] = (uint64_t)1 << 63;
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], src, sizeof(src));
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  uint64_t *dest = ((uint64_t *)buf_data[1]);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    if (i > 7)
+      OCL_ASSERT(dest[i] == src[i] >> i);
+    else
+      OCL_ASSERT(dest[i] == src[i] + 1);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_long_shr);
diff --git a/utests/compiler_lower_return0.cpp b/utests/compiler_lower_return0.cpp
new file mode 100644
index 0000000..0e9dbd0
--- /dev/null
+++ b/utests/compiler_lower_return0.cpp
@@ -0,0 +1,54 @@
+#include "utest_helper.hpp"
+
+static void compiler_lower_return0(void)
+{
+  const size_t n = 32;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_lower_return0");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+  for (uint32_t i = 0; i < n; ++i) ((uint32_t*)buf_data[0])[i] = 2;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // First control flow
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < 32; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == i);
+
+  // Second control flow
+  for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 32; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == -2);
+
+  // Third control flow
+  for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < 8; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == i);
+  for (int32_t i = 8; i < 32; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == -2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_lower_return0);
+
+
diff --git a/utests/compiler_lower_return1.cpp b/utests/compiler_lower_return1.cpp
new file mode 100644
index 0000000..b4f1fe3
--- /dev/null
+++ b/utests/compiler_lower_return1.cpp
@@ -0,0 +1,47 @@
+#include "utest_helper.hpp"
+
+static void compiler_lower_return1(void)
+{
+  const size_t n = 32;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_lower_return1");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+  for (uint32_t i = 0; i < n; ++i) ((uint32_t*)buf_data[0])[i] = 2;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // First control flow
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < 11; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == i);
+  for (int32_t i = 11; i < 16; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+
+  // Second control flow
+  for (uint32_t i = 0; i < 4; ++i) ((int32_t*)buf_data[0])[i] = -2;
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < 4; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == -2);
+  for (int32_t i = 4; i < 11; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == i);
+  for (int32_t i = 11; i < 16; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_lower_return1);
+
diff --git a/utests/compiler_lower_return2.cpp b/utests/compiler_lower_return2.cpp
new file mode 100644
index 0000000..1e34036
--- /dev/null
+++ b/utests/compiler_lower_return2.cpp
@@ -0,0 +1,48 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst) {
+  const int id = global_id;
+  dst[id] = id;
+  while (dst[id] > src[id]) {
+    if (dst[id] > 10) return;
+    dst[id]--;
+  }
+  dst[id] += 2;
+}
+
+static void compiler_lower_return2(void)
+{
+  const size_t n = 16;
+  int cpu_dst[16], cpu_src[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_lower_return2");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  for (uint32_t pass = 0; pass < 8; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((int32_t*)buf_data[0])[i] = rand() % 16;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < 11; ++i)
+      OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst[i]);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_lower_return2);
+
diff --git a/utests/compiler_mad24.cpp b/utests/compiler_mad24.cpp
new file mode 100644
index 0000000..a3890a1
--- /dev/null
+++ b/utests/compiler_mad24.cpp
@@ -0,0 +1,41 @@
+#include "utest_helper.hpp"
+
+void compiler_mad24(void)
+{
+  const int n = 32;
+  int src1[n], src2[n], src3[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_mad24");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i) {
+    src1[i] = ((int*)buf_data[0])[i] = rand();
+    src2[i] = ((int*)buf_data[1])[i] = rand();
+    src3[i] = ((int*)buf_data[2])[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(3);
+  for (int i = 0; i < n; ++i)
+    OCL_ASSERT(((int*)buf_data[3])[i] == ((src1[i] << 8) >> 8) * ((src2[i] << 8) >> 8) + src3[i]);
+  OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mad24);
diff --git a/utests/compiler_mad_hi.cpp b/utests/compiler_mad_hi.cpp
new file mode 100644
index 0000000..6f66e7f
--- /dev/null
+++ b/utests/compiler_mad_hi.cpp
@@ -0,0 +1,46 @@
+#include "utest_helper.hpp"
+
+void compiler_mad_hi(void)
+{
+  const int n = 32;
+  int src1[n], src2[n], src3[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_mad_hi");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i) {
+    src1[i] = ((int*)buf_data[0])[i] = rand();
+    src2[i] = ((int*)buf_data[1])[i] = rand();
+    src3[i] = ((int*)buf_data[2])[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(3);
+  for (int i = 0; i < n; ++i) {
+    long long a = src1[i];
+    a *= src2[i];
+    a >>= 32;
+    a += src3[i];
+    OCL_ASSERT(((int*)buf_data[3])[i] == (int)a);
+  }
+  OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mad_hi);
diff --git a/utests/compiler_mandelbrot.cpp b/utests/compiler_mandelbrot.cpp
new file mode 100644
index 0000000..7758dae
--- /dev/null
+++ b/utests/compiler_mandelbrot.cpp
@@ -0,0 +1,48 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_helper.hpp"
+
+static int *dst = NULL;
+static const size_t w = 256;
+static const size_t h = 256;
+
+static void compiler_mandelbrot(void)
+{
+  const size_t global[2] = {w, h};
+  const size_t local[2] = {16, 1};
+  const size_t sz = w * h * sizeof(char[4]);
+
+  OCL_CREATE_KERNEL("compiler_mandelbrot");
+
+  OCL_CREATE_BUFFER(buf[0], 0, sz, NULL);
+  OCL_CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &buf[0]);
+  OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, 2, NULL, global, local, 0, NULL, NULL);
+  OCL_MAP_BUFFER(0);
+  dst = (int *) buf_data[0];
+
+  /* Save the image (for debug purpose) */
+  cl_write_bmp(dst, w, h, "compiler_mandelbrot.bmp");
+
+  /* Compare with the golden image */
+  OCL_CHECK_IMAGE(dst, w, h, "compiler_mandelbrot_ref.bmp");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mandelbrot);
+
diff --git a/utests/compiler_mandelbrot_alternate.cpp b/utests/compiler_mandelbrot_alternate.cpp
new file mode 100644
index 0000000..2e5d59f
--- /dev/null
+++ b/utests/compiler_mandelbrot_alternate.cpp
@@ -0,0 +1,54 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_helper.hpp"
+
+static int *dst = NULL;
+static const size_t w = 256;
+static const size_t h = 256;
+static const float criterium = 4.f;
+
+static void compiler_mandelbrot_alternate(void)
+{
+  const size_t global[2] = {w, h};
+  const size_t local[2] = {16, 1};
+  const size_t sz = w * h * sizeof(char[4]);
+  const float rcpWidth = 1.f / float(w);
+  const float rcpHeight = 1.f / float(h);
+
+  OCL_CREATE_KERNEL("compiler_mandelbrot_alternate");
+
+  OCL_CREATE_BUFFER(buf[0], 0, sz, NULL);
+  OCL_CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &buf[0]);
+  OCL_CALL (clSetKernelArg, kernel, 1, sizeof(float), &rcpWidth);
+  OCL_CALL (clSetKernelArg, kernel, 2, sizeof(float), &rcpHeight);
+  OCL_CALL (clSetKernelArg, kernel, 3, sizeof(float), &criterium);
+  OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, 2, NULL, global, local, 0, NULL, NULL);
+  OCL_MAP_BUFFER(0);
+  dst = (int *) buf_data[0];
+
+  /* Save the image (for debug purpose) */
+  cl_write_bmp(dst, w, h, "compiler_mandelbrot_alternate.bmp");
+
+  /* Compare with the golden image */
+  OCL_CHECK_IMAGE(dst, w, h, "compiler_mandelbrot_alternate_ref.bmp");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mandelbrot_alternate);
+
diff --git a/utests/compiler_math.cpp b/utests/compiler_math.cpp
new file mode 100644
index 0000000..e0c4487
--- /dev/null
+++ b/utests/compiler_math.cpp
@@ -0,0 +1,89 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+static void cpu_compiler_math(float *dst, float *src, int i)
+{
+  const float x = src[i];
+  const float PI = 3.141592653589793f;
+  switch (i) {
+    case 0: dst[i] = cosf(x); break;
+    case 1: dst[i] = sinf(x); break;
+    case 2: dst[i] = log2f(x); break;
+    case 3: dst[i] = sqrtf(x); break;
+    case 4: dst[i] = 1.f/ sqrtf(x); break;
+    case 5: dst[i] = 1.f / x; break;
+    case 6: dst[i] = tanf(x); break;
+    case 7: dst[i] = powf(x, 0.3333333333333333333f); break;
+    case 8: dst[i] = ceilf(x); break;
+    case 9: dst[i] = cosf(PI * x); break;
+    case 10: dst[i] = powf(2, x); break;
+    case 11: dst[i] = powf(10, x); break;
+    case 12: dst[i] = expf(x) - 1; break;
+    case 13: dst[i] = logf(x + 1); break;
+    case 14: dst[i] = floorf(log2f(x)); break;
+    case 15: dst[i] = sinf(PI * x); break;
+    case 16: dst[i] = tanf(PI * x); break;
+    case 17: dst[i] = 2 * roundf(x / 2); break;
+    case 18: dst[i] = sinhf(x); break;
+    case 19: dst[i] = coshf(x); break;
+    case 20: dst[i] = tanhf(x); break;
+    case 21: dst[i] = asinhf(x); break;
+    case 22: dst[i] = acoshf(x); break;
+    case 23: dst[i] = atanhf(x); break;
+    case 24: dst[i] = asinf(x); break;
+    case 25: dst[i] = acosf(x); break;
+    case 26: dst[i] = atanf(x); break;
+    case 27: dst[i] = asinf(x) / PI; break;
+    case 28: dst[i] = acosf(x) / PI; break;
+    case 29: dst[i] = atanf(x) / PI; break;
+    case 30: dst[i] = erff(x); break;
+    case 31: dst[i] = nanf(""); break;
+    default: dst[i] = 1.f; break;
+  };
+}
+
+static void compiler_math(void)
+{
+  const size_t n = 32;
+  float cpu_dst[32], cpu_src[32];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_math");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  int j;
+  for(j = 0; j < 1000; j ++) {
+    OCL_MAP_BUFFER(1);
+    for (uint32_t i = 0; i < 32; ++i)
+      cpu_src[i] = ((float*)buf_data[1])[i] = .1f * (rand() & 15);
+    OCL_UNMAP_BUFFER(1);
+    OCL_NDRANGE(1);
+
+    OCL_MAP_BUFFER(0);
+    OCL_MAP_BUFFER(1);
+    for (int i = 0; i < 16; ++i)
+      cpu_compiler_math(cpu_dst, cpu_src, i);
+    for (int i = 0; i < 16; ++i) {
+      const float cpu = cpu_dst[i];
+      const float gpu = ((float*)buf_data[0])[i];
+      if (isinf(cpu))
+        OCL_ASSERT(isinf(gpu));
+      else if (isnan(cpu))
+        OCL_ASSERT(isnan(gpu));
+      else
+        OCL_ASSERT(fabs(gpu-cpu) < 1e-3f);
+    }
+    OCL_UNMAP_BUFFER(0);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_math)
+
+
diff --git a/utests/compiler_math_2op.cpp b/utests/compiler_math_2op.cpp
new file mode 100644
index 0000000..454967d
--- /dev/null
+++ b/utests/compiler_math_2op.cpp
@@ -0,0 +1,80 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+static float rnde(float v) {
+  if(v - floorf(v) > 0.5f)
+    return floorf(v) + 1;
+  if(v - floorf(v) < 0.5f)
+    return floorf(v);
+  if((int)(floorf(v)) & 1)
+    return floorf(v) + 1;
+  return floorf(v);
+}
+
+static void cpu_compiler_math(float *dst, float *src1, float *src2, int i)
+{
+  const float x = src1[i], y = src2[i];
+  switch (i) {
+    case 0: dst[i] = x / y; break;
+    case 1: dst[i] = x > y ? x - y : 0; break;
+    case 2: dst[i] = fminf(x - floorf(x), 0x1.FFFFFep-1F); break;
+    case 3: dst[i] = sqrtf(x*x + y*y); break;
+    case 4: dst[i] = x * powf(2, (int)y); break;
+    case 5: dst[i] = powf(x, (int)y); break;
+    case 6: dst[i] = x - rnde(x/y)*y; break;
+    case 7: dst[i] = powf(x, 1.f/(int)(y+1)); break;
+    case 8: dst[i] = x * y < 0 ? -x : x; break;
+    case 9: dst[i] = fabsf(x) > fabsf(y) ? x : fabsf(y) > fabsf(x) ? y : fmaxf(x, y); break;
+    case 10: dst[i] = fabsf(x) < fabsf(y) ? x : fabsf(y) < fabsf(x) ? y : fminf(x, y); break;
+    default: dst[i] = 1.f; break;
+  };
+}
+
+static void compiler_math_2op(void)
+{
+  const size_t n = 32;
+  float cpu_dst[32], cpu_src1[32], cpu_src2[32];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_math_2op");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  int j;
+  for(j = 0; j < 1000; j ++) {
+    OCL_MAP_BUFFER(1);
+    OCL_MAP_BUFFER(2);
+    for (uint32_t i = 0; i < 32; ++i) {
+      cpu_src1[i] = ((float*)buf_data[1])[i] = .1f * (rand() & 15);
+      cpu_src2[i] = ((float*)buf_data[2])[i] = .1f * (rand() & 15);
+    }
+    OCL_UNMAP_BUFFER(1);
+    OCL_UNMAP_BUFFER(2);
+    OCL_NDRANGE(1);
+
+    for (int i = 0; i < 16; ++i)
+      cpu_compiler_math(cpu_dst, cpu_src1, cpu_src2, i);
+    OCL_MAP_BUFFER(0);
+    for (int i = 0; i < 16; ++i) {
+      const float cpu = cpu_dst[i];
+      const float gpu = ((float*)buf_data[0])[i];
+      if (isinf(cpu))
+        OCL_ASSERT(isinf(gpu));
+      else if (isnan(cpu))
+        OCL_ASSERT(isnan(gpu));
+      else {
+        OCL_ASSERT(fabs(gpu-cpu) < 1e-3f);
+      }
+    }
+    OCL_UNMAP_BUFFER(0);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_math_2op)
diff --git a/utests/compiler_math_3op.cpp b/utests/compiler_math_3op.cpp
new file mode 100644
index 0000000..a382b0a
--- /dev/null
+++ b/utests/compiler_math_3op.cpp
@@ -0,0 +1,64 @@
+#include "utest_helper.hpp"
+#include <cmath>
+#include <algorithm>
+
+static void cpu_compiler_math(float *dst, float *src1, float *src2, float *src3, int i)
+{
+  const float x = src1[i], y = src2[i], z = src3[i];
+  switch (i) {
+    case 0: dst[i] = x * y + z; break;
+    case 1: dst[i] = x * y + z; break;
+    default: dst[i] = 1.f; break;
+  };
+}
+
+static void compiler_math_3op(void)
+{
+  const size_t n = 32;
+  float cpu_dst[32], cpu_src1[32], cpu_src2[32], cpu_src3[32];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_math_3op");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  for (int j = 0; j < 1000; j ++) {
+    OCL_MAP_BUFFER(1);
+    OCL_MAP_BUFFER(2);
+    OCL_MAP_BUFFER(3);
+    for (uint32_t i = 0; i < 32; ++i) {
+      cpu_src1[i] = ((float*)buf_data[1])[i] = .1f * (rand() & 15);
+      cpu_src2[i] = ((float*)buf_data[2])[i] = .1f * (rand() & 15);
+      cpu_src3[i] = ((float*)buf_data[3])[i] = .1f * (rand() & 15);
+    }
+    OCL_UNMAP_BUFFER(1);
+    OCL_UNMAP_BUFFER(2);
+    OCL_UNMAP_BUFFER(3);
+    OCL_NDRANGE(1);
+
+    for (int i = 0; i < 16; ++i)
+      cpu_compiler_math(cpu_dst, cpu_src1, cpu_src2, cpu_src3, i);
+    OCL_MAP_BUFFER(0);
+    for (int i = 0; i < 16; ++i) {
+      const float cpu = cpu_dst[i];
+      const float gpu = ((float*)buf_data[0])[i];
+      if (isinf(cpu))
+        OCL_ASSERT(isinf(gpu));
+      else if (isnan(cpu))
+        OCL_ASSERT(isnan(gpu));
+      else
+        OCL_ASSERT(fabs(gpu-cpu) < 1e-3f);
+    }
+    OCL_UNMAP_BUFFER(0);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_math_3op)
diff --git a/utests/compiler_math_builtin.cpp b/utests/compiler_math_builtin.cpp
new file mode 100644
index 0000000..0577e04
--- /dev/null
+++ b/utests/compiler_math_builtin.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_math_builtin(void)
+{
+  OCL_CREATE_KERNEL("compiler_math_builtin");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_math_builtin);
+
diff --git a/utests/compiler_math_constants.cpp b/utests/compiler_math_constants.cpp
new file mode 100644
index 0000000..5ec97c9
--- /dev/null
+++ b/utests/compiler_math_constants.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_math_constants(void)
+{
+  OCL_CREATE_KERNEL("compiler_math_constants");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_math_constants);
+
diff --git a/utests/compiler_mem_fence.cpp b/utests/compiler_mem_fence.cpp
new file mode 100644
index 0000000..ad7e2f6
--- /dev/null
+++ b/utests/compiler_mem_fence.cpp
@@ -0,0 +1,9 @@
+/* test OpenCL 1.1 Synchronization, explicit memory fence (section 6.11.9, 6.11.10) */
+#include "utest_helper.hpp"
+
+void compiler_mem_fence(void)
+{
+  OCL_CREATE_KERNEL("compiler_mem_fence");
+  OCL_NDRANGE(1);
+}
+
diff --git a/utests/compiler_mixed_pointer.cpp b/utests/compiler_mixed_pointer.cpp
new file mode 100644
index 0000000..9531fb2
--- /dev/null
+++ b/utests/compiler_mixed_pointer.cpp
@@ -0,0 +1,119 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src1, int *src2, int *dst) {
+  int * tmp = NULL;
+
+  switch(global_id) {
+    case 0:
+    case 1:
+    case 4:
+      tmp = src1;
+      break;
+    default:
+      tmp = src2;
+      break;
+  }
+  dst[global_id] = tmp[global_id];
+
+}
+static void cpu1(int global_id, int *src, int *dst1, int *dst2) {
+  int * tmp = global_id < 5 ? dst1 : dst2;
+  tmp[global_id] = src[global_id];
+}
+
+void compiler_mixed_pointer(void)
+{
+  const size_t n = 16;
+  int cpu_dst[16], cpu_src[16], cpu_src1[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_mixed_pointer");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 1; ++pass) {
+    OCL_MAP_BUFFER(0);
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+      cpu_src[i] = ((int32_t*)buf_data[0])[i] = i;
+      cpu_src1[i] = ((int32_t*)buf_data[1])[i] = 65536-i;
+    }
+    OCL_UNMAP_BUFFER(0);
+    OCL_UNMAP_BUFFER(1);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_src1, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(2);
+    for (size_t i = 0; i < n; ++i) {
+//      printf(" %d  %d\n", cpu_dst[i], ((int32_t*)buf_data[2])[i]);
+      OCL_ASSERT(((int32_t*)buf_data[2])[i] == cpu_dst[i]);
+    }
+    OCL_UNMAP_BUFFER(2);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mixed_pointer);
+
+void compiler_mixed_pointer1(void)
+{
+  const size_t n = 16;
+  int cpu_dst1[16], cpu_dst2[16], cpu_src[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_mixed_pointer", "compiler_mixed_pointer1");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 1; ++pass) {
+    OCL_MAP_BUFFER(0);
+    OCL_MAP_BUFFER(1);
+    OCL_MAP_BUFFER(2);
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+      cpu_src[i] = ((int32_t*)buf_data[0])[i] = i;
+      cpu_dst1[i] = ((int32_t*)buf_data[1])[i] = 0xff;
+      cpu_dst2[i] = ((int32_t*)buf_data[2])[i] = 0xff;
+    }
+    OCL_UNMAP_BUFFER(0);
+    OCL_UNMAP_BUFFER(1);
+    OCL_UNMAP_BUFFER(2);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i <(int32_t) n; ++i) cpu1(i, cpu_src, cpu_dst1, cpu_dst2);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    OCL_MAP_BUFFER(2);
+    for (size_t i = 0; i < n; ++i) {
+//      printf(" %d  %d\n", cpu_dst1[i], ((int32_t*)buf_data[1])[i]);
+//      printf(" %d  %d\n", ((int32_t*)buf_data[2])[i], cpu_dst2[i]);
+      OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst1[i]);
+      OCL_ASSERT(((int32_t*)buf_data[2])[i] == cpu_dst2[i]);
+    }
+    OCL_UNMAP_BUFFER(1);
+    OCL_UNMAP_BUFFER(2);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mixed_pointer1);
diff --git a/utests/compiler_movforphi_undef.cpp b/utests/compiler_movforphi_undef.cpp
new file mode 100644
index 0000000..8f1e66e
--- /dev/null
+++ b/utests/compiler_movforphi_undef.cpp
@@ -0,0 +1,61 @@
+#include "utest_helper.hpp"
+#include "string.h"
+
+static void compiler_movforphi_undef(void)
+{
+  const size_t w = 16;
+  const size_t h = 16;
+  cl_sampler sampler;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_movforphi_undef");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h);
+  for (uint32_t j = 0; j < h; ++j)
+    for (uint32_t i = 0; i < w; i++)
+      ((uint32_t*)buf_data[0])[j * w + i] = j * w + i;
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  memset(&desc, 0, sizeof(desc));
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = w * sizeof(uint32_t);
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, buf_data[0]);
+
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[1], 0, &format, &desc, NULL);
+  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(sampler), &sampler);
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  // Comparing only the first two elements is enough for this case, as they must lie in the first
+  // tile box, so we can get the correct coords.
+  for (uint32_t j = 0; j < 1; ++j)
+    for (uint32_t i = 0; i < 3; i++)
+    {
+      if (i == 0)
+        OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i + 1] == ((uint32_t*)buf_data[1])[j * w + i]);
+    }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_CALL(clReleaseSampler, sampler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_movforphi_undef);
diff --git a/utests/compiler_mul24.cpp b/utests/compiler_mul24.cpp
new file mode 100644
index 0000000..8a36947
--- /dev/null
+++ b/utests/compiler_mul24.cpp
@@ -0,0 +1,36 @@
+#include "utest_helper.hpp"
+
+void compiler_mul24(void)
+{
+  const int n = 32;
+  int src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_mul24");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; ++i) {
+    src1[i] = ((int*)buf_data[0])[i] = rand();
+    src2[i] = ((int*)buf_data[1])[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i)
+    OCL_ASSERT(((int*)buf_data[2])[i] == ((src1[i] << 8) >> 8) * ((src2[i] << 8) >> 8));
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mul24);
diff --git a/utests/compiler_mul_hi.cpp b/utests/compiler_mul_hi.cpp
new file mode 100644
index 0000000..5ea6389
--- /dev/null
+++ b/utests/compiler_mul_hi.cpp
@@ -0,0 +1,40 @@
+#include "utest_helper.hpp"
+
+void compiler_mul_hi(void)
+{
+  const int n = 32;
+  int src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_mul_hi");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; ++i) {
+    src1[i] = ((int*)buf_data[0])[i] = rand();
+    src2[i] = ((int*)buf_data[1])[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i) {
+    long long a = src1[i];
+    a *= src2[i];
+    a >>= 32;
+    OCL_ASSERT(((int*)buf_data[2])[i] == (int)a);
+  }
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_mul_hi);
diff --git a/utests/compiler_multiple_kernels.cpp b/utests/compiler_multiple_kernels.cpp
new file mode 100644
index 0000000..09b4349
--- /dev/null
+++ b/utests/compiler_multiple_kernels.cpp
@@ -0,0 +1,8 @@
+#include "utest_helper.hpp"
+
+static void compiler_multiple_kernels(void)
+{
+	OCL_CREATE_KERNEL_FROM_FILE("compiler_multiple_kernels", "first_kernel");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_multiple_kernels);
\ No newline at end of file
diff --git a/utests/compiler_preprocessor_macros.cpp b/utests/compiler_preprocessor_macros.cpp
new file mode 100644
index 0000000..3cd0272
--- /dev/null
+++ b/utests/compiler_preprocessor_macros.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_preprocessor_macros(void)
+{
+  OCL_CREATE_KERNEL("compiler_preprocessor_macros");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_preprocessor_macros);
+
diff --git a/utests/compiler_private_data_overflow.cpp b/utests/compiler_private_data_overflow.cpp
new file mode 100644
index 0000000..0fa30a0
--- /dev/null
+++ b/utests/compiler_private_data_overflow.cpp
@@ -0,0 +1,15 @@
+#include "utest_helper.hpp"
+
+void compiler_private_data_overflow(void)
+{
+	OCL_CREATE_KERNEL( "compiler_private_data_overflow" );
+	OCL_CREATE_BUFFER( buf[0], 0, sizeof(cl_int4), NULL );
+	OCL_SET_ARG( 0, sizeof(cl_mem), &buf[0] );
+	globals[0] = 64;
+	locals[0] = 32;
+	OCL_NDRANGE(1);
+	OCL_MAP_BUFFER(0);
+	OCL_ASSERT( ((uint32_t *)buf_data[0])[0] == 0 );
+	OCL_UNMAP_BUFFER(0);
+}
+MAKE_UTEST_FROM_FUNCTION( compiler_private_data_overflow );
diff --git a/utests/compiler_program_objects.cpp b/utests/compiler_program_objects.cpp
new file mode 100644
index 0000000..34ae42a
--- /dev/null
+++ b/utests/compiler_program_objects.cpp
@@ -0,0 +1,64 @@
+/* test OpenCL 1.1 Program Objects (section 5.6)
+ * test creating program objects,
+ *      building the program executable,
+ *      build options,
+ *      and querying program objects */
+
+#include "utest_helper.hpp"
+
+void compiler_program_objects(void)
+{
+    OCL_CREATE_KERNEL("empty"); // set up global vars
+    OCL_CALL(clRetainProgram, program);
+    OCL_CALL(clReleaseProgram, program);
+    OCL_CALL(clBuildProgram,
+                 program,
+                 1,
+                 &device,
+                 "-Dname -Dname2=def -ldir "
+                 "-cl-opt-disable -cl-strict-aliasing -cl-mad-enable -cl-no-signed-zeros "
+                 "-cl-finite-math-only -cl-fast-relaxed-math -cl-unsafe-math-optimizations "
+                 "-cl-single-precision-constant -cl-denorms-are-zero "
+                 "-w -Werror -cl-std=CL1.1",
+                 NULL,
+                 NULL);
+    const int pi[] = {CL_PROGRAM_REFERENCE_COUNT,
+                      CL_PROGRAM_CONTEXT,
+                      CL_PROGRAM_NUM_DEVICES,
+                      CL_PROGRAM_DEVICES,
+                      CL_PROGRAM_SOURCE,
+                      CL_PROGRAM_BINARY_SIZES,
+                      CL_PROGRAM_BINARIES,};
+    const int pbi[] = {CL_PROGRAM_BUILD_STATUS,
+                       CL_PROGRAM_BUILD_OPTIONS,
+                       CL_PROGRAM_BUILD_LOG,};
+    char param_value[1024];
+    size_t pv_size;
+    int i;
+    for(i=0; i<sizeof(pi) / sizeof(pi[0]); i++)
+        OCL_CALL(clGetProgramInfo,
+                      program,
+                      pi[i],
+                      sizeof(param_value),
+                      param_value,
+                      &pv_size);
+    for(i=0; i<sizeof(pbi) / sizeof(pbi[0]); i++)
+        OCL_CALL(clGetProgramBuildInfo,
+                      program,
+                      device,
+                      pbi[i],
+                      sizeof(param_value),
+                      param_value,
+                      &pv_size);
+    std::cout<<platform<<' '
+             <<device<<' '
+             <<ctx<<' '
+             <<program<<' '
+             <<kernel<<' '
+             <<queue<<std::endl;
+
+    puts("Test clUnloadCompiler");
+    OCL_CALL(clUnloadCompiler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_program_objects);
diff --git a/utests/compiler_radians.cpp b/utests/compiler_radians.cpp
new file mode 100644
index 0000000..882477e
--- /dev/null
+++ b/utests/compiler_radians.cpp
@@ -0,0 +1,32 @@
+#include "utest_helper.hpp"
+
+void compiler_radians(void)
+{
+  const int n = 32;
+  float src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_radians");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  for (int i = 0; i < n; ++i) {
+    src[i] = ((float *)buf_data[0])[i] = rand() * 0.01f;
+  }
+  OCL_UNMAP_BUFFER(0);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; ++i) {
+    OCL_ASSERT(((float *)buf_data[1])[i] == src[i] * (3.141592653589793F / 180));
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_radians);
diff --git a/utests/compiler_relational_builtin.cpp b/utests/compiler_relational_builtin.cpp
new file mode 100644
index 0000000..a9a6eb5
--- /dev/null
+++ b/utests/compiler_relational_builtin.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_relational_builtin(void)
+{
+  OCL_CREATE_KERNEL("compiler_relational_builtin");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_relational_builtin);
+
diff --git a/utests/compiler_rhadd.cpp b/utests/compiler_rhadd.cpp
new file mode 100644
index 0000000..b25c788
--- /dev/null
+++ b/utests/compiler_rhadd.cpp
@@ -0,0 +1,41 @@
+#include "utest_helper.hpp"
+
+void compiler_rhadd(void)
+{
+  const int n = 32;
+  int src1[n], src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_rhadd");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; ++i) {
+    src1[i] = ((int*)buf_data[0])[i] = rand();
+    src2[i] = ((int*)buf_data[1])[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i) {
+    long long a = src1[i];
+    a += src2[i];
+    a ++;
+    a >>= 1;
+    OCL_ASSERT(((int*)buf_data[2])[i] == (int)a);
+  }
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_rhadd);
diff --git a/utests/compiler_rotate.cpp b/utests/compiler_rotate.cpp
new file mode 100644
index 0000000..bf52ca4
--- /dev/null
+++ b/utests/compiler_rotate.cpp
@@ -0,0 +1,40 @@
+#include "utest_helper.hpp"
+
+int cpu(int src, int y) {
+  return (src << y) | (src >> (32 - y));
+}
+
+void compiler_rotate(void)
+{
+  const int n = 32;
+  int src[n], y[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_rotate");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i) {
+    src[i] = ((int*)buf_data[0])[i] = rand();
+    y[i] = ((int*)buf_data[2])[i] = rand() & 31;
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(2);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; ++i)
+    OCL_ASSERT(((int*)buf_data[1])[i] == cpu(src[i], y[i]));
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_rotate);
diff --git a/utests/compiler_sampler.cpp b/utests/compiler_sampler.cpp
new file mode 100644
index 0000000..32bf926
--- /dev/null
+++ b/utests/compiler_sampler.cpp
@@ -0,0 +1,41 @@
+/* test OpenCL 1.1 Sampler Objects (section 5.5) */
+#include "utest_helper.hpp"
+
+void compiler_sampler(void)
+{
+  OCL_CREATE_KERNEL("compiler_sampler");
+
+  OCL_ASSERT(ctx != 0);
+  cl_sampler s;
+  cl_int err;
+  int a1[] = {CL_TRUE, CL_FALSE},
+      a2[] = {CL_ADDRESS_MIRRORED_REPEAT,
+              CL_ADDRESS_REPEAT,
+              CL_ADDRESS_CLAMP_TO_EDGE,
+              CL_ADDRESS_CLAMP,
+              CL_ADDRESS_NONE},
+      a3[] = {CL_FILTER_NEAREST, CL_FILTER_LINEAR},
+      a4[] = {CL_SAMPLER_REFERENCE_COUNT,
+              CL_SAMPLER_CONTEXT,
+              CL_SAMPLER_NORMALIZED_COORDS,
+              CL_SAMPLER_ADDRESSING_MODE,
+              CL_SAMPLER_FILTER_MODE};
+  char pv[1000];
+  size_t pv_size;
+  int i, j, k, l;
+  for(i=0; i<2; i++)
+    for(j=0; j<5; j++)
+      for(k=0; k<2; k++) {
+        s = clCreateSampler(ctx, a1[i], a2[j], a3[k], &err);
+        OCL_ASSERT(err == CL_SUCCESS);
+        OCL_CALL(clRetainSampler, s);
+        OCL_CALL(clReleaseSampler, s);
+        for(l=0; l<5; l++)
+          OCL_CALL(clGetSamplerInfo, s, a4[l], 1000, pv, &pv_size);
+        OCL_CALL(clReleaseSampler, s);
+      }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_sampler);
+
+
diff --git a/utests/compiler_saturate.cpp b/utests/compiler_saturate.cpp
new file mode 100644
index 0000000..6880df0
--- /dev/null
+++ b/utests/compiler_saturate.cpp
@@ -0,0 +1,114 @@
+#include "utest_helper.hpp"
+
+namespace {
+
+constexpr int n = 16;
+
+// declaration only; we create a template specialization for each type.
+template<typename T>
+T get_data(int idx, int part);
+
+/* the format of test data is as follows:
+ *   the first column is A
+ *   the second column is B
+ *   the third column is the expected result.
+ */
+
+#define DEF_TEMPLATE(TYPE, NAME)                                    \
+template <>                                                         \
+TYPE get_data<TYPE>(int idx, int part)                              \
+{                                                                   \
+  static TYPE test_data[n][3] = {                                   \
+    { 0, 0, 0 },                                                    \
+    { 0, 1, 1 },                                                    \
+    { 0, 2, 2 },                                                    \
+    { -1, 1, 0 },                                                   \
+    { 1, -2, -1 },                                                  \
+    { 0, 110, 110 },                                                \
+    { -10, -10, -20 },                                              \
+    { CL_##NAME##_MIN, CL_##NAME##_MIN, CL_##NAME##_MIN },          \
+    { CL_##NAME##_MIN, CL_##NAME##_MAX, -1 },                       \
+    { CL_##NAME##_MAX, 0, CL_##NAME##_MAX },                        \
+    { CL_##NAME##_MAX, 1, CL_##NAME##_MAX },                        \
+    { CL_##NAME##_MAX, 2, CL_##NAME##_MAX },                        \
+    { CL_##NAME##_MAX, CL_##NAME##_MAX, CL_##NAME##_MAX },          \
+    { CL_##NAME##_MAX/2, CL_##NAME##_MAX/2, CL_##NAME##_MAX-1 },    \
+    { CL_##NAME##_MAX/2, CL_##NAME##_MAX/2+1, CL_##NAME##_MAX },    \
+    { CL_##NAME##_MAX/2+1, CL_##NAME##_MAX/2+1, CL_##NAME##_MAX }   \
+  };                                                                \
+  return test_data[idx][part];                                      \
+}                                                                   \
+                                                                    \
+template <>                                                         \
+u##TYPE get_data<u##TYPE>(int idx, int part)                        \
+{                                                                   \
+  static u##TYPE test_data[n][3] = {                                \
+    { 0, 0, 0 },                                                    \
+    { CL_U##NAME##_MAX, 0, CL_U##NAME##_MAX },                      \
+    { CL_U##NAME##_MAX, 1, CL_U##NAME##_MAX },                      \
+    { CL_U##NAME##_MAX, 2, CL_U##NAME##_MAX },                      \
+    { CL_U##NAME##_MAX, CL_U##NAME##_MAX, CL_U##NAME##_MAX },       \
+    { CL_U##NAME##_MAX/2, CL_U##NAME##_MAX/2, CL_U##NAME##_MAX-1 }, \
+    { CL_U##NAME##_MAX/2, CL_U##NAME##_MAX/2+1, CL_U##NAME##_MAX }, \
+    { CL_U##NAME##_MAX/2+1, CL_U##NAME##_MAX/2+1, CL_U##NAME##_MAX }\
+  };                                                                \
+  return test_data[idx][part];                                      \
+}
+
+DEF_TEMPLATE(int8_t, CHAR)
+DEF_TEMPLATE(int16_t, SHRT)
+DEF_TEMPLATE(int32_t, INT)
+//DEF_TEMPLATE(int64_t, LONG)
+
+
+template<typename T>
+void test(const char *kernel_name)
+{
+  T C[n] = { 0 };
+  T A[n] = { 0 };
+  T B[n] = { 0 };
+
+  for (int i = 0; i < n; i++) {
+    A[i] = get_data<T>(i, 0);
+    B[i] = get_data<T>(i, 1);
+  }
+
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_saturate", kernel_name);
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(T), &C[0]);
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, n * sizeof(T), &A[0]);
+  OCL_CREATE_BUFFER(buf[2], CL_MEM_COPY_HOST_PTR, n * sizeof(T), &B[0]);
+
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+
+  globals[0] = n;
+  locals[0] = n;
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(0);
+
+  for (int i = 0; i < n; i++) {
+    OCL_ASSERT(((T*)buf_data[0])[i] == get_data<T>(i, 2));
+  }
+  OCL_UNMAP_BUFFER(0);
+}
+
+}
+
+#define compiler_saturate(type, kernel) \
+static void compiler_saturate_ ##type(void)\
+{\
+  test<type>(# kernel);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_saturate_ ## type);
+
+compiler_saturate(int8_t, test_char)
+compiler_saturate(uint8_t, test_uchar)
+compiler_saturate(int16_t, test_short)
+compiler_saturate(uint16_t, test_ushort)
+compiler_saturate(int32_t, test_int)
+compiler_saturate(uint32_t, test_uint)
+//compiler_saturate(int64_t, test_long)
+//compiler_saturate(uint64_t, test_ulong)
diff --git a/utests/compiler_saturate_sub.cpp b/utests/compiler_saturate_sub.cpp
new file mode 100644
index 0000000..1c95e2d
--- /dev/null
+++ b/utests/compiler_saturate_sub.cpp
@@ -0,0 +1,114 @@
+#include "utest_helper.hpp"
+
+namespace {
+
+constexpr int n = 16;
+
+// declaration only; we create a template specialization for each type.
+template<typename T>
+T get_data(int idx, int part);
+
+/* the format of test data is as follows:
+ *   the first column is A
+ *   the second column is B
+ *   the third column is the expected result.
+ */
+
+#define DEF_TEMPLATE(TYPE, NAME)                                    \
+template <>                                                         \
+TYPE get_data<TYPE>(int idx, int part)                              \
+{                                                                   \
+  static TYPE test_data[n][3] = {                                   \
+    { 0, 0, 0 },                                                    \
+    { 0, 1, -1 },                                                   \
+    { CL_##NAME##_MIN, CL_##NAME##_MIN, 0 },                        \
+    { CL_##NAME##_MAX, CL_##NAME##_MAX, 0 },                        \
+    { -2, CL_##NAME##_MIN, CL_##NAME##_MAX-1 },                     \
+    { -1, CL_##NAME##_MIN, CL_##NAME##_MAX },                       \
+    { 0, CL_##NAME##_MIN, CL_##NAME##_MAX },                        \
+    { 1, CL_##NAME##_MIN, CL_##NAME##_MAX },                        \
+    { -2, CL_##NAME##_MAX, CL_##NAME##_MIN },                       \
+    { -1, CL_##NAME##_MAX, CL_##NAME##_MIN },                       \
+    { 0, CL_##NAME##_MAX, -CL_##NAME##_MAX },                       \
+    { 1, CL_##NAME##_MAX, -CL_##NAME##_MAX+1 },                     \
+    { CL_##NAME##_MIN, CL_##NAME##_MAX, CL_##NAME##_MIN },          \
+    { CL_##NAME##_MIN, 1, CL_##NAME##_MIN },                        \
+    { CL_##NAME##_MIN, -1, CL_##NAME##_MIN+1 },                     \
+    { CL_##NAME##_MAX, CL_##NAME##_MIN, CL_##NAME##_MAX },          \
+  };                                                                \
+  return test_data[idx][part];                                      \
+}                                                                   \
+                                                                    \
+template <>                                                         \
+u##TYPE get_data<u##TYPE>(int idx, int part)                        \
+{                                                                   \
+  static u##TYPE test_data[n][3] = {                                \
+    { 0, 0, 0 },                                                    \
+    { 0, 1, 0 },                                                    \
+    { 1, 1, 0 },                                                    \
+    { 1, 0, 1 },                                                    \
+    { CL_U##NAME##_MAX, CL_U##NAME##_MAX, 0 },                      \
+    { 0, CL_U##NAME##_MAX, 0 },                                     \
+    { 1, CL_U##NAME##_MAX, 0 },                                     \
+    { CL_U##NAME##_MAX, 0, CL_U##NAME##_MAX },                      \
+  };                                                                \
+  return test_data[idx][part];                                      \
+}
+
+DEF_TEMPLATE(int8_t, CHAR)
+DEF_TEMPLATE(int16_t, SHRT)
+DEF_TEMPLATE(int32_t, INT)
+//DEF_TEMPLATE(int64_t, LONG)
+
+
+template<typename T>
+void test(const char *kernel_name)
+{
+  T C[n] = { 0 };
+  T A[n] = { 0 };
+  T B[n] = { 0 };
+
+  for (int i = 0; i < n; i++) {
+    A[i] = get_data<T>(i, 0);
+    B[i] = get_data<T>(i, 1);
+  }
+
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_saturate_sub", kernel_name);
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(T), &C[0]);
+  OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, n * sizeof(T), &A[0]);
+  OCL_CREATE_BUFFER(buf[2], CL_MEM_COPY_HOST_PTR, n * sizeof(T), &B[0]);
+
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+
+  globals[0] = n;
+  locals[0] = n;
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(0);
+
+  for (int i = 0; i < n; i++) {
+    OCL_ASSERT(((T*)buf_data[0])[i] == get_data<T>(i, 2));
+  }
+  OCL_UNMAP_BUFFER(0);
+}
+
+}
+
+#define compiler_saturate_sub(type, kernel) \
+static void compiler_saturate_sub_ ##type(void)\
+{\
+  test<type>(# kernel);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_saturate_sub_ ## type);
+
+compiler_saturate_sub(int8_t, test_char)
+compiler_saturate_sub(uint8_t, test_uchar)
+compiler_saturate_sub(int16_t, test_short)
+compiler_saturate_sub(uint16_t, test_ushort)
+compiler_saturate_sub(int32_t, test_int)
+compiler_saturate_sub(uint32_t, test_uint)
+//compiler_saturate_sub(int64_t, test_long)
+//compiler_saturate_sub(uint64_t, test_ulong)
diff --git a/utests/compiler_shader_toy.cpp b/utests/compiler_shader_toy.cpp
new file mode 100644
index 0000000..58bcc6f
--- /dev/null
+++ b/utests/compiler_shader_toy.cpp
@@ -0,0 +1,87 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/* This is a super simple wrapper for the OpenCL kernels I ported from GLSL code
+ * taken in Inigo's web site:
+ * http://www.iquilezles.org/apps/shadertoy/index.html
+ *
+ * They are pretty cool and rather complex kernels. Just the right thing to have
+ * something a bit more complicated and interesting than unit tests.
+ *
+ * The code here is just to wrap the common code used by all the kernels (to run
+ * the code and assert its correctness)
+ */
+#include "utest_helper.hpp"
+
+static const int dim = 256;
+
+// Tricky here: 'name' is used for both the kernel name and the reference image name,
+// while 'file' is used for both the .cl file name and the dst image name.
+static void run_kernel(int w, int h, const char *file, const char *name)
+{
+  const size_t global[2] = {size_t(w), size_t(h)};
+  const size_t local[2] = {16, 1};
+  const size_t sz = w * h * sizeof(char[4]);
+  const float fx = float(w);
+  const float fy = float(h);
+  char kernel_file[256];
+  char dst_img[256];
+  char ref_img[256];
+
+  snprintf(kernel_file, sizeof(kernel_file), "%s.cl", file);
+  snprintf(dst_img, sizeof(dst_img), "%s.bmp", file);
+  snprintf(ref_img, sizeof(ref_img), "%s_ref.bmp", name);
+  OCL_CALL (cl_kernel_init, kernel_file, name, SOURCE, NULL);
+
+  OCL_CREATE_BUFFER(buf[0], 0, sz, NULL);
+  OCL_CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &buf[0]);
+  OCL_CALL (clSetKernelArg, kernel, 1, sizeof(float), &fx);
+  OCL_CALL (clSetKernelArg, kernel, 2, sizeof(float), &fy);
+  OCL_CALL (clSetKernelArg, kernel, 3, sizeof(int), &w);
+  OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, 2, NULL, global, local, 0, NULL, NULL);
+  OCL_MAP_BUFFER(0);
+  int *dst = (int*) buf_data[0];
+
+  /* Save the image (for debug purpose) */
+  cl_write_bmp(dst, w, h, dst_img);
+
+  /* Compare with the golden image */
+  OCL_CHECK_IMAGE(dst, w, h, ref_img);
+}
+
+#define DECL_SHADER_TOY_TEST(W,H,FILE_NAME, KERNEL_NAME) \
+  static void FILE_NAME(void) { run_kernel(W,H,#FILE_NAME, #KERNEL_NAME); } \
+  MAKE_UTEST_FROM_FUNCTION(FILE_NAME);
+
+DECL_SHADER_TOY_TEST(dim,dim,compiler_clod,compiler_clod);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_ribbon,compiler_ribbon);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_nautilus,compiler_nautilus);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_menger_sponge_no_shadow,compiler_menger_sponge_no_shadow);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_julia,compiler_julia);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_julia_no_break,compiler_julia_no_break);
+// test for function calls
+DECL_SHADER_TOY_TEST(dim,dim,compiler_clod_function_call,compiler_clod);
+DECL_SHADER_TOY_TEST(dim,dim,compiler_julia_function_call,compiler_julia);
+
+// Still issues here for LLVM 3.2
+// DECL_SHADER_TOY_TEST(dim,dim,compiler_chocolux,compiler_chocolux);
+// DECL_SHADER_TOY_TEST(dim,dim,compiler_menger_sponge,compiler_menger_sponge);
+
+#undef DECL_SHADER_TOY_TEST
+
diff --git a/utests/compiler_shift_right.cpp b/utests/compiler_shift_right.cpp
new file mode 100644
index 0000000..b94cc46
--- /dev/null
+++ b/utests/compiler_shift_right.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+typedef unsigned int uint;
+
+static void cpu(int global_id, uint *src, int *dst) {
+  dst[global_id] = src[global_id] >> 24;
+}
+
+void compiler_shift_right(void)
+{
+  const size_t n = 16;
+  uint cpu_src[16];
+  int cpu_dst[16];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_shift_right");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run random tests
+  for (uint32_t pass = 0; pass < 8; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((uint*)buf_data[0])[i] = 0x80000000 | rand();
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      OCL_ASSERT(((int *)buf_data[1])[i] == cpu_dst[i]);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_shift_right);
diff --git a/utests/compiler_short_scatter.cpp b/utests/compiler_short_scatter.cpp
new file mode 100644
index 0000000..1746744
--- /dev/null
+++ b/utests/compiler_short_scatter.cpp
@@ -0,0 +1,25 @@
+#include "utest_helper.hpp"
+
+static void compiler_short_scatter(void)
+{
+  const size_t n = 128;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_short_scatter");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int16_t), NULL);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    OCL_ASSERT(((int16_t*)buf_data[0])[i] == (int16_t) i);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_short_scatter);
+
+
diff --git a/utests/compiler_simd_all.cpp b/utests/compiler_simd_all.cpp
new file mode 100644
index 0000000..086c54f
--- /dev/null
+++ b/utests/compiler_simd_all.cpp
@@ -0,0 +1,43 @@
+#include "utest_helper.hpp"
+
+void compiler_simd_all(void)
+{
+  const size_t n = 40;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_simd_all");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  globals[0] = n;
+  locals[0] = 10;
+
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    ((int*)buf_data[0])[i] = i;
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Run on CPU
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    //printf("%d %d\n", i, ((int *)buf_data[1])[i]);
+    if (i % 2 == 1) {
+      if (i < (int32_t)locals[0])
+        OCL_ASSERT(((int *)buf_data[1])[i] == 1);
+      else
+        OCL_ASSERT(((int *)buf_data[1])[i] == 2);
+    }
+    else
+      OCL_ASSERT(((int *)buf_data[1])[i] == 3);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_simd_all);
diff --git a/utests/compiler_simd_any.cpp b/utests/compiler_simd_any.cpp
new file mode 100644
index 0000000..dcc5ef1
--- /dev/null
+++ b/utests/compiler_simd_any.cpp
@@ -0,0 +1,43 @@
+#include "utest_helper.hpp"
+
+void compiler_simd_any(void)
+{
+  const size_t n = 40;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_simd_any");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  globals[0] = n;
+  locals[0] = 10;
+
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    ((int*)buf_data[0])[i] = i;
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Run on CPU
+
+  // Compare
+  OCL_MAP_BUFFER(1);
+  for (int32_t i = 0; i < (int32_t) n; ++i){
+    //printf("%d %d\n", i, ((int *)buf_data[1])[i]);
+    if (i % 2 == 1) {
+      if (i < (int32_t)locals[0])
+        OCL_ASSERT(((int *)buf_data[1])[i] == 1);
+      else
+        OCL_ASSERT(((int *)buf_data[1])[i] == 2);
+    }
+    else
+      OCL_ASSERT(((int *)buf_data[1])[i] == 3);
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_simd_any);
diff --git a/utests/compiler_smoothstep.cpp b/utests/compiler_smoothstep.cpp
new file mode 100644
index 0000000..363ea7e
--- /dev/null
+++ b/utests/compiler_smoothstep.cpp
@@ -0,0 +1,58 @@
+#include <cmath>
+#include "utest_helper.hpp"
+
+float cpu(float e0, float e1, float x)
+{
+  x = (x - e0) / (e1 - e0);
+  if (x >= 1)
+    x = 1.f;
+  if (x <= 0)
+    x = 0.f;
+  return x * x * (3 - 2 * x);
+}
+
+void compiler_smoothstep(void)
+{
+  const int n = 32;
+  float src1[n], src2[n], src3[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_smoothstep");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[3], 0, n * sizeof(float), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(3, sizeof(cl_mem), &buf[3]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i) {
+    float a = 0.1f * (rand() & 15) - 0.75f;
+    float b = a + 0.1f * (rand() & 15) + 0.1f;
+    float c = 0.1f * (rand() & 15) - 0.75f;
+    src1[i] = ((float*)buf_data[0])[i] = a;
+    src2[i] = ((float*)buf_data[1])[i] = b;
+    src3[i] = ((float*)buf_data[2])[i] = c;
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(2);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(3);
+  for (int i = 0; i < n; ++i) {
+    float a = ((float*)buf_data[3])[i];
+    float b = cpu(src1[i], src2[i], src3[i]);
+    OCL_ASSERT(fabsf(a - b) < 1e-4f);
+  }
+  OCL_UNMAP_BUFFER(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_smoothstep);
diff --git a/utests/compiler_step.cpp b/utests/compiler_step.cpp
new file mode 100644
index 0000000..b022826
--- /dev/null
+++ b/utests/compiler_step.cpp
@@ -0,0 +1,342 @@
+#include "utest_helper.hpp"
+#include "string.h"
+
+template <typename T, int N>
+struct cl_vec {
+    T ptr[((N+1)/2)*2]; //align to 2 elements.
+
+    typedef cl_vec<T, N> vec_type;
+
+    cl_vec(void) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+    }
+    cl_vec(vec_type & other) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
+    }
+
+    vec_type& operator= (vec_type & other) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
+        return *this;
+    }
+
+    template <typename U> vec_type& operator= (cl_vec<U, N> & other) {
+        memset(ptr, 0, sizeof(T) * ((N+1)/2)*2);
+        memcpy (this->ptr, other.ptr, sizeof(T) * N);
+        return *this;
+    }
+
+    bool operator== (vec_type & other) {
+        return !memcmp (this->ptr, other.ptr, sizeof(T) * N);
+    }
+
+    void step (vec_type & other) {
+        int i = 0;
+        for (; i < N; i++) {
+            T a = ptr[i];
+            T edge = other.ptr[i];
+            T f = a < edge ? 0.0 : 1.0;
+            ptr[i] = f;
+        }
+    }
+
+    void step (float & edge) {
+        int i = 0;
+        for (; i < N; i++) {
+            T a = ptr[i];
+            T f = a < edge ? 0.0 : 1.0;
+            ptr[i] = f;
+        }
+    }
+};
+
+template <typename T, typename U, int N> static void cpu (int global_id,
+        cl_vec<T, N> *edge, cl_vec<T, N> *src, cl_vec<U, N> *dst)
+{
+    cl_vec<T, N> v  = src[global_id];
+    v.step(edge[global_id]);
+    dst[global_id] = v;
+}
+
+template <typename T, typename U> static void cpu(int global_id, T *edge, T *src, U *dst)
+{
+    T f = src[global_id];
+    T e = edge[global_id];
+    f = f < e ? 0.0 : 1.0;
+    dst[global_id] = (U)f;
+}
+
+template <typename T, typename U, int N> static void cpu (int global_id,
+        float edge, cl_vec<T, N> *src, cl_vec<U, N> *dst)
+{
+    cl_vec<T, N> v  = src[global_id];
+    v.step(edge);
+    dst[global_id] = v;
+}
+
+template <typename T, typename U> static void cpu(int global_id, float edge, T *src, U *dst)
+{
+    T f = src[global_id];
+    f = f < edge ? 0.0 : 1.0;
+    dst[global_id] = (U)f;
+}
+
+template <typename T, int N> static void gen_rand_val (cl_vec<T, N>& vect)
+{
+    int i = 0;
+
+    memset(vect.ptr, 0, sizeof(T) * ((N+1)/2)*2);
+    for (; i < N; i++) {
+        vect.ptr[i] = static_cast<T>(.1f * (rand() & 15) - .75f);
+    }
+}
+
+template <typename T> static void gen_rand_val (T & val)
+{
+    val = static_cast<T>(.1f * (rand() & 15) - .75f);
+}
+
+template <typename T>
+inline static void print_data (T& val)
+{
+    if (std::is_unsigned<T>::value)
+        printf(" %u", val);
+    else
+        printf(" %d", val);
+}
+
+inline static void print_data (float& val)
+{
+    printf(" %f", val);
+}
+
+template <typename T, typename U, int N> static void dump_data (cl_vec<T, N>* edge,
+        cl_vec<T, N>* src, cl_vec<U, N>* dst, int n)
+{
+    U* val = reinterpret_cast<U *>(dst);
+
+    n = n*((N+1)/2)*2;
+
+    printf("\nEdge: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[0])[i]);
+    }
+    printf("\nx: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[1])[i]);
+    }
+
+    printf("\nCPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(val[i]);
+    }
+    printf("\nGPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((U *)buf_data[2])[i]);
+    }
+}
+
+template <typename T, typename U> static void dump_data (T* edge, T* src, U* dst, int n)
+{
+    printf("\nedge: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[0])[i]);
+    }
+
+    printf("\nx: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[1])[i]);
+    }
+
+    printf("\nCPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(dst[i]);
+    }
+    printf("\nGPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((U *)buf_data[2])[i]);
+    }
+}
+
+template <typename T, typename U, int N> static void dump_data (float edge,
+        cl_vec<T, N>* src, cl_vec<U, N>* dst, int n)
+{
+    U* val = reinterpret_cast<U *>(dst);
+
+    n = n*((N+1)/2)*2;
+
+    printf("\nEdge: %f\n", edge);
+    printf("\nx: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[0])[i]);
+    }
+
+    printf("\nCPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(val[i]);
+    }
+    printf("\nGPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((U *)buf_data[1])[i]);
+    }
+}
+
+template <typename T, typename U> static void dump_data (float edge, T* src, U* dst, int n)
+{
+    printf("\nedge: %f\n", edge);
+    printf("\nx: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[0])[i]);
+    }
+
+    printf("\nCPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(dst[i]);
+    }
+    printf("\nGPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((U *)buf_data[1])[i]);
+    }
+}
+
+template <typename T> static void compiler_step_with_type(void)
+{
+    const size_t n = 16;
+    T cpu_dst[n], cpu_src[n];
+    T edge[n];
+
+    // Setup buffers
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+    OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(T), NULL);
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+    OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+    globals[0] = n;
+    locals[0] = n;
+
+    // Run random tests
+    for (uint32_t pass = 0; pass < 8; ++pass) {
+        OCL_MAP_BUFFER(0);
+        OCL_MAP_BUFFER(1);
+
+        /* Clear the dst buffer to avoid random data. */
+        OCL_MAP_BUFFER(2);
+        memset(buf_data[2], 0, sizeof(T) * n);
+        OCL_UNMAP_BUFFER(2);
+
+        for (int32_t i = 0; i < (int32_t) n; ++i) {
+            gen_rand_val(cpu_src[i]);
+            gen_rand_val(edge[i]);
+        }
+
+        memcpy(buf_data[1], cpu_src, sizeof(T) * n);
+        memcpy(buf_data[0], edge, sizeof(T) * n);
+
+        // Run the kernel on GPU
+        OCL_NDRANGE(1);
+
+        // Run on CPU
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            cpu(i, edge, cpu_src, cpu_dst);
+
+        // Compare
+        OCL_MAP_BUFFER(2);
+
+        //dump_data(edge, cpu_src, cpu_dst, n);
+
+        OCL_ASSERT(!memcmp(buf_data[2], cpu_dst, sizeof(T) * n));
+        OCL_UNMAP_BUFFER(2);
+        OCL_UNMAP_BUFFER(1);
+        OCL_UNMAP_BUFFER(0);
+    }
+}
+
+#define STEP_TEST_TYPE(TYPE) \
+	static void compiler_step_##TYPE (void) \
+        { \
+           OCL_CALL (cl_kernel_init, "compiler_step.cl", "compiler_step_"#TYPE, SOURCE, NULL);  \
+           compiler_step_with_type<TYPE>(); \
+        } \
+	MAKE_UTEST_FROM_FUNCTION(compiler_step_##TYPE);
+
+typedef cl_vec<float, 2> float2;
+typedef cl_vec<float, 3> float3;
+typedef cl_vec<float, 4> float4;
+typedef cl_vec<float, 8> float8;
+typedef cl_vec<float, 16> float16;
+STEP_TEST_TYPE(float)
+STEP_TEST_TYPE(float2)
+STEP_TEST_TYPE(float3)
+STEP_TEST_TYPE(float4)
+STEP_TEST_TYPE(float8)
+STEP_TEST_TYPE(float16)
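+
+/* For reference, STEP_TEST_TYPE(float4) above expands to:
+ *
+ *   static void compiler_step_float4(void) {
+ *     OCL_CALL(cl_kernel_init, "compiler_step.cl", "compiler_step_float4", SOURCE, NULL);
+ *     compiler_step_with_type<float4>();
+ *   }
+ *   MAKE_UTEST_FROM_FUNCTION(compiler_step_float4);
+ */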
+
+
+template <typename T> static void compiler_stepf_with_type(void)
+{
+    const size_t n = 16;
+    T cpu_dst[n], cpu_src[n];
+    float edge = (float)(.1f * (rand() & 15) - .75f);
+
+    // Setup buffers
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+    OCL_SET_ARG(0, sizeof(float), &edge);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
+    globals[0] = n;
+    locals[0] = n;
+
+    // Run random tests
+    for (uint32_t pass = 0; pass < 8; ++pass) {
+        OCL_MAP_BUFFER(0);
+
+        /* Clear the dst buffer to avoid random data. */
+        OCL_MAP_BUFFER(1);
+        memset(buf_data[1], 0, sizeof(T) * n);
+        OCL_UNMAP_BUFFER(1);
+
+        for (int32_t i = 0; i < (int32_t) n; ++i) {
+            gen_rand_val(cpu_src[i]);
+        }
+
+        memcpy(buf_data[0], cpu_src, sizeof(T) * n);
+
+        // Run the kernel on GPU
+        OCL_NDRANGE(1);
+
+        // Run on CPU
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            cpu(i, edge, cpu_src, cpu_dst);
+
+        // Compare
+        OCL_MAP_BUFFER(1);
+
+        //dump_data(edge, cpu_src, cpu_dst, n);
+
+        OCL_ASSERT(!memcmp(buf_data[1], cpu_dst, sizeof(T) * n));
+        OCL_UNMAP_BUFFER(1);
+        OCL_UNMAP_BUFFER(0);
+    }
+}
+
+#define _STEPF_TEST_TYPE(TYPE, keep_program) \
+	static void compiler_stepf_##TYPE (void) \
+        { \
+           OCL_CALL (cl_kernel_init, "compiler_step.cl", "compiler_stepf_"#TYPE, SOURCE, NULL);  \
+           compiler_stepf_with_type<TYPE>(); \
+        } \
+	MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_stepf_##TYPE, keep_program);
+
+#define STEPF_TEST_TYPE(TYPE) _STEPF_TEST_TYPE(TYPE, true)
+#define STEPF_TEST_TYPE_END(TYPE) _STEPF_TEST_TYPE(TYPE, false)
+
+
+STEPF_TEST_TYPE(float)
+STEPF_TEST_TYPE(float2)
+STEPF_TEST_TYPE(float3)
+STEPF_TEST_TYPE(float4)
+STEPF_TEST_TYPE(float8)
+STEPF_TEST_TYPE_END(float16)
diff --git a/utests/compiler_structure_attributes.cpp b/utests/compiler_structure_attributes.cpp
new file mode 100644
index 0000000..31656f4
--- /dev/null
+++ b/utests/compiler_structure_attributes.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_structure_attributes(void)
+{
+  OCL_CREATE_KERNEL("compiler_structure_attributes");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_structure_attributes);
+
diff --git a/utests/compiler_switch.cpp b/utests/compiler_switch.cpp
new file mode 100644
index 0000000..6e93309
--- /dev/null
+++ b/utests/compiler_switch.cpp
@@ -0,0 +1,48 @@
+#include "utest_helper.hpp"
+
+static void cpu_compiler_switch(int *dst, int *src, int get_global_id0)
+{
+  switch (get_global_id0) {
+    case 0: dst[get_global_id0] = src[get_global_id0 + 4]; break;
+    case 1: dst[get_global_id0] = src[get_global_id0 + 14]; break;
+    case 2: dst[get_global_id0] = src[get_global_id0 + 13]; break;
+    case 6: dst[get_global_id0] = src[get_global_id0 + 11]; break;
+    case 7: dst[get_global_id0] = src[get_global_id0 + 10]; break;
+    case 10: dst[get_global_id0] = src[get_global_id0 + 9]; break;
+    case 12: dst[get_global_id0] = src[get_global_id0 + 6]; break;
+    default: dst[get_global_id0] = src[get_global_id0 + 8]; break;
+  }
+}
+
+static void compiler_switch(void)
+{
+  const size_t n = 32;
+  int cpu_dst[32], cpu_src[32];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_switch");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 32; ++i)
+    cpu_src[i] = ((int32_t*)buf_data[1])[i] = i;
+  OCL_UNMAP_BUFFER(1);
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < 16; ++i)
+    cpu_compiler_switch(cpu_dst, cpu_src, i);
+  for (int i = 0; i < 16; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[0])[i] == cpu_dst[i]);
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_switch)
+
diff --git a/utests/compiler_type_casting.cpp b/utests/compiler_type_casting.cpp
new file mode 100644
index 0000000..392acf4
--- /dev/null
+++ b/utests/compiler_type_casting.cpp
@@ -0,0 +1,10 @@
+#include "utest_helper.hpp"
+
+void compiler_type_casting(void)
+{
+  OCL_CREATE_KERNEL("compiler_type_casting");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_type_casting);
+
+
diff --git a/utests/compiler_uint16_copy.cpp b/utests/compiler_uint16_copy.cpp
new file mode 100644
index 0000000..1494e81
--- /dev/null
+++ b/utests/compiler_uint16_copy.cpp
@@ -0,0 +1,35 @@
+#include "utest_helper.hpp"
+
+static void compiler_uint16_copy(void)
+{
+  const size_t n = 128;
+
+  // Setup kernel and buffers. Note that uint16 is aligned on 64 bytes
+  // (its size) according to the OCL specification
+  OCL_CREATE_KERNEL("compiler_uint16_copy");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t[16]) * n);
+  for (uint32_t i = 0; i < n; ++i)
+    for (uint32_t j = 0; j < 16; ++j)
+      ((uint32_t*)buf_data[0])[16*i+j] = 16*i+j;
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t[16]), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t[16]), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 16*n; ++i)
+    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == ((uint32_t*)buf_data[1])[i]);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_uint16_copy);
+
diff --git a/utests/compiler_uint2_copy.cpp b/utests/compiler_uint2_copy.cpp
new file mode 100644
index 0000000..8eb4314
--- /dev/null
+++ b/utests/compiler_uint2_copy.cpp
@@ -0,0 +1,31 @@
+#include "utest_helper.hpp"
+
+static void compiler_uint2_copy(void)
+{
+  const size_t n = 128;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_uint2_copy");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t[2]) * n);
+  for (uint32_t i = 0; i < 2*n; ++i) ((uint32_t*)buf_data[0])[i] = i;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t[2]), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t[2]), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 2*n; ++i)
+    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == ((uint32_t*)buf_data[1])[i]);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_uint2_copy);
+
diff --git a/utests/compiler_uint3_copy.cpp b/utests/compiler_uint3_copy.cpp
new file mode 100644
index 0000000..c4d3cf0
--- /dev/null
+++ b/utests/compiler_uint3_copy.cpp
@@ -0,0 +1,40 @@
+#include "utest_helper.hpp"
+
+static void compiler_uint3_copy(void)
+{
+  const size_t n = 128;
+
+  // Setup kernel and buffers. Note that uint3 is aligned on 16 bytes
+  // according to the OCL specification
+  OCL_CREATE_KERNEL("compiler_uint3_copy");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t[4]) * n);
+  for (uint32_t i = 0; i < n; ++i) {
+    ((uint32_t*)buf_data[0])[4*i+0] = 3*i+0;
+    ((uint32_t*)buf_data[0])[4*i+1] = 3*i+1;
+    ((uint32_t*)buf_data[0])[4*i+2] = 3*i+2;
+  }
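+  // Only lanes 0..2 of each 16-byte element were initialized above; the
+  // fourth lane is alignment padding and is not checked in the result loop.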
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t[4]), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t[4]), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < n; ++i) {
+    OCL_ASSERT(((uint32_t*)buf_data[0])[4*i+0] == ((uint32_t*)buf_data[1])[4*i+0]);
+    OCL_ASSERT(((uint32_t*)buf_data[0])[4*i+1] == ((uint32_t*)buf_data[1])[4*i+1]);
+    OCL_ASSERT(((uint32_t*)buf_data[0])[4*i+2] == ((uint32_t*)buf_data[1])[4*i+2]);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_uint3_copy);
+
diff --git a/utests/compiler_uint3_unaligned_copy.cpp b/utests/compiler_uint3_unaligned_copy.cpp
new file mode 100644
index 0000000..d42b4c3
--- /dev/null
+++ b/utests/compiler_uint3_unaligned_copy.cpp
@@ -0,0 +1,42 @@
+#include "utest_helper.hpp"
+
+static void compiler_uint3_unaligned_copy(void)
+{
+  const size_t n = 128;
+
+  // Setup kernel and buffers. Note that uint3 is aligned on 16 bytes
+  // according to the OCL specification
+  OCL_CREATE_KERNEL("compiler_uint3_unaligned_copy");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t[4]) * n);
+  for (uint32_t i = 0; i < n; ++i) {
+    ((uint32_t*)buf_data[0])[3*i+0] = 3*i+0;
+    ((uint32_t*)buf_data[0])[3*i+1] = 3*i+1;
+    ((uint32_t*)buf_data[0])[3*i+2] = 3*i+2;
+  }
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t[4]), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t[4]), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < n; ++i) {
+    OCL_ASSERT(((uint32_t*)buf_data[0])[3*i+0] == ((uint32_t*)buf_data[1])[3*i+0]);
+    OCL_ASSERT(((uint32_t*)buf_data[0])[3*i+1] == ((uint32_t*)buf_data[1])[3*i+1]);
+    OCL_ASSERT(((uint32_t*)buf_data[0])[3*i+2] == ((uint32_t*)buf_data[1])[3*i+2]);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_uint3_unaligned_copy);
+
+
+
diff --git a/utests/compiler_uint8_copy.cpp b/utests/compiler_uint8_copy.cpp
new file mode 100644
index 0000000..25dbd58
--- /dev/null
+++ b/utests/compiler_uint8_copy.cpp
@@ -0,0 +1,35 @@
+#include "utest_helper.hpp"
+
+static void compiler_uint8_copy(void)
+{
+  const size_t n = 128;
+
+  // Setup kernel and buffers. Note that uint8 is aligned on 32 bytes
+  // (its size) according to the OCL specification
+  OCL_CREATE_KERNEL("compiler_uint8_copy");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t[8]) * n);
+  for (uint32_t i = 0; i < n; ++i)
+    for (uint32_t j = 0; j < 8; ++j)
+      ((uint32_t*)buf_data[0])[8*i+j] = 8*i+j;
+
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t[8]), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t[8]), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 8*n; ++i)
+    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == ((uint32_t*)buf_data[1])[i]);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_uint8_copy);
+
diff --git a/utests/compiler_unstructured_branch0.cpp b/utests/compiler_unstructured_branch0.cpp
new file mode 100644
index 0000000..128a53e
--- /dev/null
+++ b/utests/compiler_unstructured_branch0.cpp
@@ -0,0 +1,55 @@
+#include "utest_helper.hpp"
+
+static void compiler_unstructured_branch0(void)
+{
+  const size_t n = 32;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_unstructured_branch0");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+  for (uint32_t i = 0; i < n; ++i) ((uint32_t*)buf_data[0])[i] = 2;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // First control flow
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 16; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+  for (uint32_t i = 16; i < 32; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 1);
+
+  // Second control flow
+  for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 32; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 1);
+
+  // Third control flow
+  for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 8; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+  for (uint32_t i = 8; i < 32; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_unstructured_branch0);
+
diff --git a/utests/compiler_unstructured_branch1.cpp b/utests/compiler_unstructured_branch1.cpp
new file mode 100644
index 0000000..6021f5b
--- /dev/null
+++ b/utests/compiler_unstructured_branch1.cpp
@@ -0,0 +1,54 @@
+#include "utest_helper.hpp"
+
+static void compiler_unstructured_branch1(void)
+{
+  const size_t n = 16;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_unstructured_branch1");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+  for (uint32_t i = 0; i < n; ++i) ((uint32_t*)buf_data[0])[i] = 2;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // First control flow
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+
+  // Second control flow
+  for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t*)buf_data[1])[i] == 3);
+
+  // Third control flow
+  for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
+  for (uint32_t i = 8; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 8; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+  for (uint32_t i = 8; i < n; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_unstructured_branch1);
+
diff --git a/utests/compiler_unstructured_branch2.cpp b/utests/compiler_unstructured_branch2.cpp
new file mode 100644
index 0000000..d61c6b5
--- /dev/null
+++ b/utests/compiler_unstructured_branch2.cpp
@@ -0,0 +1,68 @@
+#include "utest_helper.hpp"
+
+static void compiler_unstructured_branch2(void)
+{
+  const size_t n = 16;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_unstructured_branch2");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+  for (uint32_t i = 0; i < n; ++i) ((uint32_t*)buf_data[0])[i] = 2;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // First control flow
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 12);
+
+  // Second control flow
+  for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == -6);
+
+  // Third control flow
+  for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
+  for (uint32_t i = 8; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 8; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 12);
+  for (uint32_t i = 8; i < n; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == -6);
+
+  // Fourth control flow
+  for (uint32_t i = 0; i < 4; ++i) ((int32_t*)buf_data[0])[i] = 1;
+  for (uint32_t i = 4; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
+  for (uint32_t i = 8; i < n; ++i) ((int32_t*)buf_data[0])[i] = -2;
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 8; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 12);
+  for (uint32_t i = 8; i < n; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == -6);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_unstructured_branch2);
+
diff --git a/utests/compiler_unstructured_branch3.cpp b/utests/compiler_unstructured_branch3.cpp
new file mode 100644
index 0000000..0c6992a
--- /dev/null
+++ b/utests/compiler_unstructured_branch3.cpp
@@ -0,0 +1,58 @@
+#include "utest_helper.hpp"
+
+static void compiler_unstructured_branch3(void)
+{
+  const size_t n = 16;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_unstructured_branch3");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * n);
+  for (uint32_t i = 0; i < n; ++i) ((uint32_t*)buf_data[0])[i] = 2;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+
+  // First control flow
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+
+  // Second control flow
+  for (uint32_t i = 0; i < n; ++i) ((int32_t*)buf_data[0])[i] = 0;
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t*)buf_data[1])[i] == 3);
+
+  // Third control flow
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 8; ++i) ((int32_t*)buf_data[0])[i] = 2;
+  for (uint32_t i = 8; i < n; ++i) ((int32_t*)buf_data[0])[i] = 0;
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 8; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 2);
+  for (uint32_t i = 8; i < n; ++i)
+    OCL_ASSERT(((int32_t*)buf_data[1])[i] == 3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_unstructured_branch3);
+
diff --git a/utests/compiler_upsample_int.cpp b/utests/compiler_upsample_int.cpp
new file mode 100644
index 0000000..ee912f9
--- /dev/null
+++ b/utests/compiler_upsample_int.cpp
@@ -0,0 +1,37 @@
+#include "utest_helper.hpp"
+
+void compiler_upsample_int(void)
+{
+  const int n = 32;
+  short src1[n];
+  unsigned short src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_upsample_int");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(short), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; ++i) {
+    src1[i] = ((short*)buf_data[0])[i] = rand();
+    src2[i] = ((short*)buf_data[1])[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
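+  // upsample(hi, lo) with (short, ushort) arguments yields an int equal to
+  // ((int)hi << 16) | lo, which is what the reference check below verifies.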
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i)
+    OCL_ASSERT(((int*)buf_data[2])[i] == (int)((src1[i] << 16) | src2[i]));
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_upsample_int);
diff --git a/utests/compiler_upsample_long.cpp b/utests/compiler_upsample_long.cpp
new file mode 100644
index 0000000..b125ff4
--- /dev/null
+++ b/utests/compiler_upsample_long.cpp
@@ -0,0 +1,38 @@
+#include <stdint.h>
+#include "utest_helper.hpp"
+
+void compiler_upsample_long(void)
+{
+  const int n = 32;
+  int src1[n];
+  unsigned int src2[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_upsample_long");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(unsigned int), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (int i = 0; i < n; ++i) {
+    src1[i] = ((int*)buf_data[0])[i] = rand();
+    src2[i] = ((unsigned int*)buf_data[1])[i] = rand();
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(2);
+  for (int i = 0; i < n; ++i)
+    OCL_ASSERT(((int64_t*)buf_data[2])[i] == (((int64_t)(src1[i]) << 32) | src2[i]));
+  OCL_UNMAP_BUFFER(2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_upsample_long);
diff --git a/utests/compiler_vect_compare.cpp b/utests/compiler_vect_compare.cpp
new file mode 100644
index 0000000..e9e45be
--- /dev/null
+++ b/utests/compiler_vect_compare.cpp
@@ -0,0 +1,44 @@
+#include "utest_helper.hpp"
+
+typedef struct {
+  int x;
+  int y;
+  int z;
+  int w;
+} int4;
+
+void compiler_vect_compare(void)
+{
+  const size_t n = 16;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_vect_compare");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int4), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int4), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i) {
+    ((int4*)buf_data[0])[i].x = i & 0x1;
+    ((int4*)buf_data[0])[i].y = i & 0x2;
+    ((int4*)buf_data[0])[i].z = i & 0x4;
+    ((int4*)buf_data[0])[i].w = i & 0x8;
+  }
+  OCL_UNMAP_BUFFER(0);
+
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < 16; ++i) {
+    OCL_ASSERT(((int4*)buf_data[1])[i].x == (int)((i&0x1)?0xffffffff:0));
+    OCL_ASSERT(((int4*)buf_data[1])[i].y == (int)((i&0x2)?0xffffffff:0));
+    OCL_ASSERT(((int4*)buf_data[1])[i].z == (int)((i&0x4)?0xffffffff:0));
+    OCL_ASSERT(((int4*)buf_data[1])[i].w == (int)((i&0x8)?0xffffffff:0));
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_vect_compare);
diff --git a/utests/compiler_vector_inc.cpp b/utests/compiler_vector_inc.cpp
new file mode 100644
index 0000000..c44424b
--- /dev/null
+++ b/utests/compiler_vector_inc.cpp
@@ -0,0 +1,46 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+
+void compiler_vector_inc(void)
+{
+  const int n = 64;
+  char dst[n];
+  char src[n];
+
+  OCL_CREATE_KERNEL("compiler_vector_inc");
+  OCL_CREATE_BUFFER(buf[0], 0, n, NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n, NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n / 2;
+  locals[0] = 16;
+
+  for (int i = 0; i < n; ++i) {
+    dst[i] = i;
+    src[i] = (i / 2) % 4;
+  }
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], dst, n);
+  memcpy(buf_data[1], src, n);
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(0);
+  char *dest = ((char *)buf_data[0]);
+  for (int i=0; i<n; ++i) {
+    char wish;
+    if (src[i/2] < 2)
+      wish = dst[i] + 1;
+    else
+      wish = dst[i] - 1;
+    OCL_ASSERT(dest[i] == wish);
+  }
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_vector_inc);
diff --git a/utests/compiler_vector_load_store.cpp b/utests/compiler_vector_load_store.cpp
new file mode 100644
index 0000000..5a1a8d1
--- /dev/null
+++ b/utests/compiler_vector_load_store.cpp
@@ -0,0 +1,63 @@
+#include "utest_helper.hpp"
+#include <string.h>
+template<typename T>
+static void compiler_vector_load_store(int elemNum, const char *kernelName)
+{
+  const size_t n = elemNum * 256;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_vector_load_store", kernelName);
+  buf_data[0] = (T*) malloc(sizeof(T) * n);
+  for (uint32_t i = 0; i < n; ++i)
+    ((T*)buf_data[0])[i] = i;
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(T), buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n / elemNum;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Check result
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < n; ++i)
+  {
+    int shift = ((i % elemNum) + 1);
+    if (strstr(kernelName, "double") == NULL)
+      OCL_ASSERT(((T*)buf_data[1])[i] == (T)(((T*)buf_data[0])[i] + shift));
+    else
+      OCL_ASSERT((((T*)buf_data[1])[i] - ((T)((T*)buf_data[0])[i] + shift)) < 1e-5);
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+}
+
+#define compiler_vector_load_store(type, n, kernel_type, keep_program) \
+static void compiler_vector_ ##kernel_type ##n ##_load_store(void)\
+{\
+  compiler_vector_load_store<type>(n, "test_" #kernel_type #n);\
+}\
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(compiler_vector_ ## kernel_type ##n ##_load_store, keep_program);
+
+#define test_all_vector(type, kernel_type, keep_program) \
+  compiler_vector_load_store(type, 2, kernel_type, true) \
+  compiler_vector_load_store(type, 3, kernel_type, true) \
+  compiler_vector_load_store(type, 4, kernel_type, true) \
+  compiler_vector_load_store(type, 8, kernel_type, true) \
+  compiler_vector_load_store(type, 16, kernel_type, keep_program)
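+
+/* For reference, compiler_vector_load_store(int8_t, 2, char, true) expands to
+ * a test named compiler_vector_char2_load_store that runs
+ * compiler_vector_load_store<int8_t>(2, "test_char2") and registers it via
+ * MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM.
+ */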
+
+test_all_vector(int8_t, char, true)
+test_all_vector(uint8_t, uchar, true)
+test_all_vector(int16_t, short, true)
+test_all_vector(uint16_t, ushort, true)
+test_all_vector(int32_t, int, true)
+test_all_vector(uint32_t, uint, true)
+test_all_vector(float, float, true)
+//test_all_vector(double, double, true)
+test_all_vector(int64_t, long, true)
+test_all_vector(uint64_t, ulong, false)
diff --git a/utests/compiler_volatile.cpp b/utests/compiler_volatile.cpp
new file mode 100644
index 0000000..f4fe054
--- /dev/null
+++ b/utests/compiler_volatile.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_volatile(void)
+{
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_volatile");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_volatile);
diff --git a/utests/compiler_workitem_builtin.cpp b/utests/compiler_workitem_builtin.cpp
new file mode 100644
index 0000000..092b0e7
--- /dev/null
+++ b/utests/compiler_workitem_builtin.cpp
@@ -0,0 +1,9 @@
+#include "utest_helper.hpp"
+
+void compiler_workitem_builtin(void)
+{
+  OCL_CREATE_KERNEL("compiler_workitem_builtin");
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_workitem_builtin);
+
diff --git a/utests/compiler_write_only.cpp b/utests/compiler_write_only.cpp
new file mode 100644
index 0000000..3935535
--- /dev/null
+++ b/utests/compiler_write_only.cpp
@@ -0,0 +1,43 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_helper.hpp"
+
+static void compiler_write_only(void)
+{
+  const size_t n = 2048;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("test_write_only");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == i);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_write_only);
+
diff --git a/utests/compiler_write_only_bytes.cpp b/utests/compiler_write_only_bytes.cpp
new file mode 100644
index 0000000..1a13cdb
--- /dev/null
+++ b/utests/compiler_write_only_bytes.cpp
@@ -0,0 +1,23 @@
+#include "utest_helper.hpp"
+
+void compiler_write_only_bytes(void)
+{
+  const size_t n = 32;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_write_only_bytes");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint8_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint8_t*)buf_data[0])[i] == 2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_write_only_bytes);
diff --git a/utests/compiler_write_only_shorts.cpp b/utests/compiler_write_only_shorts.cpp
new file mode 100644
index 0000000..19988fe
--- /dev/null
+++ b/utests/compiler_write_only_shorts.cpp
@@ -0,0 +1,24 @@
+#include "utest_helper.hpp"
+
+void compiler_write_only_shorts(void)
+{
+  const size_t n = 32;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_write_only_shorts");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint16_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint16_t*)buf_data[0])[i] == 2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_write_only_shorts);
+
diff --git a/utests/enqueue_built_in_kernels.cpp b/utests/enqueue_built_in_kernels.cpp
new file mode 100644
index 0000000..52b8848
--- /dev/null
+++ b/utests/enqueue_built_in_kernels.cpp
@@ -0,0 +1,19 @@
+#include "utest_helper.hpp"
+
+void enqueue_built_in_kernels(void)
+{
+  char* built_in_kernel_names;
+  size_t built_in_kernels_size;
+  cl_int err = CL_SUCCESS;
+  size_t ret_sz;
+
+
+  OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, 0, 0, &built_in_kernels_size);
+  built_in_kernel_names = (char* )malloc(built_in_kernels_size * sizeof(char) );
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, built_in_kernels_size, (void*)built_in_kernel_names, &ret_sz);
+  OCL_ASSERT(ret_sz == built_in_kernels_size);
+  cl_program built_in_prog = clCreateProgramWithBuiltInKernels(ctx, 1, &device, built_in_kernel_names, &err);
+  OCL_ASSERT(built_in_prog != NULL);
+  clReleaseProgram(built_in_prog);
+  free(built_in_kernel_names);
+}
+
+MAKE_UTEST_FROM_FUNCTION(enqueue_built_in_kernels);
diff --git a/utests/enqueue_copy_buf.cpp b/utests/enqueue_copy_buf.cpp
new file mode 100644
index 0000000..b647b7e
--- /dev/null
+++ b/utests/enqueue_copy_buf.cpp
@@ -0,0 +1,66 @@
+#include "utest_helper.hpp"
+
+static void test_copy_buf(size_t sz, size_t src_off, size_t dst_off, size_t cb)
+{
+    unsigned int i;
+    OCL_MAP_BUFFER(0);
+
+    for (i=0; i < sz; i++) {
+        ((char*)buf_data[0])[i] = (rand() & 63);
+    }
+
+    OCL_UNMAP_BUFFER(0);
+
+    if (src_off + cb > sz || dst_off + cb > sz) {
+        /* Expect Error. */
+        OCL_ASSERT(clEnqueueCopyBuffer(queue, buf[0], buf[1],
+                                       src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+        return;
+    }
+
+    OCL_ASSERT(!clEnqueueCopyBuffer(queue, buf[0], buf[1],
+                                    src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+
+    OCL_MAP_BUFFER(0);
+    OCL_MAP_BUFFER(1);
+
+#if 0
+    printf("\n########### Src buffer: \n");
+    for (i = 0; i < cb; ++i)
+        printf(" %2.2u", ((unsigned char*)buf_data[0])[i + src_off]);
+
+    printf("\n########### dst buffer: \n");
+    for (i = 0; i < cb; ++i)
+        printf(" %2.2u", ((unsigned char*)buf_data[1])[i + dst_off]);
+#endif
+
+    // Check results
+    for (i = 0; i < cb; ++i) {
+        if (((char*)buf_data[0])[i + src_off] != ((char*)buf_data[1])[i + dst_off]) {
+            printf ("different index is %d\n", i);
+            OCL_ASSERT(0);
+        }
+    }
+
+    OCL_UNMAP_BUFFER(0);
+    OCL_UNMAP_BUFFER(1);
+
+}
+
+void enqueue_copy_buf(void)
+{
+    size_t i;
+    size_t j;
+    const size_t sz = 1024;
+
+    OCL_CREATE_BUFFER(buf[0], 0, sz * sizeof(char), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, sz * sizeof(char), NULL);
+
+    for (i=0; i<sz; i+=7) {
+        for (j=0; j<sz; j+=10) {
+            test_copy_buf(sz, i, j, sz/2);
+        }
+    }
+}
+
+MAKE_UTEST_FROM_FUNCTION(enqueue_copy_buf);
diff --git a/utests/enqueue_copy_buf_unaligned.cpp b/utests/enqueue_copy_buf_unaligned.cpp
new file mode 100644
index 0000000..e1bd0aa
--- /dev/null
+++ b/utests/enqueue_copy_buf_unaligned.cpp
@@ -0,0 +1,118 @@
+#include "utest_helper.hpp"
+
+static void test_copy_buf(size_t sz, size_t src_off, size_t dst_off, size_t cb)
+{
+    unsigned int i;
+    OCL_MAP_BUFFER(0);
+
+    for (i=0; i < sz; i++) {
+        ((char*)buf_data[0])[i] = (rand() & 31);
+    }
+
+    OCL_UNMAP_BUFFER(0);
+
+    OCL_MAP_BUFFER(1);
+
+    for (i=0; i < sz; i++) {
+        ((char*)buf_data[1])[i] = 64;
+    }
+
+    OCL_UNMAP_BUFFER(1);
+
+    if (src_off + cb > sz || dst_off + cb > sz) {
+        /* Expect Error. */
+        OCL_ASSERT(clEnqueueCopyBuffer(queue, buf[0], buf[1],
+                                       src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+        return;
+    }
+
+    OCL_ASSERT(!clEnqueueCopyBuffer(queue, buf[0], buf[1],
+                                    src_off, dst_off, cb*sizeof(char), 0, NULL, NULL));
+
+    OCL_MAP_BUFFER(0);
+    OCL_MAP_BUFFER(1);
+
+#if 0
+    printf ("@@@@@@@@@ cb is %d\n", cb);
+    printf ("@@@@@@@@@ src_off is %d\n", src_off);
+    printf ("@@@@@@@@@ dst_off is %d\n", dst_off);
+    printf("\n########### Src buffer: \n");
+    for (i = 0; i < sz; ++i)
+        printf(" %2.2u", ((unsigned char*)buf_data[0])[i]);
+
+    printf("\n########### dst buffer: \n");
+    for (i = 0; i < sz; ++i)
+        printf(" %2.2u", ((unsigned char*)buf_data[1])[i]);
+#endif
+
+    // Check results
+    for (i = 0; i < cb; ++i) {
+        if (((char*)buf_data[0])[i +src_off] != ((char*)buf_data[1])[i + dst_off]) {
+            printf ("different index is %d\n", i);
+            OCL_ASSERT(0);
+        }
+    }
+
+    for (i = 0; i < dst_off; ++i) {
+        if (((char*)buf_data[1])[i] != 64) {
+            printf ("wrong write, different index is %d\n", i);
+            OCL_ASSERT(0);
+        }
+    }
+
+    for (i = dst_off + cb; i < sz; ++i) {
+        if (((char*)buf_data[1])[i] != 64) {
+            printf ("wrong write, different index is %d\n", i);
+            OCL_ASSERT(0);
+        }
+    }
+
+    OCL_UNMAP_BUFFER(0);
+    OCL_UNMAP_BUFFER(1);
+
+}
+
+void enqueue_copy_buf_unaligned(void)
+{
+    size_t i;
+    size_t j;
+    const size_t sz = 1024;
+    int offset = 0;
+
+    OCL_CREATE_BUFFER(buf[0], 0, sz * sizeof(char), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, sz * sizeof(char), NULL);
+
+#if 1
+    /* Test the same offset cases. */
+    for (i=0; i<sz; i+=32) {
+        for (j=64; j<sz; j+=32) {
+	    offset = (rand() & 3);
+            test_copy_buf(sz, i + offset, j + offset, ((rand() & 31) + 1));
+        }
+    }
+#endif
+
+#if 1
+    /* Test the dst small offset cases. */
+    for (i=0; i<sz; i+=32) {
+        for (j=64; j<sz; j+=32) {
+	    offset = (rand() & 2);
+            test_copy_buf(sz, i + offset + 1, j + offset, ((rand() & 31) + 1));
+        }
+    }
+#endif
+
+#if 1
+    /* Test the dst big offset cases. */
+    for (i=0; i<sz; i+=32) {
+        for (j=64; j<sz; j+=32) {
+	    offset = (rand() & 2);
+            test_copy_buf(sz, i + offset, j + offset + 1, ((rand() & 31) + 1));
+        }
+    }
+#endif
+//            test_copy_buf(sz, 0, 1, 17);
+
+}
+
+MAKE_UTEST_FROM_FUNCTION(enqueue_copy_buf_unaligned);
diff --git a/utests/enqueue_fill_buf.cpp b/utests/enqueue_fill_buf.cpp
new file mode 100644
index 0000000..272b81f
--- /dev/null
+++ b/utests/enqueue_fill_buf.cpp
@@ -0,0 +1,90 @@
+#include "utest_helper.hpp"
+#include <string.h>
+
+static char pattern_serials[128];
+
+static void test_fill_buf(size_t sz, size_t offset, size_t size, size_t pattern_sz)
+{
+  unsigned int i;
+  int ret = 0;
+  OCL_MAP_BUFFER(0);
+  memset(((char*)buf_data[0]), 0, sz);
+  OCL_UNMAP_BUFFER(0);
+
+  for (i=0; i < pattern_sz; i++) {
+    pattern_serials[i] = (rand() & 63);
+  }
+
+  if (offset + size > sz) {
+    /* Expect Error. */
+    OCL_ASSERT(clEnqueueFillBuffer(queue, buf[0], pattern_serials,
+                                   pattern_sz, offset, size, 0, NULL, NULL));
+    return;
+  }
+
+  ret = clEnqueueFillBuffer(queue, buf[0], pattern_serials,
+                            pattern_sz, offset, size, 0, NULL, NULL);
+  OCL_ASSERT(!ret);
+
+  OCL_MAP_BUFFER(0);
+
+#if 0
+  printf("\n==== pattern size is %d, offset is %d, size is %d ====\n",
+         pattern_sz, offset, size);
+  printf("\n###########  buffer: \n");
+  for (i = 0; i < sz; ++i)
+    printf(" %2.2u", ((unsigned char*)buf_data[0])[i]);
+
+#endif
+
+  // Check results
+  int j = 0;
+  for (i = 0; i < sz; ++i) {
+    if (i < offset || i >= offset + size) {
+      if (((char*)buf_data[0])[i] != 0) {
+        printf ("\nnon zero index is %d\n", i);
+        OCL_ASSERT(0);
+      }
+      continue;
+    }
+
+    if (((char*)buf_data[0])[i] != pattern_serials[j]) {
+      printf ("\ndifferent index is %d\n", i);
+      OCL_ASSERT(0);
+    }
+    j++;
+    if (j == (int)pattern_sz) j = 0;
+  }
+
+  OCL_UNMAP_BUFFER(0);
+
+}
+
+void enqueue_fill_buf(void)
+{
+  size_t offset;
+  size_t pattern_sz;
+  const size_t sz = 1024;
+  size_t size = 0;
+  static int valid_sz[] = {1, 2, 4, 8, 16, 32, 64, 128};
+  unsigned int i = 0;
+
+  OCL_CREATE_BUFFER(buf[0], 0, sz * sizeof(char), NULL);
+
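+  // clEnqueueFillBuffer requires pattern_size to be a power of two between
+  // 1 and 128 and both offset and size to be multiples of pattern_size,
+  // hence the valid_sz table and the rounding below.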
+  for (i = 0; i < sizeof(valid_sz)/sizeof(int); i++) {
+
+	pattern_sz = valid_sz[i];
+	size = ((rand()%1024)/pattern_sz) * pattern_sz;
+	offset = ((rand()%1024)/pattern_sz) * pattern_sz;
+	while (size + offset + 1 > sz) {
+      if (size > offset) {
+        size = size - offset;
+      } else
+        offset = offset - size;
+	}
+
+	test_fill_buf(sz, offset, size, pattern_sz);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(enqueue_fill_buf);
diff --git a/utests/get_arg_info.cpp b/utests/get_arg_info.cpp
new file mode 100644
index 0000000..c1ea1ef
--- /dev/null
+++ b/utests/get_arg_info.cpp
@@ -0,0 +1,85 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+void test_get_arg_info(void)
+{
+  int ret;
+  uint32_t ret_val;
+  cl_kernel_arg_type_qualifier type_qual;
+  size_t ret_sz;
+  char name[64];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("test_get_arg_info");
+
+  //Arg 0
+  ret = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_ADDRESS_QUALIFIER,
+                           sizeof(ret_val), &ret_val, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == sizeof(cl_kernel_arg_address_qualifier));
+  OCL_ASSERT(ret_val == CL_KERNEL_ARG_ADDRESS_GLOBAL);
+
+  ret = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_ACCESS_QUALIFIER,
+                           sizeof(ret_val), &ret_val, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == sizeof(cl_kernel_arg_access_qualifier));
+  OCL_ASSERT(ret_val == CL_KERNEL_ARG_ACCESS_NONE);
+
+  ret = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_TYPE_NAME,
+                           sizeof(name), name, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == strlen("float*") + 1);
+  OCL_ASSERT(!strcmp(name, "float*"));
+
+  ret = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_NAME,
+                           sizeof(name), name, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == strlen("src") + 1);
+  OCL_ASSERT(!strcmp(name, "src"));
+
+  ret = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_TYPE_QUALIFIER,
+                           sizeof(type_qual), &type_qual, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == sizeof(cl_kernel_arg_type_qualifier));
+  OCL_ASSERT(type_qual == (CL_KERNEL_ARG_TYPE_CONST|CL_KERNEL_ARG_TYPE_VOLATILE));
+
+  //Arg 1
+  ret = clGetKernelArgInfo(kernel, 1, CL_KERNEL_ARG_ADDRESS_QUALIFIER,
+                           sizeof(ret_val), &ret_val, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == sizeof(cl_kernel_arg_address_qualifier));
+  OCL_ASSERT(ret_val == CL_KERNEL_ARG_ADDRESS_LOCAL);
+
+  ret = clGetKernelArgInfo(kernel, 1, CL_KERNEL_ARG_ACCESS_QUALIFIER,
+                           sizeof(ret_val), &ret_val, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == sizeof(cl_kernel_arg_access_qualifier));
+  OCL_ASSERT(ret_val == CL_KERNEL_ARG_ACCESS_NONE);
+
+  ret = clGetKernelArgInfo(kernel, 1, CL_KERNEL_ARG_TYPE_NAME,
+                           sizeof(name), name, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == strlen("int*") + 1);
+  OCL_ASSERT(!strcmp(name, "int*"));
+
+  ret = clGetKernelArgInfo(kernel, 1, CL_KERNEL_ARG_NAME,
+                           sizeof(name), name, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == strlen("dst") + 1);
+  OCL_ASSERT(!strcmp(name, "dst"));
+
+  ret = clGetKernelArgInfo(kernel, 1, CL_KERNEL_ARG_TYPE_QUALIFIER,
+                           sizeof(type_qual), &type_qual, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == sizeof(cl_kernel_arg_type_qualifier));
+  OCL_ASSERT(type_qual == CL_KERNEL_ARG_TYPE_NONE);
+
+  //Arg 2
+  ret = clGetKernelArgInfo(kernel, 2, CL_KERNEL_ARG_TYPE_NAME,
+                           sizeof(name), name, &ret_sz);
+  OCL_ASSERT(ret == CL_SUCCESS);
+  OCL_ASSERT(ret_sz == strlen("test_arg_struct") + 1);
+  OCL_ASSERT(!strcmp(name, "test_arg_struct"));
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_get_arg_info);
diff --git a/utests/get_cl_info.cpp b/utests/get_cl_info.cpp
new file mode 100644
index 0000000..807739b
--- /dev/null
+++ b/utests/get_cl_info.cpp
@@ -0,0 +1,641 @@
+#include <string.h>
+#include <string>
+#include <map>
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+#include "utest_helper.hpp"
+
+using namespace std;
+
+/* ***************************************************** *
+ * This file tests all the clGetXXXXInfo style APIs      *
+ * ***************************************************** */
+#define NO_STANDARD_REF 0xFFFFF
+
+template <typename T = cl_uint>
+struct Info_Result {
+    T ret;
+    T refer;
+    int size;
+    typedef T type_value;
+
+    void * get_ret(void) {
+        return (void *)&ret;
+    }
+
+    Info_Result(T other) {
+        refer = other;
+        size = sizeof(T);
+    }
+
+    bool check_result (void) {
+        //printf("The refer is %d, we get result is %d\n", refer, ret);
+        if (ret != refer && refer != (T)NO_STANDARD_REF)
+            return false;
+
+        return true;
+    }
+};
+
+template <>
+struct Info_Result<char *> {
+    char * ret;
+    char * refer;
+    int size;
+    typedef char* type_value;
+
+    Info_Result(const char *other, int sz): refer(NULL) {
+        size = sz;
+        ret = (char *)malloc(sizeof(char) * sz);
+        if (other) {
+            refer = (char *)malloc(sizeof(char) * sz);
+            memcpy(refer, other, sz);
+        }
+    }
+
+    ~Info_Result(void) {
+        free(refer);
+        free(ret);
+    }
+
+    void * get_ret(void) {
+        return (void *)ret;
+    }
+
+    bool check_result (void) {
+        if (refer && ::memcmp(ret, refer, size))
+            return false;
+
+        return true;
+    }
+};
+
+template <> //Used for such as CL_PROGRAM_BINARIES
+struct Info_Result<char **> {
+    char ** ret;
+    char ** refer;
+    int *elt_size;
+    int size;
+    typedef char** type_value;
+
+    Info_Result(char **other, int *sz, int elt_num) {
+        size = elt_num;
+
+        ret = (char **)malloc(elt_num * sizeof(char *));
+        memset(ret, 0, (elt_num * sizeof(char *)));
+        refer = (char **)malloc(elt_num * sizeof(char *));
+        memset(refer, 0, (elt_num * sizeof(char *)));
+        elt_size = (int *)malloc(elt_num * sizeof(int));
+        memset(elt_size, 0, (elt_num * sizeof(int)));
+        if (sz) {
+            int i = 0;
+            for (; i < elt_num; i++) {
+                elt_size[i] = sz[i];
+                ret[i] = (char *)malloc(sz[i] * sizeof(char));
+
+                if (other[i] && elt_size[i] > 0) {
+                    refer[i] = (char *)malloc(sz[i] * sizeof(char));
+                    memcpy(refer[i], other[i], sz[i]);
+                }
+                else
+                    refer[i] = NULL;
+            }
+        }
+    }
+
+    ~Info_Result(void) {
+        int i = 0;
+        for (; i < size; i++) {
+            if (refer[i])
+                free(refer[i]);
+            free(ret[i]);
+        }
+        free(ret);
+        free(refer);
+        free(elt_size);
+    }
+
+    void * get_ret(void) {
+        return (void *)ret;
+    }
+
+    bool check_result (void) {
+        int i = 0;
+        for (; i < size; i++) {
+            if (refer[i] && ::memcmp(ret[i], refer[i], elt_size[i]))
+                return false;
+        }
+
+        return true;
+    }
+};
+
+template <typename T1, typename T2>
+struct Traits {
+    static bool Is_Same(void) {
+        return false;
+    };
+};
+
+template <typename T1>
+struct Traits<T1, T1> {
+    static bool Is_Same(void) {
+        return true;
+    };
+};
+
+template <typename T>
+Info_Result<T>* cast_as(void *info)
+{
+    Info_Result<T>* ret;
+    ret = reinterpret_cast<Info_Result<T>*>(info);
+    OCL_ASSERT((Traits<T, typename Info_Result<T>::type_value>::Is_Same()));
+    return ret;
+}
+
+
+#define CALL_INFO_AND_RET(TYPE, FUNC, ...) \
+    do { \
+	cl_int ret; \
+	size_t ret_size; \
+	\
+	Info_Result<TYPE>* info = cast_as<TYPE>(x.second); \
+	ret = FUNC (__VA_ARGS__, x.first, \
+		info->size, info->get_ret(), &ret_size); \
+	OCL_ASSERT((!ret)); \
+	OCL_ASSERT((info->check_result())); \
+	delete info; \
+    } while(0)
+
+/* ***************************************************** *
+ * clGetProgramInfo                                      *
+ * ***************************************************** */
+#define CALL_PROGINFO_AND_RET(TYPE) CALL_INFO_AND_RET(TYPE, clGetProgramInfo, program)
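+
+/* e.g. with x = (CL_PROGRAM_NUM_DEVICES, info), CALL_PROGINFO_AND_RET(cl_uint)
+ * expands roughly to:
+ *
+ *   Info_Result<cl_uint> *info = cast_as<cl_uint>(x.second);
+ *   cl_int ret = clGetProgramInfo(program, x.first, info->size,
+ *                                 info->get_ret(), &ret_size);
+ *   OCL_ASSERT(!ret);
+ *   OCL_ASSERT(info->check_result());
+ *   delete info;
+ */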
+
+void get_program_info(void)
+{
+    map<cl_program_info, void *> maps;
+    int expect_value;
+    char * expect_source;
+    int sz;
+    char *ker_path = (char *)malloc(4096 * sizeof(char));
+    const char *kiss_path = getenv("OCL_KERNEL_PATH");
+    string line;
+    string source_code;
+
+    sprintf(ker_path, "%s/%s", kiss_path, "compiler_if_else.cl");
+
+    ifstream in(ker_path);
+    while (getline(in,line)) {
+        source_code = (source_code == "") ?
+                      source_code + line : source_code + "\n" + line;
+    }
+    free(ker_path);
+    //cout<< source_code;
+    source_code = source_code + "\n";
+
+    expect_source = (char *)source_code.c_str();
+
+    OCL_CREATE_KERNEL("compiler_if_else");
+
+    /* First test for clGetProgramInfo. We just have 1 device now */
+    expect_value = 2;//One program, one kernel.
+    maps.insert(make_pair(CL_PROGRAM_REFERENCE_COUNT,
+                          (void *)(new Info_Result<>(((cl_uint)expect_value)))));
+    maps.insert(make_pair(CL_PROGRAM_CONTEXT,
+                          (void *)(new Info_Result<cl_context>(ctx))));
+    expect_value = 1;
+    maps.insert(make_pair(CL_PROGRAM_NUM_DEVICES,
+                          (void *)(new Info_Result<>(((cl_uint)expect_value)))));
+    maps.insert(make_pair(CL_PROGRAM_DEVICES,
+                          (void *)(new Info_Result<cl_device_id>(device))));
+    sz = (strlen(expect_source) + 1);
+    maps.insert(make_pair(CL_PROGRAM_SOURCE,
+                          (void *)(new Info_Result<char *>(expect_source, sz))));
+    expect_value = NO_STANDARD_REF;
+    maps.insert(make_pair(CL_PROGRAM_BINARY_SIZES,
+                          (void *)(new Info_Result<size_t>((size_t)expect_value))));
+    sz = 4096; //big enough?
+    expect_source = NULL;
+    maps.insert(make_pair(CL_PROGRAM_BINARIES,
+                          (void *)(new Info_Result<char **>(&expect_source, &sz, 1))));
+
+    std::for_each(maps.begin(), maps.end(), [](pair<cl_program_info, void *> x) {
+        switch (x.first) {
+        case CL_PROGRAM_REFERENCE_COUNT:
+        case CL_PROGRAM_NUM_DEVICES:
+            CALL_PROGINFO_AND_RET(cl_uint);
+            break;
+        case CL_PROGRAM_CONTEXT:
+            CALL_PROGINFO_AND_RET(cl_context);
+            break;
+        case CL_PROGRAM_DEVICES:
+            CALL_PROGINFO_AND_RET(cl_device_id);
+            break;
+        case CL_PROGRAM_SOURCE:
+            CALL_PROGINFO_AND_RET(char *);
+            break;
+        case CL_PROGRAM_BINARY_SIZES:
+            CALL_PROGINFO_AND_RET(size_t);
+            break;
+        case CL_PROGRAM_BINARIES:
+            CALL_PROGINFO_AND_RET(char **);
+            break;
+        default:
+            break;
+        }
+    });
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_program_info);
+
+/* ***************************************************** *
+ * clGetCommandQueueInfo                                 *
+ * ***************************************************** */
+#define CALL_QUEUEINFO_AND_RET(TYPE) CALL_INFO_AND_RET(TYPE, clGetCommandQueueInfo, queue)
+
+void get_queue_info(void)
+{
+    /* Use the compiler_fabs kernel for this test. */
+    const size_t n = 16;
+    map<cl_program_info, void *> maps;
+    int expect_ref;
+    cl_command_queue_properties prop;
+
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+    OCL_CREATE_KERNEL("compiler_fabs");
+
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+    globals[0] = 16;
+    locals[0] = 16;
+
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+        ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    /* Do our test.*/
+    maps.insert(make_pair(CL_QUEUE_CONTEXT,
+                          (void *)(new Info_Result<cl_context>(ctx))));
+    maps.insert(make_pair(CL_QUEUE_DEVICE,
+                          (void *)(new Info_Result<cl_device_id>(device))));
+
+    expect_ref = 1;
+    maps.insert(make_pair(CL_QUEUE_REFERENCE_COUNT,
+                          (void *)(new Info_Result<>(((cl_uint)expect_ref)))));
+
+    prop = 0;
+    maps.insert(make_pair(CL_QUEUE_PROPERTIES,
+                          (void *)(new Info_Result<cl_command_queue_properties>(
+                                       ((cl_command_queue_properties)prop)))));
+
+    std::for_each(maps.begin(), maps.end(), [](pair<cl_program_info, void *> x) {
+        switch (x.first) {
+        case CL_QUEUE_CONTEXT:
+            CALL_QUEUEINFO_AND_RET(cl_context);
+            break;
+        case CL_QUEUE_DEVICE:
+            CALL_QUEUEINFO_AND_RET(cl_device_id);
+            break;
+        case CL_QUEUE_REFERENCE_COUNT:
+            CALL_QUEUEINFO_AND_RET(cl_uint);
+            break;
+        case CL_QUEUE_PROPERTIES:
+            CALL_QUEUEINFO_AND_RET(cl_command_queue_properties);
+            break;
+        default:
+            break;
+        }
+    });
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_queue_info);
+
+/* ***************************************************** *
+ * clGetProgramBuildInfo                                 *
+ * ***************************************************** */
+#define CALL_PROG_BUILD_INFO_AND_RET(TYPE)  CALL_INFO_AND_RET(TYPE, \
+             clGetProgramBuildInfo, program, device)
+
+void get_program_build_info(void)
+{
+    map<cl_program_info, void *> maps;
+    cl_build_status expect_status;
+    char build_opt[] = "-emit-llvm";
+    char log[] = "";
+    int sz;
+
+    OCL_CALL (cl_kernel_init, "compiler_if_else.cl", "compiler_if_else", SOURCE, build_opt);
+
+    /* Do our test.*/
+    expect_status = CL_BUILD_SUCCESS;
+    maps.insert(make_pair(CL_PROGRAM_BUILD_STATUS,
+                          (void *)(new Info_Result<cl_build_status>(expect_status))));
+    sz = strlen(build_opt) + 1;
+    maps.insert(make_pair(CL_PROGRAM_BUILD_OPTIONS,
+                          (void *)(new Info_Result<char *>(build_opt, sz))));
+    sz = strlen(log) + 1;
+    maps.insert(make_pair(CL_PROGRAM_BUILD_LOG, /* not supported now, just "" */
+                          (void *)(new Info_Result<char *>(log, sz))));
+
+    std::for_each(maps.begin(), maps.end(), [](pair<cl_program_build_info, void *> x) {
+        switch (x.first) {
+        case CL_PROGRAM_BUILD_STATUS:
+            CALL_PROG_BUILD_INFO_AND_RET(cl_build_status);
+            break;
+        case CL_PROGRAM_BUILD_OPTIONS:
+            CALL_PROG_BUILD_INFO_AND_RET(char *);
+            break;
+        case CL_PROGRAM_BUILD_LOG:
+            CALL_PROG_BUILD_INFO_AND_RET(char *);
+            break;
+        default:
+            break;
+        }
+    });
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_program_build_info);
+
+/* ***************************************************** *
+ * clGetContextInfo                                      *
+ * ***************************************************** */
+#define CALL_CONTEXTINFO_AND_RET(TYPE) CALL_INFO_AND_RET(TYPE, clGetContextInfo, ctx)
+
+void get_context_info(void)
+{
+    /* Use the compiler_fabs kernel to exercise the context queries. */
+    const size_t n = 16;
+    map<cl_context_info, void *> maps;
+    int expect_ref;
+
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+    OCL_CREATE_KERNEL("compiler_fabs");
+
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+    globals[0] = 16;
+    locals[0] = 16;
+
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+        ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    /* Do our test. */
+    expect_ref = 1;
+    maps.insert(make_pair(CL_CONTEXT_NUM_DEVICES,
+                          (void *)(new Info_Result<cl_uint>(expect_ref))));
+    maps.insert(make_pair(CL_CONTEXT_DEVICES,
+                          (void *)(new Info_Result<cl_device_id>(device))));
+    // the reference count seems to depend on the implementation
+    expect_ref = NO_STANDARD_REF;
+    maps.insert(make_pair(CL_CONTEXT_REFERENCE_COUNT,
+                          (void *)(new Info_Result<>(((cl_uint)expect_ref)))));
+
+    maps.insert(make_pair(CL_CONTEXT_PROPERTIES,
+                          (void *)(new Info_Result<char*>(
+                                       (const char*)NULL, 100*sizeof(cl_context_properties)))));
+
+    std::for_each(maps.begin(), maps.end(), [](pair<cl_context_info, void *> x) {
+        switch (x.first) {
+        case CL_CONTEXT_NUM_DEVICES:
+            CALL_CONTEXTINFO_AND_RET(cl_uint);
+            break;
+        case CL_CONTEXT_DEVICES:
+            CALL_CONTEXTINFO_AND_RET(cl_device_id);
+            break;
+        case CL_CONTEXT_REFERENCE_COUNT:
+            CALL_CONTEXTINFO_AND_RET(cl_uint);
+            break;
+        case CL_CONTEXT_PROPERTIES:
+            CALL_CONTEXTINFO_AND_RET(char*);
+            break;
+        default:
+            break;
+        }
+    });
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_context_info);
+
+/* ***************************************************** *
+ * clGetKernelInfo                                      *
+ * ***************************************************** */
+#define CALL_KERNELINFO_AND_RET(TYPE) CALL_INFO_AND_RET(TYPE, clGetKernelInfo, kernel)
+
+void get_kernel_info(void)
+{
+    /* Use the compiler_fabs kernel to exercise the kernel queries. */
+    const size_t n = 16;
+    map<cl_kernel_info, void *> maps;
+    int expect_ref;
+
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+    OCL_CREATE_KERNEL("compiler_fabs");
+
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+    // No kernel launch is needed; the queries below only inspect the kernel object.
+
+    maps.insert(make_pair(CL_KERNEL_PROGRAM,
+                          (void *)(new Info_Result<cl_program>(program))));
+    maps.insert(make_pair(CL_KERNEL_CONTEXT,
+                          (void *)(new Info_Result<cl_context>(ctx))));
+    // the reference count seems to depend on the implementation
+    expect_ref = NO_STANDARD_REF;
+    maps.insert(make_pair(CL_KERNEL_REFERENCE_COUNT,
+                          (void *)(new Info_Result<>(((cl_uint)expect_ref)))));
+
+    expect_ref = 2;
+    maps.insert(make_pair(CL_KERNEL_NUM_ARGS,
+                          (void *)(new Info_Result<cl_uint>(expect_ref))));
+
+    const char * expected_name = "compiler_fabs";
+    maps.insert(make_pair(CL_KERNEL_FUNCTION_NAME,
+                          (void *)(new Info_Result<char*>(expected_name, strlen(expected_name)+1))));
+
+    std::for_each(maps.begin(), maps.end(), [](pair<cl_kernel_info, void *> x) {
+        switch (x.first) {
+        case CL_KERNEL_PROGRAM:
+            CALL_KERNELINFO_AND_RET(cl_program);
+            break;
+        case CL_KERNEL_CONTEXT:
+            CALL_KERNELINFO_AND_RET(cl_context);
+            break;
+        case CL_KERNEL_REFERENCE_COUNT:
+            CALL_KERNELINFO_AND_RET(cl_uint);
+            break;
+        case CL_KERNEL_NUM_ARGS:
+            CALL_KERNELINFO_AND_RET(cl_uint);
+            break;
+        case CL_KERNEL_FUNCTION_NAME:
+            CALL_KERNELINFO_AND_RET(char*);
+            break;
+        default:
+            break;
+        }
+    });
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_kernel_info);
+
+/* ***************************************************** *
+ * clGetImageInfo                                        *
+ * ***************************************************** */
+void get_image_info(void)
+{
+  const size_t w = 512;
+  const size_t h = 512;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+  desc.image_slice_pitch = 0;
+  desc.num_mip_levels = 0;
+  desc.num_samples = 0;
+  desc.buffer = NULL;
+
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+  cl_mem image = buf[0];
+
+  cl_image_format ret_format;
+  OCL_CALL(clGetImageInfo, image, CL_IMAGE_FORMAT, sizeof(ret_format), &ret_format, NULL);
+  OCL_ASSERT(format.image_channel_order == ret_format.image_channel_order);
+  OCL_ASSERT(format.image_channel_data_type == ret_format.image_channel_data_type);
+
+  size_t element_size;
+  OCL_CALL(clGetImageInfo, image, CL_IMAGE_ELEMENT_SIZE, sizeof(element_size), &element_size, NULL);
+  OCL_ASSERT(element_size == 4);
+
+  size_t row_pitch;
+  OCL_CALL(clGetImageInfo, image, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
+  OCL_ASSERT(row_pitch == 4 * w);
+
+  size_t slice_pitch;
+  OCL_CALL(clGetImageInfo, image, CL_IMAGE_SLICE_PITCH, sizeof(slice_pitch), &slice_pitch, NULL);
+  OCL_ASSERT(slice_pitch == 0);
+
+  size_t width;
+  OCL_CALL(clGetImageInfo, image, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
+  OCL_ASSERT(width == w);
+
+  size_t height;
+  OCL_CALL(clGetImageInfo, image, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
+  OCL_ASSERT(height == h);
+
+  size_t depth;
+  OCL_CALL(clGetImageInfo, image, CL_IMAGE_DEPTH, sizeof(depth), &depth, NULL);
+  OCL_ASSERT(depth == 0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_image_info);
+
+/* ***************************************************** *
+ * clGetMemObjectInfo                                    *
+ * ***************************************************** */
+#define CALL_GETMEMINFO_AND_RET(TYPE) CALL_INFO_AND_RET(TYPE, clGetMemObjectInfo, (buf[0]))
+
+void get_mem_info(void)
+{
+    map<cl_mem_info, void *> maps;
+    int expect_ref;
+    cl_mem sub_buf;
+    cl_int error;
+
+    OCL_CREATE_BUFFER(buf[1], 0, 4096, NULL);
+
+    cl_buffer_region region;
+    region.origin = 1024;
+    region.size = 2048;
+    sub_buf = clCreateSubBuffer(buf[1], 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &error );
+    buf[0] = sub_buf;
+    OCL_ASSERT(error == CL_SUCCESS);
+
+    void * map_ptr = clEnqueueMapBuffer(queue, buf[0], 1, CL_MAP_READ, 0, 64, 0, NULL, NULL, NULL);
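+    /* The sub-buffer is mapped exactly once above, so the CL_MEM_MAP_COUNT
+     * queried below is expected to be 1. */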
+
+    expect_ref = CL_MEM_OBJECT_BUFFER;
+    maps.insert(make_pair(CL_MEM_TYPE,
+                          (void *)(new Info_Result<cl_mem_object_type>((cl_mem_object_type)expect_ref))));
+    expect_ref = 0;
+    maps.insert(make_pair(CL_MEM_FLAGS,
+                          (void *)(new Info_Result<cl_mem_flags>(expect_ref))));
+    expect_ref = 2048;
+    maps.insert(make_pair(CL_MEM_SIZE,
+                          (void *)(new Info_Result<size_t>(((size_t)expect_ref)))));
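+    /* For a sub-buffer, CL_MEM_HOST_PTR reports the parent's host_ptr plus
+     * the region origin; the parent was created without a host pointer, so
+     * only the 1024-byte origin is expected here. */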
+    expect_ref = 1024;
+    maps.insert(make_pair(CL_MEM_HOST_PTR,
+                          (void *)(new Info_Result<size_t>(((size_t)expect_ref)))));
+    expect_ref = 1;
+    maps.insert(make_pair(CL_MEM_MAP_COUNT,
+                          (void *)(new Info_Result<cl_uint>(((cl_uint)expect_ref)))));
+    expect_ref = 1;
+    maps.insert(make_pair(CL_MEM_REFERENCE_COUNT,
+                          (void *)(new Info_Result<cl_uint>(((cl_uint)expect_ref)))));
+    maps.insert(make_pair(CL_MEM_CONTEXT,
+                          (void *)(new Info_Result<cl_context>(((cl_context)ctx)))));
+    maps.insert(make_pair(CL_MEM_ASSOCIATED_MEMOBJECT,
+                          (void *)(new Info_Result<cl_mem>(((cl_mem)buf[1])))));
+    expect_ref = 1024;
+    maps.insert(make_pair(CL_MEM_OFFSET,
+                          (void *)(new Info_Result<size_t>(((size_t)expect_ref)))));
+
+    std::for_each(maps.begin(), maps.end(), [](pair<cl_mem_info, void *> x) {
+        switch (x.first) {
+        case CL_MEM_TYPE:
+            CALL_GETMEMINFO_AND_RET(cl_mem_object_type);
+            break;
+        case CL_MEM_FLAGS:
+            CALL_GETMEMINFO_AND_RET(cl_mem_flags);
+            break;
+        case CL_MEM_SIZE:
+            CALL_GETMEMINFO_AND_RET(size_t);
+            break;
+        case CL_MEM_HOST_PTR:
+            CALL_GETMEMINFO_AND_RET(size_t);
+            break;
+        case CL_MEM_MAP_COUNT:
+            CALL_GETMEMINFO_AND_RET(cl_uint);
+            break;
+        case CL_MEM_REFERENCE_COUNT:
+            CALL_GETMEMINFO_AND_RET(cl_uint);
+            break;
+        case CL_MEM_CONTEXT:
+            CALL_GETMEMINFO_AND_RET(cl_context);
+            break;
+        case CL_MEM_ASSOCIATED_MEMOBJECT:
+            CALL_GETMEMINFO_AND_RET(cl_mem);
+            break;
+        case CL_MEM_OFFSET:
+            CALL_GETMEMINFO_AND_RET(size_t);
+            break;
+
+        default:
+            break;
+        }
+    });
+
+    clEnqueueUnmapMemObject(queue, buf[0], map_ptr, 0, NULL, NULL);
+}
+
+MAKE_UTEST_FROM_FUNCTION(get_mem_info);
diff --git a/utests/image_1D_buffer.cpp b/utests/image_1D_buffer.cpp
new file mode 100644
index 0000000..d8d761f
--- /dev/null
+++ b/utests/image_1D_buffer.cpp
@@ -0,0 +1,80 @@
+#include <string.h>
+#include <stdlib.h>
+#include "utest_helper.hpp"
+
+void image_1D_buffer(void)
+{
+  size_t buffer_sz = 1024;
+  char *buf_content = (char *)malloc(buffer_sz * sizeof(char));
+  int error;
+  cl_image_desc image_desc;
+  cl_image_format image_format;
+  cl_sampler sampler;
+  cl_mem image1, image2;
+  cl_mem ret_mem = NULL;
+
+  OCL_CREATE_KERNEL("image_1D_buffer");
+
+  for (int32_t i = 0; i < (int32_t)buffer_sz; ++i)
+    buf_content[i] = (rand() & 127);
+
+  cl_mem buff = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                      buffer_sz, buf_content, &error);
+  OCL_ASSERT(error == CL_SUCCESS);
+
+  memset(&image_desc, 0x0, sizeof(cl_image_desc));
+  memset(&image_format, 0x0, sizeof(cl_image_format));
+
+  image_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+  image_desc.image_row_pitch = buffer_sz;
+  image_desc.image_width = buffer_sz / sizeof(uint32_t); // CL_RGBA + CL_UNSIGNED_INT8: 4 bytes per pixel
+  image_desc.buffer = buff;
+
+  image_format.image_channel_order = CL_RGBA;
+  image_format.image_channel_data_type = CL_UNSIGNED_INT8;
+
+  image1 = clCreateImage(ctx, CL_MEM_READ_ONLY, &image_format,
+                        &image_desc, NULL, &error );
+  OCL_ASSERT(error == CL_SUCCESS);
+
+  error = clGetImageInfo(image1, CL_IMAGE_BUFFER, sizeof(ret_mem), &ret_mem, NULL);
+  OCL_ASSERT(error == CL_SUCCESS);
+  OCL_ASSERT(ret_mem == buff);
+
+
+  memset(&image_desc, 0x0, sizeof(cl_image_desc));
+  image_desc.image_type = CL_MEM_OBJECT_IMAGE1D;
+  image_desc.image_width = buffer_sz / sizeof(uint32_t);
+  image2 = clCreateImage(ctx, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
+                         &image_format, &image_desc, buf_content, &error);
+  OCL_ASSERT(error == CL_SUCCESS);
+
+  // Create sampler to use
+  sampler = clCreateSampler(ctx, false, CL_ADDRESS_NONE, CL_FILTER_NEAREST, &error );
+  OCL_ASSERT(error == CL_SUCCESS);
+
+  cl_mem result_buf = buf[0] = clCreateBuffer(ctx, 0, buffer_sz, NULL, &error);
+  OCL_ASSERT(error == CL_SUCCESS);
+
+  OCL_SET_ARG(0, sizeof(cl_mem), &image1);
+  OCL_SET_ARG(1, sizeof(cl_mem), &image2);
+  OCL_SET_ARG(2, sizeof(sampler), &sampler);
+  OCL_SET_ARG(3, sizeof(cl_mem), &result_buf);
+
+  globals[0] = buffer_sz/sizeof(int32_t);
+  locals[0] = 16;
+
+  OCL_NDRANGE(1);
+
+  /* Now check the result. */
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < buffer_sz/sizeof(int32_t); i++)
+    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == 1);
+  OCL_UNMAP_BUFFER(0);
+
+  clReleaseSampler(sampler);
+  clReleaseMemObject(image1);
+  clReleaseMemObject(image2);
+  clReleaseMemObject(buff);
+  free(buf_content);
+}
+
+MAKE_UTEST_FROM_FUNCTION(image_1D_buffer);
diff --git a/utests/load_program_from_bin_file.cpp b/utests/load_program_from_bin_file.cpp
new file mode 100644
index 0000000..feefacc
--- /dev/null
+++ b/utests/load_program_from_bin_file.cpp
@@ -0,0 +1,77 @@
+#include "utest_helper.hpp"
+#include "utest_file_map.hpp"
+#include <cmath>
+#include <algorithm>
+
+using namespace std;
+
+static void cpu(int global_id, float *src, float *dst) {
+    dst[global_id] = ceilf(src[global_id]);
+}
+
+static void test_load_program_from_bin_file(void)
+{
+    const size_t n = 16;
+    float cpu_dst[16], cpu_src[16];
+    cl_int status;
+    cl_int binary_status;
+    char *ker_path = NULL;
+
+    cl_file_map_t *fm = cl_file_map_new();
+    ker_path = cl_do_kiss_path("compiler_ceil.bin", device);
+    OCL_ASSERT (cl_file_map_open(fm, ker_path) == CL_FILE_MAP_SUCCESS);
+
+    const unsigned char *src = (const unsigned char *)cl_file_map_begin(fm);
+    const size_t sz = cl_file_map_size(fm);
+
+    program = clCreateProgramWithBinary(ctx, 1,
+              &device, &sz, &src, &binary_status, &status);
+
+    OCL_ASSERT(program && status == CL_SUCCESS);
+
+    /* OpenCL requires building the program even when it is created from a binary. */
+    OCL_ASSERT(clBuildProgram(program, 1, &device, NULL, NULL, NULL) == CL_SUCCESS);
+
+    kernel = clCreateKernel(program, "compiler_ceil", &status);
+    OCL_ASSERT(status == CL_SUCCESS);
+
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+    globals[0] = 16;
+    locals[0] = 16;
+
+    // Run random tests
+    for (uint32_t pass = 0; pass < 8; ++pass) {
+        OCL_MAP_BUFFER(0);
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+        OCL_UNMAP_BUFFER(0);
+
+        // Run the kernel on GPU
+        OCL_NDRANGE(1);
+
+        // Run on CPU
+        for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+        // Compare
+        OCL_MAP_BUFFER(1);
+
+#if 0
+        printf("#### GPU:\n");
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            printf(" %f", ((float *)buf_data[1])[i]);
+        printf("\n#### CPU:\n");
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            printf(" %f", cpu_dst[i]);
+        printf("\n");
+#endif
+
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
+        OCL_UNMAP_BUFFER(1);
+    }
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_load_program_from_bin_file);
diff --git a/utests/load_program_from_gen_bin.cpp b/utests/load_program_from_gen_bin.cpp
new file mode 100644
index 0000000..3db13b2
--- /dev/null
+++ b/utests/load_program_from_gen_bin.cpp
@@ -0,0 +1,93 @@
+#include "utest_helper.hpp"
+#include "utest_file_map.hpp"
+#include <cmath>
+#include <algorithm>
+
+using namespace std;
+
+static void cpu(int global_id, float *src, float *dst) {
+    dst[global_id] = ceilf(src[global_id]);
+}
+
+static void test_load_program_from_gen_bin(void)
+{
+    const size_t n = 16;
+    float cpu_dst[16], cpu_src[16];
+    cl_int status;
+    cl_int binary_status;
+    char *ker_path = NULL;
+
+    cl_file_map_t *fm = cl_file_map_new();
+    ker_path = cl_do_kiss_path("compiler_ceil.cl", device);
+    OCL_ASSERT (cl_file_map_open(fm, ker_path) == CL_FILE_MAP_SUCCESS);
+
+    const char *src = (const char *)cl_file_map_begin(fm);
+
+    program = clCreateProgramWithSource(ctx, 1, &src, NULL, &status);
+
+    OCL_ASSERT(program && status == CL_SUCCESS);
+
+    /* Build the program created from source so its binary can be queried below. */
+    OCL_ASSERT(clBuildProgram(program, 1, &device, NULL, NULL, NULL) == CL_SUCCESS);
+
+    size_t      binarySize;
+    unsigned char *binary = NULL;
+
+    status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof( binarySize ), &binarySize, NULL );
+    OCL_ASSERT(status == CL_SUCCESS);
+    // Create a buffer and get the gen binary
+    binary = (unsigned char*)malloc(sizeof(unsigned char)*binarySize);
+    OCL_ASSERT(binary != NULL);
+
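+    // CL_PROGRAM_BINARIES expects an array of output pointers, one per
+    // device; with a single device, &binary acts as a one-element array.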
+    status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof( &binary), &binary, NULL );
+    OCL_ASSERT(status == CL_SUCCESS);
+
+    cl_program bin_program = clCreateProgramWithBinary(ctx, 1,
+              &device, &binarySize, (const unsigned char**)&binary, &binary_status, &status);
+    OCL_ASSERT(bin_program && status == CL_SUCCESS);
+    /* OpenCL requires building the program even when it is created from a binary. */
+    OCL_ASSERT(clBuildProgram(bin_program, 1, &device, NULL, NULL, NULL) == CL_SUCCESS);
+
+    kernel = clCreateKernel(bin_program, "compiler_ceil", &status);
+    OCL_ASSERT(status == CL_SUCCESS);
+
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+    globals[0] = 16;
+    locals[0] = 16;
+
+    // Run random tests
+    for (uint32_t pass = 0; pass < 8; ++pass) {
+        OCL_MAP_BUFFER(0);
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+        OCL_UNMAP_BUFFER(0);
+
+        // Run the kernel on GPU
+        OCL_NDRANGE(1);
+
+        // Run on CPU
+        for (int32_t i = 0; i < (int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+        // Compare
+        OCL_MAP_BUFFER(1);
+
+#if 0
+        printf("#### GPU:\n");
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            printf(" %f", ((float *)buf_data[1])[i]);
+        printf("\n#### CPU:\n");
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            printf(" %f", cpu_dst[i]);
+        printf("\n");
+#endif
+
+        for (int32_t i = 0; i < (int32_t) n; ++i)
+            OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
+        OCL_UNMAP_BUFFER(1);
+    }
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_load_program_from_gen_bin);
diff --git a/utests/my_test.cpp b/utests/my_test.cpp
new file mode 100644
index 0000000..73a4718
--- /dev/null
+++ b/utests/my_test.cpp
@@ -0,0 +1,99 @@
+#include "utest_helper.hpp"
+
+struct seg {
+	unsigned int end, color, offset;
+	seg(int e, int c):end(e), color(c) {}
+};
+typedef struct seg seg;
+
+typedef struct {
+	std::vector<seg> segs;
+} rle_data;
+
+struct rle_image {
+	int width, height;
+	std::vector<rle_data> data;
+	rle_image(int w, int h):width(w), height(h) {}
+};
+typedef struct rle_image rle_image;
+
+static void read_data(const char *filename, rle_image &image)
+{
+	FILE *fp;
+	char line[4096];
+	int i;
+	fp = fopen(filename, "r");
+	if (fp == NULL)
+		return;
+	for (i = 0; i < image.height; i++) {
+		char *nptr = line, *endptr;
+		rle_data d;
+		int start = 0;
+		if (fgets(line, sizeof(line), fp) == NULL)
+			break;
+		for (;;) {
+			int len = strtol(nptr, &endptr, 10);
+			nptr = endptr;
+			int color = strtol(nptr, &endptr, 10);
+			nptr = endptr;
+			seg s(start + len, color);
+			d.segs.push_back(s);
+			if (*endptr == '\n' || *endptr == 0)
+				break;
+			start += len;
+		}
+		image.data.push_back(d);
+	}
+	fclose(fp);
+}
+
+static void prepare_rle_buffer(rle_image &image, std::vector<int> &rle_buffer, int *offsets)
+{
+	int offset = 0;
+	for (int i = 0; i < image.height; i++) {
+		unsigned int j;
+		rle_data d = image.data[i];
+		for (j = 0; j < d.segs.size(); j++) {
+			rle_buffer.push_back(d.segs[j].end);
+			rle_buffer.push_back(d.segs[j].color);
+		}
+		offsets[i] = offset;
+		offset += j;
+	}
+
+}
+
+static void expand_rle(rle_image &image)
+{
+	std::vector<int> rle_buffer;
+	int offsets[image.height];
+	int w = image.width/16;
+	prepare_rle_buffer(image, rle_buffer, offsets);
+	OCL_CREATE_KERNEL("my_test");
+	OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, 2*sizeof(int)*rle_buffer.size(), &rle_buffer[0]);
+	OCL_CREATE_BUFFER(buf[1], CL_MEM_COPY_HOST_PTR, sizeof(int)*image.height, offsets);
+	OCL_CREATE_BUFFER(buf[2], 0, image.width*image.height, NULL);
+	OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+	OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+	OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+	OCL_SET_ARG(3, sizeof(w), &w);
+
+	globals[0] = image.height;
+	locals[0] = 16;
+	OCL_NDRANGE(1);
+#if 1
+	OCL_MAP_BUFFER(2);
+	for (int i = 0; i < image.height; i++) {
+		for (int j = 0; j < image.width; j++)
+			printf("%d ", ((unsigned char*)buf_data[2])[i*image.width+j]);
+		printf("\n****\n");
+	}
+	OCL_UNMAP_BUFFER(2);
+#endif
+}
+
+static void my_test(void)
+{
+	rle_image image(256, 256);
+	read_data("new_data.txt", image);
+	expand_rle(image);
+}
+MAKE_UTEST_FROM_FUNCTION(my_test);
diff --git a/utests/new_data.txt b/utests/new_data.txt
new file mode 100644
index 0000000..b12bb13
--- /dev/null
+++ b/utests/new_data.txt
@@ -0,0 +1,256 @@
+6 5 3 4 37 15 10 2 200 3
+156 1 97 200 3 3
+2 1 2 10 128 2 124 25
+5 5 251 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 1
+256 2
+256 3
+256 0
+256 0
+256 0
+256 1
+256 2
+256 3
+256 0
+256 0
+256 0
+256 0
+256 0
+256 4
+256 5
+256 6
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 3
+100 255 100 155 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 0 100 255 56 0
+100 253 100 255 56 0
+56 0 20 8 180 9
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+256 0
+1 253 5 252 150 168 100 254
+150 168 100 254 1 253 5 252
diff --git a/utests/profiling_exec.cpp b/utests/profiling_exec.cpp
new file mode 100644
index 0000000..afa55ba
--- /dev/null
+++ b/utests/profiling_exec.cpp
@@ -0,0 +1,102 @@
+#include "utest_helper.hpp"
+#include "string.h"
+
+static void cpu_exec (int n, float* src, float* dst)
+{
+    int i = 0;
+    for (; i < n; i++) {
+	float f = src[i];
+	f = f < 0 ? -f : f;
+	dst[i] = f;
+    }
+}
+
+#define QUEUE_SECONDS_LIMIT 10
+#define SUBMIT_SECONDS_LIMIT 20
+#define COMMAND_SECONDS_LIMIT 10
+
+static void check_profiling_time(cl_ulong queued, cl_ulong submit, cl_ulong start, cl_ulong end)
+{
+    size_t profiling_resolution = 0;
+    OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_PROFILING_TIMER_RESOLUTION,
+             sizeof(profiling_resolution), &profiling_resolution, NULL);
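+    /* The resolution is the device timer granularity in nanoseconds; the
+     * event timestamps checked below are raw nanosecond values as well. */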
+
+    /* Convert the nanosecond timestamps to seconds. */
+    double queue_to_submit = (double)(submit - queued)*1e-9;
+    double submit_to_start = (double)(start - submit)*1e-9;
+    double start_to_end = (double)(end - start)*1e-9;
+
+    //printf("Profiling info:\n");
+    //printf("Time from queue to submit : %fms\n", (double)(queue_to_submit) * 1000.f );
+    //printf( "Time from submit to start : %fms\n", (double)(submit_to_start) * 1000.f );
+    //printf( "Time from start to end: %fms\n", (double)(start_to_end) * 1000.f );
+
+    OCL_ASSERTM(queued <= submit, "Enqueue time is later than submit time, invalid\n");
+    OCL_ASSERTM(submit <= start, "Submit time is later than start time, invalid\n");
+    OCL_ASSERTM(start <= end, "Start time is later than end time, invalid\n");
+
+    OCL_ASSERTM(queue_to_submit <= QUEUE_SECONDS_LIMIT, "Too large time from queue to submit\n");
+    OCL_ASSERTM(submit_to_start <= SUBMIT_SECONDS_LIMIT, "Too large time from submit to start\n");
+    OCL_ASSERTM(start_to_end <= COMMAND_SECONDS_LIMIT, "Too large time from start to end\n");
+}
+
+static void profiling_exec(void)
+{
+    const size_t n = 512;
+    cl_int status = CL_SUCCESS;
+    cl_command_queue profiling_queue = NULL;
+    cl_command_queue tmp_queue = NULL;
+    float* cpu_src = (float *)malloc(n*sizeof(float));
+    float* cpu_dst = (float *)malloc(n*sizeof(float));
+    cl_event exec_event;
+    cl_ulong time_queue, time_submit, time_start, time_end;
+
+
+    /* Profiling requires CL_QUEUE_PROFILING_ENABLE, so we cannot use the default queue. */
+    profiling_queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &status);
+    OCL_ASSERT(status == CL_SUCCESS);
+
+    /* save the default queue. */
+    tmp_queue = queue;
+    queue = profiling_queue;
+
+    OCL_CREATE_KERNEL("compiler_fabs");
+
+    OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(float), NULL);
+    OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(float), NULL);
+    OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+    OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+    globals[0] = n;
+    locals[0] = 256;
+
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+	cpu_src[i] = ((float*)buf_data[0])[i] = .1f * (rand() & 15) - .75f;
+    OCL_UNMAP_BUFFER(0);
+
+    cpu_exec(n, cpu_src, cpu_dst);
+
+    // Run the kernel on GPU
+    OCL_CALL(clEnqueueNDRangeKernel, queue, kernel, 1, NULL, globals, locals, 0, NULL, &exec_event);
+    OCL_CALL(clWaitForEvents, 1, &exec_event);
+
+    OCL_CALL(clGetEventProfilingInfo, exec_event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &time_queue, NULL);
+    OCL_CALL(clGetEventProfilingInfo, exec_event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &time_submit, NULL);
+    OCL_CALL(clGetEventProfilingInfo, exec_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &time_start, NULL);
+    OCL_CALL(clGetEventProfilingInfo, exec_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &time_end, NULL);
+
+    check_profiling_time(time_queue, time_submit, time_start, time_end);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+	OCL_ASSERT(((float *)buf_data[1])[i] == cpu_dst[i]);
+    OCL_UNMAP_BUFFER(1);
+
+    queue = tmp_queue;
+    clReleaseCommandQueue(profiling_queue);
+    free(cpu_dst);
+    free(cpu_src);
+}
+
+MAKE_UTEST_FROM_FUNCTION(profiling_exec);
diff --git a/utests/runtime_barrier_list.cpp b/utests/runtime_barrier_list.cpp
new file mode 100644
index 0000000..135996f
--- /dev/null
+++ b/utests/runtime_barrier_list.cpp
@@ -0,0 +1,75 @@
+#include "utest_helper.hpp"
+
+#define BUFFERSIZE  32*1024
+void runtime_barrier_list(void)
+{
+  const size_t n = BUFFERSIZE;
+  cl_int cpu_src[BUFFERSIZE];
+  cl_int cpu_src_2[BUFFERSIZE];
+  cl_event ev[5];
+  cl_int status = 0;
+  cl_int value = 34;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_event");
+  OCL_CREATE_BUFFER(buf[0], 0, BUFFERSIZE*sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, BUFFERSIZE*sizeof(int), NULL);
+
+  for(cl_uint i=0; i<BUFFERSIZE; i++)
+  {
+    cpu_src[i] = 3;
+    cpu_src_2[i] = 5;
+  }
+
+  OCL_CREATE_USER_EVENT(ev[0]);
+
+  clEnqueueWriteBuffer(queue, buf[0], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
+
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(int), &value);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 32;
+
+  clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globals, locals, 2, &ev[0], &ev[2]);
+
+  for (cl_uint i = 0; i < 3; ++i) {
+    clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+    OCL_ASSERT(status >= CL_SUBMITTED);
+  }
+
+
+  buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_TRUE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
+
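+  // The barrier orders the queue: the write to buf[1] below must not complete
+  // while user event ev[0] still gates the earlier commands, which is what
+  // the status check after OCL_FINISH() verifies.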
+  clEnqueueBarrierWithWaitList(queue, 0, NULL, &ev[3]);
+
+  clEnqueueWriteBuffer(queue, buf[1], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src_2, 0, NULL, &ev[4]);
+
+  OCL_FINISH();
+  clGetEventInfo(ev[4], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+  OCL_ASSERT(status != CL_COMPLETE);
+
+  OCL_SET_USER_EVENT_STATUS(ev[0], CL_COMPLETE);
+
+  clGetEventInfo(ev[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+  OCL_ASSERT(status == CL_COMPLETE);
+
+  OCL_FINISH();
+
+  for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+    clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+    OCL_ASSERT(status <= CL_COMPLETE);
+  }
+
+  for (uint32_t i = 0; i < n; ++i) {
+    OCL_ASSERT(((int*)buf_data[0])[i] == (int)value + 0x3);
+  }
+  clEnqueueUnmapMemObject(queue, buf[0], buf_data[0], 0, NULL, NULL);
+
+  for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+    clReleaseEvent(ev[i]);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_barrier_list);
diff --git a/utests/runtime_compile_link.cpp b/utests/runtime_compile_link.cpp
new file mode 100644
index 0000000..4a39b6a
--- /dev/null
+++ b/utests/runtime_compile_link.cpp
@@ -0,0 +1,162 @@
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+#include "utest_file_map.hpp"
+
+#define BUFFERSIZE  32*1024
+
+int init_program(const char* name, cl_context ctx, cl_program *pg )
+{
+  cl_int err;
+  char* ker_path = cl_do_kiss_path(name, device);
+
+  cl_file_map_t *fm = cl_file_map_new();
+  err = cl_file_map_open(fm, ker_path);
+  if(err != CL_FILE_MAP_SUCCESS)
+    OCL_ASSERT(0);
+  const char *src = cl_file_map_begin(fm);
+
+  *pg = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
+  free(ker_path);
+  cl_file_map_delete(fm);
+  return 0;
+
+}
+
+void runtime_compile_link(void)
+{
+
+  cl_int err;
+
+  const char* header_file_name="runtime_compile_link.h";
+  cl_program foo_pg;
+  init_program(header_file_name, ctx, &foo_pg);
+
+  const char* myinc_file_name="include/runtime_compile_link_inc.h";
+  cl_program myinc_pg;
+  init_program(myinc_file_name, ctx, &myinc_pg);
+
+  const char* file_name_A="runtime_compile_link_a.cl";
+  cl_program program_A;
+  init_program(file_name_A, ctx, &program_A);
+
+  cl_program input_headers[2] = { foo_pg, myinc_pg};
+  const char * input_header_names[2] = {header_file_name, myinc_file_name}; 
+
+  err = clCompileProgram(program_A,
+                                0, NULL, // num_devices & device_list
+                                NULL, // compile_options
+                                2, // num_input_headers
+                                input_headers,
+                                input_header_names,
+                                NULL, NULL);
+
+  OCL_ASSERT(err==CL_SUCCESS);
+  const char* file_name_B="runtime_compile_link_b.cl";
+  cl_program program_B;
+  init_program(file_name_B, ctx, &program_B);
+
+  err = clCompileProgram(program_B,
+                                0, NULL, // num_devices & device_list
+                                NULL, // compile_options
+                                2, // num_input_headers
+                                input_headers,
+                                input_header_names,
+                                NULL, NULL);
+
+  OCL_ASSERT(err==CL_SUCCESS);
+  cl_program input_programs[2] = { program_A, program_B};
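+  // "-create-library" makes clLinkProgram produce a linkable library rather
+  // than an executable; its binary is re-imported and linked again below
+  // (without the flag) before a kernel can be created from it.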
+  cl_program linked_program = clLinkProgram(ctx, 0, NULL, "-create-library", 2, input_programs, NULL, NULL, &err);
+
+  OCL_ASSERT(linked_program != NULL);
+  OCL_ASSERT(err == CL_SUCCESS);
+  size_t      binarySize;
+  unsigned char *binary;
+
+  // Get the size of the resulting binary (only one device)
+  err= clGetProgramInfo( linked_program, CL_PROGRAM_BINARY_SIZES, sizeof( binarySize ), &binarySize, NULL );
+  OCL_ASSERT(err==CL_SUCCESS);
+
+  // Create a buffer and get the actual binary
+  binary = (unsigned char*)malloc(sizeof(unsigned char)*binarySize);
+  if (binary == NULL) {
+    OCL_ASSERT(0);
+    return ;
+  }
+
+  unsigned char *buffers[ 1 ] = { binary };
+  // Do another sanity check here first
+  size_t size;
+  cl_int loadErrors[ 1 ];
+  err = clGetProgramInfo( linked_program, CL_PROGRAM_BINARIES, 0, NULL, &size );
+  OCL_ASSERT(err==CL_SUCCESS);
+  if( size != sizeof( buffers ) ){
+    free(binary);
+    return ;
+  }
+
+  err = clGetProgramInfo( linked_program, CL_PROGRAM_BINARIES, sizeof( buffers ), &buffers, NULL );
+  OCL_ASSERT(err==CL_SUCCESS);
+
+  cl_device_id deviceID;
+  err = clGetProgramInfo( linked_program, CL_PROGRAM_DEVICES, sizeof( deviceID), &deviceID, NULL );
+  OCL_ASSERT(err==CL_SUCCESS);
+
+  cl_program program_with_binary = clCreateProgramWithBinary(ctx, 1, &deviceID, &binarySize, (const unsigned char**)buffers, loadErrors, &err);
+  OCL_ASSERT(err==CL_SUCCESS);
+
+  cl_program new_linked_program = clLinkProgram(ctx, 1, &deviceID, NULL, 1, &program_with_binary, NULL, NULL, &err);
+  OCL_ASSERT(err==CL_SUCCESS);
+  // link success, run this kernel.
+
+  const size_t n = 16;
+  int64_t src1[n], src2[n];
+
+  src1[0] = (int64_t)1 << 63, src2[0] = 0x7FFFFFFFFFFFFFFFll;
+  src1[1] = (int64_t)1 << 63, src2[1] = ((int64_t)1 << 63) | 1;
+  src1[2] = -1ll, src2[2] = 0;
+  src1[3] = ((int64_t)123 << 32) | 0x7FFFFFFF, src2[3] = ((int64_t)123 << 32) | 0x80000000;
+  src1[4] = 0x7FFFFFFFFFFFFFFFll, src2[4] = (int64_t)1 << 63;
+  src1[5] = ((int64_t)1 << 63) | 1, src2[5] = (int64_t)1 << 63;
+  src1[6] = 0, src2[6] = -1ll;
+  src1[7] = ((int64_t)123 << 32) | 0x80000000, src2[7] = ((int64_t)123 << 32) | 0x7FFFFFFF;
+  for(size_t i=8; i<n; i++) {
+    src1[i] = i;
+    src2[i] = i;
+  }
+
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(int64_t), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(int64_t), NULL);
+  OCL_MAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  memcpy(buf_data[0], src1, sizeof(src1));
+  memcpy(buf_data[1], src2, sizeof(src2));
+  OCL_UNMAP_BUFFER(0);
+  OCL_UNMAP_BUFFER(1);
+
+  kernel = clCreateKernel(new_linked_program, "runtime_compile_link_a", &err);
+
+  OCL_ASSERT(err == CL_SUCCESS);
+
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+
+  clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globals, locals, 0, NULL, NULL);
+
+  OCL_MAP_BUFFER(2);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    int64_t *dest = (int64_t *)buf_data[2];
+    int64_t x = (src1[i] < src2[i]) ? 3 : 4;
+    OCL_ASSERT(x == dest[i]);
+  }
+  OCL_UNMAP_BUFFER(2);
+  OCL_DESTROY_KERNEL_KEEP_PROGRAM(true);
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_compile_link);
diff --git a/utests/runtime_createcontext.cpp b/utests/runtime_createcontext.cpp
new file mode 100644
index 0000000..f08a189
--- /dev/null
+++ b/utests/runtime_createcontext.cpp
@@ -0,0 +1,14 @@
+#include "utest_helper.hpp"
+
+void runtime_createcontextfromtype(void) {
+  cl_int status;
+
+  cl_context ctx;
+  ctx = clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
+  if (ctx == NULL) {
+    OCL_THROW_ERROR("runtime_createcontextfromtype", status);
+  }
+  clReleaseContext(ctx);
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_createcontextfromtype);
diff --git a/utests/runtime_event.cpp b/utests/runtime_event.cpp
new file mode 100644
index 0000000..f8170a3
--- /dev/null
+++ b/utests/runtime_event.cpp
@@ -0,0 +1,60 @@
+#include "utest_helper.hpp"
+
+#define BUFFERSIZE  32*1024
+void runtime_event(void)
+{
+  const size_t n = BUFFERSIZE;
+  cl_int cpu_src[BUFFERSIZE];
+  cl_event ev[3];
+  cl_int status = 0;
+  cl_int value = 34;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_event");
+  OCL_CREATE_BUFFER(buf[0], 0, BUFFERSIZE*sizeof(int), NULL);
+
+  for(cl_uint i=0; i<BUFFERSIZE; i++)
+    cpu_src[i] = 3;
+
+  OCL_CREATE_USER_EVENT(ev[0]);
+
+  clEnqueueWriteBuffer(queue, buf[0], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
+
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(int), &value);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 32;
+  clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globals, locals, 2, &ev[0], &ev[2]);
+
+  for (cl_uint i = 0; i < 3; ++i) {
+    clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+    OCL_ASSERT(status >= CL_SUBMITTED);
+  }
+
+  buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_TRUE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
+
+  OCL_SET_USER_EVENT_STATUS(ev[0], CL_COMPLETE);
+
+  clGetEventInfo(ev[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+  OCL_ASSERT(status == CL_COMPLETE);
+
+  OCL_FINISH();
+
+  for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+    clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+    OCL_ASSERT(status <= CL_COMPLETE);
+  }
+
+  for (uint32_t i = 0; i < n; ++i) {
+    OCL_ASSERT(((int*)buf_data[0])[i] == (int)value + 0x3);
+  }
+  clEnqueueUnmapMemObject(queue, buf[0], buf_data[0], 0, NULL, NULL);
+
+  for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+    clReleaseEvent(ev[i]);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_event);
diff --git a/utests/runtime_flat_address_space.cpp b/utests/runtime_flat_address_space.cpp
new file mode 100644
index 0000000..08167c4
--- /dev/null
+++ b/utests/runtime_flat_address_space.cpp
@@ -0,0 +1,75 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_helper.hpp"
+
+int
+main(int argc, char *argv[])
+{
+  cl_mem dst[24];
+  int *dst_buffer = NULL;
+  const size_t n = 32 * 1024 * 1024;
+  const size_t global_work_size = n;
+  const size_t local_work_size = 16;
+  int status = 0;
+
+  if ((status = cl_test_init("test_write_only.cl", "test_write_only", SOURCE)) != 0)
+    goto error;
+
+  for (uint32_t j = 0; j < 24; ++j)
+  {
+    // Allocate the two buffers
+    dst[j] = clCreateBuffer(ctx, 0, n * sizeof(uint32_t), NULL, &status);
+    if (status != CL_SUCCESS) goto error;
+
+    // Set source and destination
+    OCL_CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &dst[j]);
+
+    // Run the kernel
+    OCL_CALL (clEnqueueNDRangeKernel, queue,
+                                  kernel,
+                                  1,
+                                  NULL,
+                                  &global_work_size,
+                                  &local_work_size,
+                                  0,
+                                  NULL,
+                                  NULL);
+
+    // Make sure everything ran fine
+    dst_buffer = (int *) clMapBufferIntel(dst[j], &status);
+    if (status != CL_SUCCESS)
+      goto error;
+    for (uint32_t i = 0; i < n; ++i)
+      if (dst_buffer[i] != int(i)) {
+        fprintf(stderr, "run-time flat address space failed\n");
+        exit(-1);
+      }
+    OCL_CALL (clUnmapBufferIntel, dst[j]);
+  }
+
+  for (uint32_t j = 0; j < 24; ++j) OCL_CALL (clReleaseMemObject, dst[j]);
+  cl_test_destroy();
+  printf("%i memory leaks\n", clReportUnfreedIntel());
+  assert(clReportUnfreedIntel() == 0);
+
+error:
+  return status;
+}
+
diff --git a/utests/runtime_marker_list.cpp b/utests/runtime_marker_list.cpp
new file mode 100644
index 0000000..f64b1d1
--- /dev/null
+++ b/utests/runtime_marker_list.cpp
@@ -0,0 +1,75 @@
+#include "utest_helper.hpp"
+
+#define BUFFERSIZE  32*1024
+void runtime_marker_list(void)
+{
+  const size_t n = BUFFERSIZE;
+  cl_int cpu_src[BUFFERSIZE];
+  cl_int cpu_src_2[BUFFERSIZE];
+  cl_event ev[5];
+  cl_int status = 0;
+  cl_int value = 34;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_event");
+  OCL_CREATE_BUFFER(buf[0], 0, BUFFERSIZE*sizeof(int), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, BUFFERSIZE*sizeof(int), NULL);
+
+  for(cl_uint i=0; i<BUFFERSIZE; i++)
+  {
+    cpu_src[i] = 3;
+    cpu_src_2[i] = 5;
+  }
+
+  OCL_CREATE_USER_EVENT(ev[0]);
+
+  clEnqueueWriteBuffer(queue, buf[0], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src, 1, &ev[0], &ev[1]);
+
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(int), &value);
+
+  // Run the kernel
+  globals[0] = n;
+  locals[0] = 32;
+
+  clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globals, locals, 2, &ev[0], &ev[2]);
+
+  for (cl_uint i = 0; i < 3; ++i) {
+    clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+    OCL_ASSERT(status >= CL_SUBMITTED);
+  }
+
+
+  buf_data[0] = clEnqueueMapBuffer(queue, buf[0], CL_TRUE, 0, 0, BUFFERSIZE*sizeof(int), 1, &ev[2], NULL, NULL);
+
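+  // Unlike a barrier, the marker only produces an event for the previously
+  // enqueued commands and does not block later ones, so the independent write
+  // to buf[1] below is expected to reach CL_COMPLETE after OCL_FINISH().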
+  clEnqueueMarkerWithWaitList(queue, 0, NULL, &ev[3]);
+
+  clEnqueueWriteBuffer(queue, buf[1], CL_TRUE, 0, BUFFERSIZE*sizeof(int), (void *)cpu_src_2, 0, NULL, &ev[4]);
+
+  OCL_FINISH();
+  clGetEventInfo(ev[4], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+  OCL_ASSERT(status == CL_COMPLETE);
+
+  OCL_SET_USER_EVENT_STATUS(ev[0], CL_COMPLETE);
+
+  clGetEventInfo(ev[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+  OCL_ASSERT(status == CL_COMPLETE);
+
+  OCL_FINISH();
+
+  for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+    clGetEventInfo(ev[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL);
+    OCL_ASSERT(status <= CL_COMPLETE);
+  }
+
+  for (uint32_t i = 0; i < n; ++i) {
+    OCL_ASSERT(((int*)buf_data[0])[i] == (int)value + 0x3);
+  }
+  clEnqueueUnmapMemObject(queue, buf[0], buf_data[0], 0, NULL, NULL);
+
+  for (cl_uint i = 0; i != sizeof(ev) / sizeof(cl_event); ++i) {
+    clReleaseEvent(ev[i]);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_marker_list);
diff --git a/utests/runtime_null_kernel_arg.cpp b/utests/runtime_null_kernel_arg.cpp
new file mode 100644
index 0000000..447e345
--- /dev/null
+++ b/utests/runtime_null_kernel_arg.cpp
@@ -0,0 +1,27 @@
+#include "utest_helper.hpp"
+
+void runtime_null_kernel_arg(void)
+{
+  const size_t n = 32;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("null_kernel_arg");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), NULL);
+  OCL_SET_ARG(2, sizeof(cl_mem), NULL);
+
+    // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == i);
+  OCL_UNMAP_BUFFER(0);
+}
+
+
+MAKE_UTEST_FROM_FUNCTION(runtime_null_kernel_arg);
diff --git a/utests/setenv.sh.in b/utests/setenv.sh.in
new file mode 100644
index 0000000..b0f575f
--- /dev/null
+++ b/utests/setenv.sh.in
@@ -0,0 +1,7 @@
+#!/bin/sh
+#
+export OCL_PCM_PATH=@LOCAL_PCM_OBJECT_DIR@
+export OCL_PCH_PATH=@LOCAL_PCH_OBJECT_DIR@
+export OCL_KERNEL_PATH=@CMAKE_CURRENT_SOURCE_DIR@/../kernels
+export OCL_GBE_PATH=@LOCAL_GBE_OBJECT_DIR@
+export OCL_INTERP_PATH=@LOCAL_INTERP_OBJECT_DIR@
diff --git a/utests/sub_buffer.cpp b/utests/sub_buffer.cpp
new file mode 100644
index 0000000..d32fd65
--- /dev/null
+++ b/utests/sub_buffer.cpp
@@ -0,0 +1,135 @@
+#include "utest_helper.hpp"
+
+void sub_buffer_check(void)
+{
+    cl_int error;
+    cl_ulong max_alloc_size;
+    cl_uint address_align;
+    cl_mem main_buf;
+    cl_mem sub_buf;
+    char *main_buf_content;
+    char sub_buf_content[32];
+
+    error = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_alloc_size), &max_alloc_size, NULL);
+    OCL_ASSERT(error == CL_SUCCESS);
+    error = clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(address_align ), &address_align, NULL );
+    OCL_ASSERT(error == CL_SUCCESS);
+
+    main_buf_content = (char *)malloc(sizeof(char) * max_alloc_size);
+
+    for (cl_ulong i = 0; i < max_alloc_size; i++) {
+        main_buf_content[i] = rand() & 63;
+    }
+
+    main_buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, max_alloc_size, main_buf_content, &error);
+    OCL_ASSERT(error == CL_SUCCESS);
+
+    /* Test read sub buffer. */
+    for (cl_ulong sz = 64; sz < max_alloc_size; sz*=4) {
+        for (cl_ulong off = 0; off < max_alloc_size; off += 1234) {
+            cl_buffer_region region;
+            region.origin = off;
+            region.size = sz;
+
+            sub_buf = clCreateSubBuffer(main_buf, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &error );
+
+            /* invalid size, should fail. */
+            if(off + sz > max_alloc_size) {
+                OCL_ASSERT(error != CL_SUCCESS);
+                continue;
+            }
+            /* invalid alignment, should fail. */
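+            /* CL_DEVICE_MEM_BASE_ADDR_ALIGN is reported in bits, hence the
+             * sub-buffer origin must be a multiple of address_align/8 bytes. */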
+            if(off & ((address_align/8)-1)) {
+                OCL_ASSERT(error != CL_SUCCESS);
+                continue;
+            }
+
+            OCL_ASSERT(error == CL_SUCCESS);
+
+            error = clEnqueueReadBuffer(queue, sub_buf, CL_TRUE, 0, 32, (void *)sub_buf_content, 0, NULL, NULL);
+            OCL_ASSERT(error == CL_SUCCESS);
+
+#if 0
+            printf("\nRead ########### Src buffer: \n");
+            for (int i = 0; i < 32; ++i)
+                printf(" %2.2u", main_buf_content[off + i]);
+
+            printf("\nRead ########### dst buffer: \n");
+            for (int i = 0; i < 32; ++i)
+                printf(" %2.2u", sub_buf_content[i]);
+            printf("\n");
+#endif
+            for (int i = 0; i < 32; ++i) {
+
+                if (main_buf_content[off + i] != sub_buf_content[i]) {
+                    printf ("different index is %d\n", i);
+                    OCL_ASSERT(0);
+                }
+            }
+
+        }
+    }
+
+
+    for (cl_ulong sz = 64; sz < max_alloc_size; sz*=4) {
+        for (cl_ulong off = 0; off < max_alloc_size; off += 1234) {
+            cl_buffer_region region;
+            region.origin = off;
+            region.size = sz;
+
+            sub_buf = clCreateSubBuffer(main_buf, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &error );
+
+            /* invalid size, should fail. */
+            if(off + sz > max_alloc_size) {
+                OCL_ASSERT(error != CL_SUCCESS);
+                continue;
+            }
+            /* invalid alignment, should fail. */
+            if(off & (address_align/8-1)) {
+                OCL_ASSERT(error != CL_SUCCESS);
+                continue;
+            }
+
+            OCL_ASSERT(error == CL_SUCCESS);
+
+            for (int i = 0; i < 32; i++) {
+                sub_buf_content[i] = rand() & 63;
+            }
+
+            error = clEnqueueWriteBuffer(queue, main_buf, CL_TRUE, off, 32, sub_buf_content, 0, NULL, NULL);
+            OCL_ASSERT(error == CL_SUCCESS);
+
+            void * mapped_ptr = clEnqueueMapBuffer(queue, sub_buf, CL_TRUE, (cl_map_flags)( CL_MAP_READ | CL_MAP_WRITE ),
+                    0, 32, 0, NULL, NULL, &error );
+            OCL_ASSERT(error == CL_SUCCESS);
+
+#if 0
+            printf("\nMap ########### Src buffer: \n");
+            for (int i = 0; i < 32; ++i)
+                printf(" %2.2u", sub_buf_content[i]);
+
+            printf("\nMap ########### dst buffer: \n");
+            for (int i = 0; i < 32; ++i)
+                printf(" %2.2u", ((char *)mapped_ptr)[i]);
+            printf("\n");
+#endif
+            for (int i = 0; i < 32; i++) {
+
+                if (((char *)mapped_ptr)[i] != sub_buf_content[i]) {
+                    printf ("different index is %d\n", i);
+                    OCL_ASSERT(0);
+                }
+            }
+
+            error = clEnqueueUnmapMemObject(queue, sub_buf, mapped_ptr, 0, NULL, NULL );
+            OCL_ASSERT(error == CL_SUCCESS);
+
+            clReleaseMemObject(sub_buf);
+        }
+    }
+
+    clReleaseMemObject(main_buf);
+    free(main_buf_content);
+}
+
+MAKE_UTEST_FROM_FUNCTION(sub_buffer_check);
diff --git a/utests/test_printf.cpp b/utests/test_printf.cpp
new file mode 100644
index 0000000..3601574
--- /dev/null
+++ b/utests/test_printf.cpp
@@ -0,0 +1,18 @@
+#include "utest_helper.hpp"
+
+void test_printf(void)
+{
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("test_printf");
+  globals[0] = 16;
+  locals[0] = 16;
+  globals[1] = 4;
+  locals[1] = 4;
+  globals[2] = 8;
+  locals[2] = 2;
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(3);
+}
+
+MAKE_UTEST_FROM_FUNCTION(test_printf);
diff --git a/utests/utest.cpp b/utests/utest.cpp
new file mode 100644
index 0000000..b491cae
--- /dev/null
+++ b/utests/utest.cpp
@@ -0,0 +1,183 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file utest.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "utest.hpp"
+#include "utest_helper.hpp"
+#include <vector>
+#include <string>
+#include <iostream>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <cstring>
+#include <stdlib.h>
+#include <csignal>
+
+struct signalMap
+{
+  const char* signalName;
+  int signalNum;
+};
+
+using namespace std;
+vector<UTest> *UTest::utestList = NULL;
+// Initialize and declare statistics struct
+RStatistics UTest::retStatistics;
+
+void releaseUTestList(void) { delete UTest::utestList; }
+void runSummaryAtExit(void) {
+  // If a case crashed, count it as a failure and still account for it in finishrun
+  if(UTest::retStatistics.finishrun != UTest::utestList->size()) {
+    UTest::retStatistics.finishrun++;
+    UTest::retStatistics.failCount++;
+  }
+  printf("\nsummary:\n----------\n");
+  printf("  total: %zu\n",UTest::utestList->size());
+  printf("  run: %zu\n",UTest::retStatistics.finishrun);
+  printf("  pass: %zu\n",UTest::retStatistics.passCount);
+  printf("  fail: %zu\n",UTest::retStatistics.failCount);
+  printf("  pass rate: %f\n",1-(float)UTest::retStatistics.failCount/(float)UTest::utestList->size());
+
+  releaseUTestList();
+}
+
+void signalHandler( int signum )
+{
+  const char* name = NULL;
+
+  signalMap arr[] = {
+    {"SIGILL",  SIGILL},
+    {"SIGFPE",  SIGFPE},
+    {"SIGABRT", SIGABRT},
+    {"SIGBUS",  SIGBUS},
+    {"SIGSEGV", SIGSEGV},
+    {"SIGHUP",  SIGHUP},
+    {"SIGINT",  SIGINT},
+    {"SIGQUIT", SIGQUIT},
+    {"SIGTERM", SIGTERM},
+    {NULL,      -1}
+  };
+
+  for(int i=0; arr[i].signalNum != -1 && arr[i].signalName != NULL; i++) {
+    if(arr[i].signalNum == signum)
+      name = arr[i].signalName;
+  }
+
+  printf("    Interrupt signal (%s) received.", name);
+
+  exit(signum);
+}
+
+void catch_signal(void){
+  struct sigaction sa;
+  int sigs[] = {
+    SIGILL, SIGFPE, SIGABRT, SIGBUS,
+    SIGSEGV, SIGHUP, SIGINT, SIGQUIT,
+    SIGTERM
+  };
+
+  sa.sa_handler = signalHandler;
+  sigemptyset(&sa.sa_mask);
+  sa.sa_flags = SA_RESETHAND;
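+  // SA_RESETHAND restores the default handler after the first delivery, so a
+  // fault inside signalHandler itself cannot recurse.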
+
+  for(unsigned int i = 0; i < sizeof(sigs)/sizeof(sigs[0]); ++i) {
+    if (sigaction(sigs[i], &sa, NULL) == -1)
+      perror("Could not set signal handler");
+  }
+}
+
+UTest::UTest(Function fn, const char *name, bool haveIssue, bool needDestroyProgram)
+       : fn(fn), name(name), haveIssue(haveIssue), needDestroyProgram(needDestroyProgram) {
+
+  if (utestList == NULL) {
+    utestList = new vector<UTest>;
+
+    catch_signal();
+    atexit(runSummaryAtExit);
+  }
+  utestList->push_back(*this);
+}
+
+
+static bool strequal(const char *s1, const char *s2) {
+  if (strcmp(s1, s2) == 0) return true;
+  return false;
+}
+
+void UTest::do_run(struct UTest utest){
+  // Print function name
+  printf("%s()", utest.name);
+  fflush(stdout);
+
+  // Run one case in utestList, print result [SUCCESS] or [FAILED]
+  (utest.fn)();
+}
+
+void UTest::run(const char *name) {
+  if (name == NULL) return;
+  if (utestList == NULL) return;
+
+  for (; retStatistics.finishrun < utestList->size(); ++retStatistics.finishrun) {
+    const UTest &utest = (*utestList)[retStatistics.finishrun];
+    if (utest.name == NULL || utest.fn == NULL ) continue;
+    if (strequal(utest.name, name)) {
+      do_run(utest);
+      cl_kernel_destroy(true);
+      cl_buffer_destroy();
+    }
+  }
+}
+
+void UTest::runAll(void) {
+  if (utestList == NULL) return;
+
+  for (; retStatistics.finishrun < utestList->size(); ++retStatistics.finishrun) {
+    const UTest &utest = (*utestList)[retStatistics.finishrun];
+    if (utest.fn == NULL) continue;
+    do_run(utest);
+    cl_kernel_destroy(utest.needDestroyProgram);
+    cl_buffer_destroy();
+  }
+}
+
+void UTest::runAllNoIssue(void) {
+  if (utestList == NULL) return;
+
+  for (; retStatistics.finishrun < utestList->size(); ++retStatistics.finishrun) {
+    const UTest &utest = (*utestList)[retStatistics.finishrun];
+    if (utest.fn == NULL || utest.haveIssue) continue;
+    do_run(utest);
+    cl_kernel_destroy(utest.needDestroyProgram);
+    cl_buffer_destroy();
+  }
+}
+
+void UTest::listAllCases()
+{
+  if (utestList == NULL) return;
+  for (size_t i = 0; i < utestList->size(); ++i) {
+    const UTest &utest = (*utestList)[i];
+    if (utest.fn == NULL) continue;
+    std::cout << utest.name << std::endl;
+  }
+}
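
The UTest methods above are the whole public surface of the runner; a minimal
sketch of a driver built on them could look like the following (the real utests
binary ships its own main with richer option handling, so the argument parsing
here is purely illustrative):

    #include "utest_helper.hpp"
    #include <cstring>

    int main(int argc, char *argv[])
    {
      if (cl_ocl_init() != CL_SUCCESS)     // bring up platform, device, context and queue
        return 1;
      if (argc > 1 && strcmp(argv[1], "--list") == 0)
        UTest::listAllCases();             // only print the registered case names
      else if (argc > 1)
        UTest::run(argv[1]);               // run a single case selected by name
      else
        UTest::runAllNoIssue();            // default: skip cases flagged with haveIssue
      cl_ocl_destroy();
      return 0;                            // the atexit hook prints the summary
    }
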
diff --git a/utests/utest.hpp b/utests/utest.hpp
new file mode 100644
index 0000000..375ef70
--- /dev/null
+++ b/utests/utest.hpp
@@ -0,0 +1,139 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file utest.hpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ * Provides all unit test capabilities. It is rather rudimentary but it should
+ * do the job
+ */
+#ifndef __UTEST_UTEST_HPP__
+#define __UTEST_UTEST_HPP__
+
+#include "utest_exception.hpp"
+#include <vector>
+#include <iostream>
+
+/*! struct for statistics */
+struct RStatistics
+{
+  size_t passCount;
+  size_t failCount;
+  size_t finishrun;
+};
+
+/*! Quick and dirty unit test system with registration */
+struct UTest
+{
+  /*! A unit test function to run */
+  typedef void (*Function) (void);
+  /*! Empty test */
+  UTest(void);
+  /*! Build a new unit test and append it to the unit test list */
+  UTest(Function fn, const char *name, bool haveIssue = false, bool needDestroyProgram = true);
+  /*! Function to execute */
+  Function fn;
+  /*! Name of the test */
+  const char *name;
+  /*! Indicate whether the current test case has an issue to be fixed */
+  bool haveIssue;
+  /*! Indicate whether to destroy the kernels/program. */
+  bool needDestroyProgram;
+  /*! The tests that are registered */
+  static std::vector<UTest> *utestList;
+  /*! Run the test with the given name */
+  static void run(const char *name);
+  /*! Run all the tests without known issues */
+  static void runAllNoIssue(void);
+  /*! Run all the tests */
+  static void runAll(void);
+  /*! List all test cases */
+  static void listAllCases(void);
+  /*! Statistics struct */
+  static RStatistics retStatistics;
+  /*! Actually run a single test case */
+  static void do_run(struct UTest utest);
+};
+
+/*! Register a new unit test */
+#define UTEST_REGISTER(FN) static const UTest __##FN##__(FN, #FN);
+
+#define MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(FN, KEEP_PROGRAM) \
+  static void __ANON__##FN##__(void) { UTEST_EXPECT_SUCCESS(FN()); } \
+  static const UTest __##FN##__(__ANON__##FN##__, #FN, false, !(KEEP_PROGRAM));
+
+
+/*! Turn a function into a unit test */
+#define MAKE_UTEST_FROM_FUNCTION(FN) \
+  static void __ANON__##FN##__(void) { UTEST_EXPECT_SUCCESS(FN()); } \
+  static const UTest __##FN##__(__ANON__##FN##__, #FN);
+
+/*! Register a test case which has issue to be fixed */
+#define MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(FN) \
+  static void __ANON__##FN##__(void) { UTEST_EXPECT_SUCCESS(FN()); } \
+  static const UTest __##FN##__(__ANON__##FN##__, #FN, true);
+
+/*! Turn a function into a unit performance test */
+#define MAKE_BENCHMARK_FROM_FUNCTION(FN) \
+  static void __ANON__##FN##__(void) { BENCHMARK(FN()); } \
+  static const UTest __##FN##__(__ANON__##FN##__, #FN);
+
+/*! No assert is expected */
+#define UTEST_EXPECT_SUCCESS(EXPR) \
+ do { \
+    try { \
+      EXPR; \
+      std::cout << "    [SUCCESS]" << std::endl; \
+      UTest::retStatistics.passCount += 1; \
+    } \
+    catch (Exception e) { \
+      std::cout << "    [FAILED]" << std::endl; \
+      std::cout << "    " << e.what() << std::endl; \
+      UTest::retStatistics.failCount++; \
+    } \
+  } while (0)
+
+#define UTEST_EXPECT_FAILED(EXPR) \
+ do { \
+    try { \
+      EXPR; \
+      std::cout << "    [FAILED]" << std::endl; \
+      retStatistics.failCount++; \
+    } \
+    catch (gbe::Exception e) { \
+      std::cout << "    [SUCCESS]" << std::endl; \
+      retStatistics.passCount++; \
+    } \
+  } while (0)
+
+#define BENCHMARK(EXPR) \
+ do { \
+    int ret = 0; \
+    try { \
+      ret = EXPR; \
+      printf("  %s  [SUCCESS] [Result: %d]\n", #EXPR, ret);\
+    } \
+    catch (Exception e) { \
+      std::cout << "  " << #EXPR << "    [FAILED]" << std::endl; \
+      std::cout << "    " << e.what() << std::endl; \
+    } \
+  } while (0)
+#endif /* __UTEST_UTEST_HPP__ */
+
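
To illustrate how the registration macros above tie into the helper layer, a
hypothetical test case is just a function that fails by throwing through the
assertion macros and is turned into a UTest instance at static-initialization
time. The kernel name and the checked property below are invented for this
sketch, not taken from the suite:

    #include "utest_helper.hpp"
    #include <cstdint>

    static void compiler_fill_example(void)
    {
      const uint32_t n = 32;
      OCL_CREATE_KERNEL("compiler_fill_example");        // hypothetical kernels/compiler_fill_example.cl
      OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, n * sizeof(uint32_t), NULL);
      OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
      globals[0] = n;
      locals[0] = 16;
      OCL_NDRANGE(1);                                     // enqueue a 1D NDRange
      OCL_MAP_BUFFER(0);
      for (uint32_t i = 0; i < n; ++i)
        OCL_ASSERT(((uint32_t*)buf_data[0])[i] == i);     // invented expectation: kernel writes its gid
      OCL_UNMAP_BUFFER(0);
    }

    MAKE_UTEST_FROM_FUNCTION(compiler_fill_example)

UTEST_EXPECT_SUCCESS inside the macro is what prints [SUCCESS] or [FAILED] and
updates retStatistics, so the test body itself only has to throw on error.
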
diff --git a/utests/utest_assert.cpp b/utests/utest_assert.cpp
new file mode 100644
index 0000000..f3b9a00
--- /dev/null
+++ b/utests/utest_assert.cpp
@@ -0,0 +1,41 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file utest_assert.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#include "utest_assert.hpp"
+#include "utest_exception.hpp"
+#include <cassert>
+#include <cstdlib>
+
+void onFailedAssertion(const char *msg, const char *file, const char *fn, int line)
+{
+  char lineString[256];
+  sprintf(lineString, "%i", line);
+  assert(msg != NULL && file != NULL && fn != NULL);
+  const std::string str = "Error: "
+                        + std::string(msg) + "\n  at file "
+                        + std::string(file)
+                        + ", function " + std::string(fn)
+                        + ", line " + std::string(lineString);
+  throw Exception(str);
+}
+
diff --git a/utests/utest_assert.hpp b/utests/utest_assert.hpp
new file mode 100644
index 0000000..f93f9ac
--- /dev/null
+++ b/utests/utest_assert.hpp
@@ -0,0 +1,44 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file utest_assert.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __OCL_ASSERT_HPP__
+#define __OCL_ASSERT_HPP__
+
+/*! Ensure that a condition holds. An optional message is supported */
+void onFailedAssertion(const char *msg, const char *file, const char *fn, int line);
+
+#define OCL_ASSERT(EXPR) \
+  do { \
+    if (!(EXPR)) \
+      onFailedAssertion(#EXPR, __FILE__, __FUNCTION__, __LINE__); \
+  } while (0)
+
+#define OCL_ASSERTM(EXPR, MSG) \
+  do { \
+    if (!(EXPR)) \
+      onFailedAssertion(MSG, __FILE__, __FUNCTION__, __LINE__); \
+  } while (0)
+
+#endif /* __OCL_ASSERT_HPP__ */
+
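
A small sketch of the intended split between the two wrappers: OCL_ASSERT
reports the failed expression itself, while OCL_ASSERTM lets the caller attach
a message it formatted beforehand (the buffer size and wording below are
arbitrary):

    #include "utest_assert.hpp"
    #include <cstdio>

    static void check_range(int value, int lo, int hi)
    {
      OCL_ASSERT(lo <= hi);                           // failure message is the expression text
      char msg[128];
      snprintf(msg, sizeof(msg), "value %d outside [%d, %d]", value, lo, hi);
      OCL_ASSERTM(lo <= value && value <= hi, msg);   // failure message is the formatted string
    }

Both wrappers end up in onFailedAssertion(), which throws the Exception that
the UTEST_EXPECT_* macros catch.
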
diff --git a/utests/utest_error.c b/utests/utest_error.c
new file mode 100644
index 0000000..4582a33
--- /dev/null
+++ b/utests/utest_error.c
@@ -0,0 +1,76 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_error.h"
+#include "CL/cl.h"
+
+const char *err_msg[] = {
+  [-CL_SUCCESS] = "CL_SUCCESS",
+  [-CL_DEVICE_NOT_FOUND] = "CL_DEVICE_NOT_FOUND",
+  [-CL_DEVICE_NOT_AVAILABLE] = "CL_DEVICE_NOT_AVAILABLE",
+  [-CL_COMPILER_NOT_AVAILABLE] = "CL_COMPILER_NOT_AVAILABLE",
+  [-CL_MEM_OBJECT_ALLOCATION_FAILURE] = "CL_MEM_OBJECT_ALLOCATION_FAILURE",
+  [-CL_OUT_OF_RESOURCES] = "CL_OUT_OF_RESOURCES",
+  [-CL_OUT_OF_HOST_MEMORY] = "CL_OUT_OF_HOST_MEMORY",
+  [-CL_PROFILING_INFO_NOT_AVAILABLE] = "CL_PROFILING_INFO_NOT_AVAILABLE",
+  [-CL_MEM_COPY_OVERLAP] = "CL_MEM_COPY_OVERLAP",
+  [-CL_IMAGE_FORMAT_MISMATCH] = "CL_IMAGE_FORMAT_MISMATCH",
+  [-CL_IMAGE_FORMAT_NOT_SUPPORTED] = "CL_IMAGE_FORMAT_NOT_SUPPORTED",
+  [-CL_BUILD_PROGRAM_FAILURE] = "CL_BUILD_PROGRAM_FAILURE",
+  [-CL_MAP_FAILURE] = "CL_MAP_FAILURE",
+  [-CL_MISALIGNED_SUB_BUFFER_OFFSET] = "CL_MISALIGNED_SUB_BUFFER_OFFSET",
+  [-CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST] = "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST",
+  [-CL_INVALID_VALUE] = "CL_INVALID_VALUE",
+  [-CL_INVALID_DEVICE_TYPE] = "CL_INVALID_DEVICE_TYPE",
+  [-CL_INVALID_PLATFORM] = "CL_INVALID_PLATFORM",
+  [-CL_INVALID_DEVICE] = "CL_INVALID_DEVICE",
+  [-CL_INVALID_CONTEXT] = "CL_INVALID_CONTEXT",
+  [-CL_INVALID_QUEUE_PROPERTIES] = "CL_INVALID_QUEUE_PROPERTIES",
+  [-CL_INVALID_COMMAND_QUEUE] = "CL_INVALID_COMMAND_QUEUE",
+  [-CL_INVALID_HOST_PTR] = "CL_INVALID_HOST_PTR",
+  [-CL_INVALID_MEM_OBJECT] = "CL_INVALID_MEM_OBJECT",
+  [-CL_INVALID_IMAGE_FORMAT_DESCRIPTOR] = "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
+  [-CL_INVALID_IMAGE_SIZE] = "CL_INVALID_IMAGE_SIZE",
+  [-CL_INVALID_SAMPLER] = "CL_INVALID_SAMPLER",
+  [-CL_INVALID_BINARY] = "CL_INVALID_BINARY",
+  [-CL_INVALID_BUILD_OPTIONS] = "CL_INVALID_BUILD_OPTIONS",
+  [-CL_INVALID_PROGRAM] = "CL_INVALID_PROGRAM",
+  [-CL_INVALID_PROGRAM_EXECUTABLE] = "CL_INVALID_PROGRAM_EXECUTABLE",
+  [-CL_INVALID_KERNEL_NAME] = "CL_INVALID_KERNEL_NAME",
+  [-CL_INVALID_KERNEL_DEFINITION] = "CL_INVALID_KERNEL_DEFINITION",
+  [-CL_INVALID_KERNEL] = "CL_INVALID_KERNEL",
+  [-CL_INVALID_ARG_INDEX] = "CL_INVALID_ARG_INDEX",
+  [-CL_INVALID_ARG_VALUE] = "CL_INVALID_ARG_VALUE",
+  [-CL_INVALID_ARG_SIZE] = "CL_INVALID_ARG_SIZE",
+  [-CL_INVALID_KERNEL_ARGS] = "CL_INVALID_KERNEL_ARGS",
+  [-CL_INVALID_WORK_DIMENSION] = "CL_INVALID_WORK_DIMENSION",
+  [-CL_INVALID_WORK_GROUP_SIZE] = "CL_INVALID_WORK_GROUP_SIZE",
+  [-CL_INVALID_WORK_ITEM_SIZE] = "CL_INVALID_WORK_ITEM_SIZE",
+  [-CL_INVALID_GLOBAL_OFFSET] = "CL_INVALID_GLOBAL_OFFSET",
+  [-CL_INVALID_EVENT_WAIT_LIST] = "CL_INVALID_EVENT_WAIT_LIST",
+  [-CL_INVALID_EVENT] = "CL_INVALID_EVENT",
+  [-CL_INVALID_OPERATION] = "CL_INVALID_OPERATION",
+  [-CL_INVALID_GL_OBJECT] = "CL_INVALID_GL_OBJECT",
+  [-CL_INVALID_BUFFER_SIZE] = "CL_INVALID_BUFFER_SIZE",
+  [-CL_INVALID_MIP_LEVEL] = "CL_INVALID_MIP_LEVEL",
+  [-CL_INVALID_GLOBAL_WORK_SIZE] = "CL_INVALID_GLOBAL_WORK_SIZE",
+  [-CL_INVALID_PROPERTY] = "CL_INVALID_PROPERTY"
+};
+const size_t err_msg_n = sizeof(err_msg) / sizeof(err_msg[0]);
+
diff --git a/utests/utest_error.h b/utests/utest_error.h
new file mode 100644
index 0000000..2da29b0
--- /dev/null
+++ b/utests/utest_error.h
@@ -0,0 +1,26 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __UTEST_ERROR_H__
+#define __UTEST_ERROR_H__
+#include <stdlib.h>
+extern const char *err_msg[];
+extern const size_t err_msg_n;
+#endif /* __UTEST_ERROR_H__ */
+
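
Because OpenCL status codes are zero or negative, the table above is indexed
with the negated code; that is how the helper macros later turn a raw cl_int
into a readable name. A minimal standalone use, with a bounds check added
because the table has gaps, might be:

    #include "utest_error.h"
    #include "CL/cl.h"
    #include <stdio.h>

    static void report_status(cl_int status)
    {
      /* err_msg is indexed by -status; unused slots in the table are NULL */
      if (status <= 0 && (size_t)(-status) < err_msg_n && err_msg[-status] != NULL)
        printf("status %d -> %s\n", status, err_msg[-status]);
      else
        printf("status %d -> unrecognized error code\n", status);
    }

CL_SUCCESS sits at index 0, so a zero status also resolves to a symbolic name.
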
diff --git a/utests/utest_exception.hpp b/utests/utest_exception.hpp
new file mode 100644
index 0000000..e19141f
--- /dev/null
+++ b/utests/utest_exception.hpp
@@ -0,0 +1,48 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file exception.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __UTEST_EXCEPTION_HPP__
+#define __UTEST_EXCEPTION_HPP__
+
+#include <string>
+#include <exception>
+
+/*! Exception are only used while using unit tests */
+class Exception : public std::exception
+{
+public:
+  Exception(const std::string &msg) throw() : msg(msg) {}
+  Exception(const Exception &other) throw() : msg(other.msg) {}
+  ~Exception(void) throw() {}
+  Exception &operator= (const Exception &other) throw() {
+    this->msg = other.msg;
+    return *this;
+  }
+  const char *what(void) const throw() { return msg.c_str(); }
+private:
+  std::string msg; //!< String message
+};
+
+#endif /* __UTEST_EXCEPTION_HPP__ */
+
diff --git a/utests/utest_file_map.cpp b/utests/utest_file_map.cpp
new file mode 100644
index 0000000..55b7771
--- /dev/null
+++ b/utests/utest_file_map.cpp
@@ -0,0 +1,117 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_file_map.hpp"
+#include "CL/cl.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+#include <string.h>
+#include <stdio.h>
+
+int
+cl_file_map_init(cl_file_map_t *fm)
+{
+  assert(fm);
+  memset(fm,0,sizeof(*fm));
+  return CL_SUCCESS;
+}
+
+void
+cl_file_map_destroy(cl_file_map_t *fm)
+{
+  if (fm->mapped) {
+    munmap(fm->start, fm->size);
+    fm->start = fm->stop = 0;
+    fm->size = 0;
+    fm->mapped = CL_FALSE;
+  }
+  if(fm->fd) {
+    close(fm->fd);
+    fm->fd = 0;
+  }
+  free(fm->name);
+  memset(fm,0,sizeof(*fm));
+}
+
+void
+cl_file_map_delete(cl_file_map_t *fm)
+{
+  if (fm == NULL)
+    return;
+  cl_file_map_destroy(fm);
+  free(fm);
+}
+
+cl_file_map_t*
+cl_file_map_new(void)
+{
+  cl_file_map_t *fm = NULL;
+
+  if ((fm = (cl_file_map_t *) calloc(1, sizeof(cl_file_map_t))) == NULL)
+    goto error;
+  if (cl_file_map_init(fm) != CL_SUCCESS)
+    goto error;
+
+exit:
+  return fm;
+error:
+  cl_file_map_delete(fm);
+  fm = NULL;
+  goto exit;
+}
+
+int
+cl_file_map_open(cl_file_map_t *fm, const char *name)
+{
+  int err = CL_FILE_MAP_SUCCESS;
+
+  /* Open the file */
+  fm->fd = open(name, O_RDONLY);
+  if(fm->fd < 0) {
+    err = CL_FILE_MAP_FILE_NOT_FOUND;
+    goto error;
+  }
+  if ((fm->name = (char*) calloc(strlen(name) + 1, sizeof(char))) == NULL)
+    goto error;
+  sprintf(fm->name, "%s", name);
+
+  /* Map it */
+  fm->size = lseek(fm->fd, 0, SEEK_END);
+  lseek(fm->fd, 0, SEEK_SET);
+  fm->start = mmap(0, fm->size, PROT_READ, MAP_SHARED, fm->fd, 0);
+  if(fm->start == MAP_FAILED) {
+    err = CL_FILE_MAP_FAILED_TO_MMAP;
+    goto error;
+  }
+
+  fm->stop = ((char *) fm->start) + fm->size;
+  fm->mapped = CL_TRUE;
+
+exit:
+  return err;
+error:
+  cl_file_map_destroy(fm);
+  goto exit;
+}
+
diff --git a/utests/utest_file_map.hpp b/utests/utest_file_map.hpp
new file mode 100644
index 0000000..83d79ea
--- /dev/null
+++ b/utests/utest_file_map.hpp
@@ -0,0 +1,84 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file utest_file_map.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#ifndef __UTEST_FILE_MAP_HPP__
+#define __UTEST_FILE_MAP_HPP__
+
+#include "CL/cl.h"
+#include <cstdlib>
+
+/* Map a file into memory for direct / cached / simple accesses */
+typedef struct cl_file_map {
+  void *start, *stop; /* First character and last one */
+  size_t size;        /* Total size of the file */
+  int fd;             /* Posix file descriptor */
+  cl_bool mapped;     /* Indicate if a file was mapped or not */
+  char *name;         /* File name */
+} cl_file_map_t;
+
+/* Status codes reported by an open attempt */
+enum {
+  CL_FILE_MAP_SUCCESS         = 0,
+  CL_FILE_MAP_FILE_NOT_FOUND  = 1,
+  CL_FILE_MAP_FAILED_TO_MMAP  = 2
+};
+
+/* Allocate and initialize a file mapper (but do not map any file) */
+extern cl_file_map_t *cl_file_map_new(void);
+
+/* Initialize a file mapper (but do not map any file) */
+extern int cl_file_map_init(cl_file_map_t *fm);
+
+/* Destroy but do not deallocate a file map */
+extern void cl_file_map_destroy(cl_file_map_t *fm);
+
+/* Destroy and free it */
+extern void cl_file_map_delete(cl_file_map_t *fm);
+
+/* Open a file and returns the error code */
+extern int cl_file_map_open(cl_file_map_t *fm, const char *name);
+
+static inline cl_bool
+cl_file_map_is_mapped(const cl_file_map_t *fm) {
+  return fm->mapped;
+}
+
+static inline const char*
+cl_file_map_begin(const cl_file_map_t *fm) {
+  return (const char*) fm->start;
+}
+
+static inline const char*
+cl_file_map_end(const cl_file_map_t *fm) {
+  return (const char*) fm->stop;
+}
+
+static inline size_t
+cl_file_map_size(const cl_file_map_t *fm) {
+  return fm->size;
+}
+
+#endif /* __UTEST_FILE_MAP_HPP__ */
+
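
A minimal sketch of how this mapping API is meant to be driven end to end; the
path argument is a placeholder and the only assumption is that the file exists:

    #include "utest_file_map.hpp"
    #include <cstdio>

    static void print_mapped_size(const char *path)
    {
      cl_file_map_t *fm = cl_file_map_new();
      if (fm == NULL)
        return;
      if (cl_file_map_open(fm, path) == CL_FILE_MAP_SUCCESS && cl_file_map_is_mapped(fm))
        printf("%s: %zu bytes mapped in [%p, %p)\n", path, cl_file_map_size(fm),
               (const void *) cl_file_map_begin(fm), (const void *) cl_file_map_end(fm));
      cl_file_map_delete(fm);   /* unmaps if needed, then frees the struct */
    }
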
diff --git a/utests/utest_generator.py b/utests/utest_generator.py
new file mode 100644
index 0000000..7522001
--- /dev/null
+++ b/utests/utest_generator.py
@@ -0,0 +1,387 @@
+#!/usr/bin/python
+import os,sys,re
+
+FLT_MAX_POSI='0x1.fffffep127f'
+FLT_MIN_NEGA='-0x1.fffffep127f'
+FLT_MIN_POSI='0x1.0p-126f'
+FLT_MAX_NEGA='-0x1.0p-126f'
+
+paraTypeList={'float':'%e','int':'%d','double':'%lf','uint':'%d','string':'%s'}
+
+
+def ulpUnit(ulpSize):
+  return re.findall(r"([a-zA-Z_]+)",ulpSize)[0]
+
+def ulpNum(ulpSize):
+  return re.findall(r"([0-9]+)",ulpSize)[0]
+
+def udebug(ulpSize,returnType):
+  #ulpUnit=re.findall(r"([a-zA-Z_]+)",ulpSize)[0]
+  #ulpNum=re.findall(r"([0-9]+)",ulpSize)[0]
+  text='''
+    static const char* INFORNAN;
+    static %s ULPSIZE, ULPSIZE_FACTOR;
+
+    const char* env_strict = getenv("OCL_STRICT_CONFORMANCE");
+
+    if (env_strict == NULL || strcmp(env_strict, "0") == 0)
+      ULPSIZE_FACTOR = 1000;
+    else
+      ULPSIZE_FACTOR = 1;
+    
+    if (isinf(cpu_data[index])){
+      INFORNAN="INF";
+    }
+    else if (isnan(cpu_data[index])){
+      INFORNAN="NAN";
+    }
+    else{
+       ULPSIZE=ULPSIZE_FACTOR * cl_%s((cpu_data[index] == 0) ? 1 : cpu_data[index])
+               * ((ULPSIZE_FACTOR == 1) ? %s : ( (%s == 0) ? 1 : %s));
+    }
+
+#if udebug 
+    if (isinf(cpu_data[index])){ 
+      if (isinf(gpu_data[index]))
+        printf("%s expect:%s\\n", log, INFORNAN);
+      else
+        printf_c("%s expect:%s\\n", log, INFORNAN);
+      }
+    else if (isnan(cpu_data[index])){
+      if (isnan(gpu_data[index]))
+        printf("%s expect:%s\\n", log, INFORNAN);
+      else
+        printf_c("%s expect:%s\\n", log, INFORNAN);
+      }
+    else if (diff <= ULPSIZE){
+      printf("%s expect:%s\\n", log, ULPSIZE);
+      }
+    else
+      printf_c("%s expect:%s\\n", log, ULPSIZE);
+#else
+    if (isinf(cpu_data[index])){
+      sprintf(log, "%s expect:%s\\n", log, INFORNAN);
+      OCL_ASSERTM(isinf(gpu_data[index]),log);
+      }
+    else if (isnan(cpu_data[index])){
+      sprintf(log, "%s expect:%s\\n", log, INFORNAN);
+      OCL_ASSERTM(isnan(gpu_data[index]),log);
+      }
+    else{
+      sprintf(log, "%s expect:%s\\n", log, ULPSIZE);
+      OCL_ASSERTM(fabs(gpu_data[index]-cpu_data[index]) <= ULPSIZE, log);
+      }
+#endif
+  }
+}\n'''%(returnType,\
+        ulpUnit(ulpSize),ulpNum(ulpSize),\
+        ulpNum(ulpSize), ulpNum(ulpSize),\
+        paraTypeList['string'],paraTypeList['string'],\
+        paraTypeList['string'],paraTypeList['string'],\
+        paraTypeList['string'],paraTypeList['string'],\
+        paraTypeList['string'],paraTypeList['string'],\
+        paraTypeList['string'],paraTypeList['%s'%(returnType)],\
+        paraTypeList['string'],paraTypeList['%s'%(returnType)],\
+        paraTypeList['string'],paraTypeList['string'],\
+        paraTypeList['string'],paraTypeList['string'],\
+        paraTypeList['string'],paraTypeList['%s'%(returnType)])
+
+  return text
+
+def gene2ValuesLoop(values1,values2,inputValues):
+  values2=values2+inputValues*len(inputValues)
+
+  for i in inputValues:
+    for j in range(0,len(inputValues)):
+      values1 += [i]
+
+  return values1,values2
+
+def gene3ValuesLoop(values1,values2,values3,inputValues):
+  for i in inputValues:
+    for j in range(0,len(inputValues)):
+      for k in range(0,len(inputValues)):
+        values1 += [i]
+
+  for i in inputValues:
+    for j in inputValues:
+      for k in range(0,len(inputValues)):
+        values2 += [j]
+
+  values3=inputValues*(len(inputValues)**2)
+  return values1,values2,values3
+
+class func:
+  """ This class will define all needed instance attribute in fundation a c programing file. """
+
+  def __init__(self,name,cpuFuncName,inputType,outputType,values,ulp, cpu_func=''):
+    self.funcName = name
+    self.cpuFuncName = cpuFuncName
+    self.fileName = 'builtin_'+name
+    self.inputtype = inputType
+    self.outputtype = outputType
+    self.values = values
+    self.ulp = ulp
+    self.cpufunc=cpu_func
+    self.cpplines = []
+    
+#####cpp file required information:
+    self.Head='''/*
+This file is generated by utest_generator.py.
+Usually you need NOT modify this file manually.
+But when any bug occurs, you can change the value of udebug from 0 to 1,
+which prints more values and information to assist in debugging the issue.
+*/
+
+#include "utest_helper.hpp"
+#include <stdio.h>
+#include <math.h>
+#include <algorithm>
+#include <string.h>
+
+#define udebug 0
+#define FLT_MAX 0x1.fffffep127f
+#define FLT_MIN 0x1.0p-126f
+#define INT_ULP 0
+
+#define printf_c(...) \\
+{\\
+  printf("\\033[1m\\033[40;31m");\\
+  printf( __VA_ARGS__ );\\
+  printf("\\033[0m");\\
+}
+'''
+    #########Execute class itself
+    self.geneProcess()
+
+#####Compute vector && argument type:
+  def argtype(self,paraN,index):
+    return re.findall(r"[a-zA-Z_]+",self.inputtype[paraN][index])[0]
+
+  def argvector(self,paraN,index):
+    vector=re.findall(r"[0-9]+",self.inputtype[paraN][index])
+    if vector:
+      vector=vector[0]
+    else:
+      vector=1
+    return vector
+
+  def returnVector(self,index):
+    returnVector=re.findall(r"[0-9]+",self.outputtype[index])
+    if returnVector:
+      returnVector=returnVector[0]
+    else:
+      returnVector=1
+    return returnVector
+
+  def retType(self,index):
+    return re.findall("[a-zA-Z_]+",self.outputtype[index])[0]
+
+  def inputNumFormat(self,paraN,index):
+    return paraTypeList['%s'%(self.argtype(paraN,index))]
+
+  def outputNumFormat(self,index):
+    return paraTypeList['%s'%(self.retType(index))]
+
+#####Cpu values analyse
+  def GenInputValues(self,index):
+    #namesuffix=self.inputtype[0][index]
+    for i in range(0,self.values.__len__()):
+      self.cpplines += [ "const %s input_data%d[] = {%s};" %(self.argtype(i,index),i+1,str(self.values[i]).strip('[]').replace('\'','')) ]
+    self.cpplines += [ "const int count_input = sizeof(input_data1) / sizeof(input_data1[0]);" ]
+    self.cpplines += [ "const int vector = %s;\n"%(self.argvector(self.inputtype.__len__()-1,index)) ]
+
+#####Cpu Function
+  def GenCpuCompilerMath(self,index):
+    #namesuffix=self.inputtype[0][index]
+    defline='static void cpu_compiler_math(%s *dst, '%(self.retType(index))
+    cpufunargs='('
+    funcline = ['{']
+    vectorargs=[]
+
+    if (self.returnVector(index) == 1 and self.argvector(0,index) != 1):
+      for i in range(0,self.values.__len__()):
+        defline += 'const %s *src%d'%(self.argtype(i,index),i+1)
+        defline += ( i == self.values.__len__()-1 ) and ')' or ','
+        vectorargs.append('(')
+      for i in range(0,self.values.__len__()):
+        for j in range(0,self.vector):
+          vectorargs += "x%d%d"%(i+1,j+1)
+          vectorargs += ( j == self.vector-1 ) and ');' or ','
+          funcline += ["  const %s x%d%d = *(src%d+%d);"%(self.argtype(i,index),i+1,j+1,i+1,j)]
+
+      return 0
+
+    for i in range(0,self.values.__len__()):
+      defline += 'const %s *src%d'%(self.argtype(i,index),i+1)
+      defline += ( i == self.values.__len__()-1 ) and ')' or ','
+      cpufunargs += "x%d"%(i+1)
+      cpufunargs += ( i == self.values.__len__()-1 ) and ');' or ','
+      funcline += ["  const %s x%d = *src%d;"%(self.argtype(i,index),i+1,i+1)]
+
+    funcline += [ "  dst[0] = %s%s"%(self.cpuFuncName, cpufunargs) ]
+    funcline += [ '}'] 
+
+    funcline = [defline] + funcline
+
+    self.cpplines += funcline
+#    self.writeCPP( '\n'.join(funcline), 'a', namesuffix)
+
+  def writeCPP(self,content,authority,namesuffix):
+    file_object = open("generated/%s_%s.cpp"%(self.fileName,namesuffix),authority)
+    file_object.writelines(content)
+    file_object.close()
+
+  def writeCL(self,content,authority,namesuffix):
+    file_object = open(os.getcwd()+"/../kernels/%s_%s.cl"%(self.fileName,namesuffix),authority)
+    file_object.writelines(content)
+    file_object.close()
+
+  def nameForCmake(self,content,namesuffix):
+    print("generated/%s_%s.cpp"%(self.fileName,namesuffix)),
+
+  def utestFunc(self,index):
+    funcLines=[]
+    namesuffix=self.inputtype[0][index]
+    funcline=[]
+    funchead='''
+static void %s_%s(void)
+{
+  int index;
+  %s gpu_data[count_input] = {0}, cpu_data[count_input] = {0}, diff=0.0;
+  char log[1024] = {0};
+
+  OCL_CREATE_KERNEL(\"%s_%s\");
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, count_input * sizeof(%s), NULL); 
+
+  globals[0] = count_input;
+  locals[0] = 1;
+ '''%(self.fileName,namesuffix,\
+     self.retType(index),\
+     self.fileName, namesuffix,\
+     self.retType(index))
+
+    funcline += [funchead]
+    for i in range(1,self.values.__len__()+1): 
+      funcline += ["  OCL_CREATE_BUFFER(buf[%d], CL_MEM_READ_WRITE, count_input * sizeof(%s), NULL);"%(i,self.argtype(i-1,index))]
+      funcline += ["  clEnqueueWriteBuffer( queue, buf[%d], CL_TRUE, 0, count_input * sizeof(%s), input_data%d, 0, NULL, NULL);"%(i,self.argtype(i-1,index),i)]
+
+    funcline += ["  OCL_CREATE_BUFFER(buf[%d], CL_MEM_READ_WRITE, sizeof(int), NULL);"%(self.inputtype.__len__()+1)]
+    funcline += ["  clEnqueueWriteBuffer( queue, buf[%d], CL_TRUE, 0, sizeof(int), &vector, 0, NULL, NULL);"%(self.inputtype.__len__()+1)]
+
+    #0=output, 1..len=inputs, len+1=vector
+    for i in range(0,self.values.__len__()+2): 
+      funcline += ["  OCL_SET_ARG(%d, sizeof(cl_mem), &buf[%d]);"%(i,i)]
+
+    funcrun='''
+  // Run the kernel:
+  OCL_NDRANGE( 1 );
+  clEnqueueReadBuffer( queue, buf[0], CL_TRUE, 0, sizeof(%s) * count_input, gpu_data, 0, NULL, NULL);
+'''%(self.retType(index))
+    funcline += [ funcrun ]
+
+    funcsprintfa='    sprintf(log, \"'
+    funcsprintfb=''
+    if (self.returnVector(index) == 1 and self.argvector(0,index) != 1):
+      funccompare='''
+  for (index = 0; index < count_input/vector; index++)
+  {
+    cpu_compiler_math( cpu_data + index, '''
+    else:
+      funccompare='''
+  for (index = 0; index < count_input; index++)
+  {
+    cpu_compiler_math( cpu_data + index,'''
+
+    for i in range(0,self.values.__len__()):
+      funccompare += " input_data%d + index"%(i+1)
+      funccompare += (self.values.__len__() - 1 == i) and ');' or ','
+
+      funcsprintfa += "input_data%d:"%(i+1)
+      funcsprintfa += "%s "%(self.inputNumFormat(i,index))
+      funcsprintfb += " input_data%d[index],"%(i+1)
+
+    funcline += [ funccompare ]
+
+    funcsprintfa += " -> gpu:%s  cpu:%s diff:%s\","%(self.outputNumFormat(index),self.outputNumFormat(index),self.outputNumFormat(index))#,self.outputNumFormat(index))
+    funcsprintfb += " gpu_data[index], cpu_data[index], diff);"#%(ulpUnit(self.ulp),ulpNum(self.ulp))
+
+    #funcdiff = "    diff = fabs((gpu_data[index]-cpu_data[index])"
+    #funcdiff += (self.retType(index) == "int") and ');' or '/(cpu_data[index]>1?cpu_data[index]:1));'
+    valuejudge = "    if (std::fpclassify(gpu_data[index]) == FP_SUBNORMAL){ gpu_data[index] = 0; }\n"
+    valuejudge += "    if (std::fpclassify(cpu_data[index]) == FP_SUBNORMAL){ cpu_data[index] = 0; }\n"
+    funcdiff = "    diff = fabs((gpu_data[index]-cpu_data[index]));"
+    funcline += [ valuejudge ]
+    funcline += [ funcdiff ]
+    funcline += [ funcsprintfa + funcsprintfb ]
+
+    self.cpplines += funcline
+
+    self.cpplines += [ udebug(self.ulp,self.retType(index)) ]
+    self.cpplines += [ "MAKE_UTEST_FROM_FUNCTION(%s_%s)"%(self.fileName,namesuffix) ]
+
+  def genCL(self,index):
+    namesuffix=self.inputtype[0][index]
+    clLine = []
+    clhead = '__kernel void %s_%s(__global %s *dst, '%(self.fileName,namesuffix,self.retType(index))
+    clvalueDef=''
+    clcomputer=''
+    tmp=''
+
+    for i in range(0,self.values.__len__()):
+      clhead += ' __global %s *src%d,'%(self.argtype(i,index),i+1)
+      clvalueDef +=   '  %s x%d = (%s) ('%(self.inputtype[i][index],i+1,self.inputtype[i][index])
+      tmp = 'src%d[i * (*vector) + '%(i+1)
+      for j in range(0,int(self.argvector(i,index))):
+        clvalueDef += tmp + ((int(self.argvector(i-1,index)) == j+1 ) and '%d]);\n'%(j) or '%d],'%(j))
+      clcomputer += (self.values.__len__() == i+1) and 'x%d);'%(i+1) or 'x%d,'%(i+1)
+      
+    clhead += ' __global int *vector) {\n'
+    clhead += '  int i = get_global_id(0);'
+    clLine += [ clhead ]
+    clLine += [ clvalueDef ]
+    clLine += [ '  %s ret;'%(self.outputtype[index]) ]
+    clLine += [ '  ret = %s('%(self.funcName) + clcomputer ] 
+
+    if (int(self.returnVector(index)) == 1):
+      clLine += [ '  dst[i] = ret;' ]
+    else:
+      for i in range(0,int(self.returnVector(index))):
+        clLine += [ '  dst[i * (*vector) + %d] = ret[%d];'%(i,i) ]
+    clLine += [ '};' ]
+
+    self.writeCL('\n'.join(clLine),'w',namesuffix)
+  
+  def geneProcess(self):
+    for i in range(0,self.inputtype[0].__len__()):
+##########Write Cpp file          
+      namesuffix=self.inputtype[0][i]
+      self.cpplines = []
+      #The head:
+      self.cpplines += [self.Head]
+
+      #Parameters:
+      self.GenInputValues(i)
+
+      #cpu function generator:
+      self.cpplines += [self.cpufunc]
+
+      #Cpu function:
+      self.GenCpuCompilerMath(i)
+
+      #utest function
+      self.utestFunc(i)
+
+      #kernel cl
+      self.genCL(i)
+
+      #CMakelists.txt
+      self.nameForCmake(self.fileName,namesuffix)
+
+      self.writeCPP( '\n'.join(self.cpplines) ,'w',namesuffix)
+#########End
+
+#def main():
+#
+#if __name__ == "__main__":
+#  main()
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
new file mode 100644
index 0000000..cb4dd66
--- /dev/null
+++ b/utests/utest_helper.cpp
@@ -0,0 +1,674 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+#include "utest_file_map.hpp"
+#include "utest_helper.hpp"
+#include "utest_error.h"
+#include "CL/cl.h"
+#include "CL/cl_intel.h"
+
+#include <cstdio>
+#include <cstdint>
+#include <cstring>
+#include <cassert>
+#include <cmath>
+
+#define FATAL(...) \
+do { \
+  fprintf(stderr, "error: "); \
+  fprintf(stderr, __VA_ARGS__); \
+  fprintf(stderr, "\n");\
+  assert(0); \
+  exit(-1); \
+} while (0)
+
+#define FATAL_IF(COND, ...) \
+do { \
+  if (COND) FATAL(__VA_ARGS__); \
+} while (0)
+
+cl_platform_id platform = NULL;
+cl_device_id device = NULL;
+cl_context ctx = NULL;
+cl_program program = NULL;
+cl_kernel kernel = NULL;
+cl_command_queue queue = NULL;
+cl_mem buf[MAX_BUFFER_N] = {};
+void *buf_data[MAX_BUFFER_N] = {};
+size_t globals[3] = {};
+size_t locals[3] = {};
+
+#ifdef HAS_EGL
+Display    *xDisplay;
+EGLDisplay  eglDisplay;
+EGLContext  eglContext = NULL;
+EGLSurface  eglSurface;
+Window xWindow;
+
+void cl_ocl_destroy_egl_window() {
+    eglMakeCurrent(eglDisplay, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
+    eglDestroyContext(eglDisplay, eglContext);
+    eglDestroySurface(eglDisplay, eglSurface);
+    XDestroyWindow(xDisplay, xWindow);
+    XCloseDisplay(xDisplay);
+}
+
+bool init_egl_window(int width, int height) {
+    XSetWindowAttributes swa;
+    Window      win, root;
+    EGLint attr[] = {       // some attributes to set up our egl-interface
+    EGL_BUFFER_SIZE, 16,
+    EGL_RENDERABLE_TYPE,
+    EGL_OPENGL_BIT,
+    EGL_NONE
+    };
+    //// egl-contexts collect all state descriptions needed for operation
+    EGLint ctxattr[] = {
+            #if 0
+            EGL_CONTEXT_CLIENT_VERSION, 2,
+            #endif
+            EGL_NONE
+    };
+
+    EGLConfig  ecfg;
+    EGLint     numConfig;
+
+    eglContext = EGL_NO_CONTEXT;
+    xDisplay = XOpenDisplay(NULL);
+    if (xDisplay == NULL) {
+      fprintf(stderr, "Failed to open DISPLAY.\n");
+      return false;
+    }
+    root = DefaultRootWindow(xDisplay);
+    swa.event_mask = ExposureMask | PointerMotionMask | KeyPressMask;
+
+    win = XCreateWindow(
+                    xDisplay, root, 0, 0, width, height, 0,
+                    CopyFromParent, InputOutput,
+                    CopyFromParent, CWEventMask,
+                    &swa);
+    xWindow = win;
+
+    ///////  the egl part  //////////////////////////////////////////////////////////////////
+    //  egl provides an interface to connect the graphics related functionality of openGL ES
+    //  with the windowing interface and functionality of the native operating system (X11
+    //  in our case).
+
+    eglDisplay  =  eglGetDisplay( (EGLNativeDisplayType) xDisplay );
+    if ( eglDisplay == EGL_NO_DISPLAY ) {
+            fprintf(stderr, "Got no EGL display.\n");
+            return false;
+    }
+    eglBindAPI(EGL_OPENGL_API);
+    int m,n;
+    if ( !eglInitialize( eglDisplay, &m, &n ) ) {
+      fprintf(stderr, "Unable to initialize EGL\n");
+      return false;
+    }
+    if ( !eglChooseConfig( eglDisplay, attr, &ecfg, 1, &numConfig ) ) {
+      fprintf(stderr, "Failed to choose config (eglError: %d)\n", eglGetError());
+      return false;
+    }
+    if ( numConfig != 1 ) {
+      fprintf(stderr, "Didn't get exactly one config, but %d", numConfig);
+      return false;
+    }
+    eglSurface = eglCreateWindowSurface ( eglDisplay, ecfg, win, NULL );
+    if ( eglSurface == EGL_NO_SURFACE ) {
+      fprintf(stderr, "Unable to create EGL surface (eglError: %d)\n", eglGetError());
+      return false;
+    }
+    eglContext = eglCreateContext ( eglDisplay, ecfg, EGL_NO_CONTEXT, ctxattr );
+    if ( eglContext == EGL_NO_CONTEXT ) {
+      fprintf(stderr, "Unable to create EGL context (eglError: %d)\n", eglGetError());
+      return false;
+    }
+    //// associate the egl-context with the egl-surface
+    eglMakeCurrent( eglDisplay, eglSurface, eglSurface, eglContext);
+
+    glClearColor(1.0, 1.0, 1.0, 1.0);
+    glClear(GL_COLOR_BUFFER_BIT);
+    glFinish();
+    eglSwapBuffers(eglDisplay, eglSurface);
+    return true;
+}
+#endif
+
+static const char*
+cl_test_channel_order_string(cl_channel_order order)
+{
+  switch(order) {
+#define DECL_ORDER(WHICH) case CL_##WHICH: return "CL_"#WHICH
+    DECL_ORDER(R);
+    DECL_ORDER(A);
+    DECL_ORDER(RG);
+    DECL_ORDER(RA);
+    DECL_ORDER(RGB);
+    DECL_ORDER(RGBA);
+    DECL_ORDER(BGRA);
+    DECL_ORDER(ARGB);
+    DECL_ORDER(INTENSITY);
+    DECL_ORDER(LUMINANCE);
+    DECL_ORDER(Rx);
+    DECL_ORDER(RGx);
+    DECL_ORDER(RGBx);
+#undef DECL_ORDER
+    default: return "Unsupported image channel order";
+  };
+}
+
+static const char*
+cl_test_channel_type_string(cl_channel_type type)
+{
+  switch(type) {
+#define DECL_TYPE(WHICH) case CL_##WHICH: return "CL_"#WHICH
+    DECL_TYPE(SNORM_INT8);
+    DECL_TYPE(SNORM_INT16);
+    DECL_TYPE(UNORM_INT8);
+    DECL_TYPE(UNORM_INT16);
+    DECL_TYPE(UNORM_SHORT_565);
+    DECL_TYPE(UNORM_SHORT_555);
+    DECL_TYPE(UNORM_INT_101010);
+    DECL_TYPE(SIGNED_INT8);
+    DECL_TYPE(SIGNED_INT16);
+    DECL_TYPE(SIGNED_INT32);
+    DECL_TYPE(UNSIGNED_INT8);
+    DECL_TYPE(UNSIGNED_INT16);
+    DECL_TYPE(UNSIGNED_INT32);
+    DECL_TYPE(HALF_FLOAT);
+    DECL_TYPE(FLOAT);
+#undef DECL_TYPE
+    default: return "Unsupported image channel type";
+  };
+}
+
+static void
+clpanic(const char *msg, int rval)
+{
+  printf("Failed: %s (%d)\n", msg, rval);
+  exit(-1);
+}
+
+char*
+cl_do_kiss_path(const char *file, cl_device_id device)
+{
+  cl_int ver;
+  const char *sub_path = NULL;
+  char *ker_path = NULL;
+  const char *kiss_path = getenv("OCL_KERNEL_PATH");
+  size_t sz = strlen(file);
+
+  if (device == NULL)
+    sub_path = "";
+  else {
+    if (clGetGenVersionIntel(device, &ver) != CL_SUCCESS)
+      clpanic("Unable to get Gen version", -1);
+    sub_path = "";
+  }
+
+  if (kiss_path == NULL)
+    clpanic("set OCL_KERNEL_PATH. This is where the kiss kernels are", -1);
+  sz += strlen(kiss_path) + strlen(sub_path) + 2; /* +1 for end of string, +1 for '/' */
+  if ((ker_path = (char*) malloc(sz)) == NULL)
+    clpanic("Allocation failed", -1);
+  sprintf(ker_path, "%s/%s%s", kiss_path, sub_path, file);
+  return ker_path;
+}
+
+int
+cl_kernel_init(const char *file_name, const char *kernel_name, int format, const char * build_opt)
+{
+  cl_file_map_t *fm = NULL;
+  char *ker_path = NULL;
+  cl_int status = CL_SUCCESS;
+  static const char *prevFileName = NULL;
+
+  /* Load the program and build it */
+  if (!program || (program && (!prevFileName || strcmp(prevFileName, file_name)))) {
+    if (program) clReleaseProgram(program);
+    ker_path = cl_do_kiss_path(file_name, device);
+    if (format == LLVM)
+      program = clCreateProgramWithLLVMIntel(ctx, 1, &device, ker_path, &status);
+    else if (format == SOURCE) {
+      cl_file_map_t *fm = cl_file_map_new();
+      FATAL_IF (cl_file_map_open(fm, ker_path) != CL_FILE_MAP_SUCCESS,
+                "Failed to open file \"%s\" with kernel \"%s\". Did you properly set OCL_KERNEL_PATH variable?",
+                file_name, kernel_name);
+      const char *src = cl_file_map_begin(fm);
+      const size_t sz = cl_file_map_size(fm);
+      program = clCreateProgramWithSource(ctx, 1, &src, &sz, &status);
+      cl_file_map_delete(fm);
+    } else
+      FATAL("Not able to create program from binary");
+
+    if (status != CL_SUCCESS) {
+      fprintf(stderr, "error calling clCreateProgramWithBinary\n");
+      goto error;
+    }
+    prevFileName = file_name;
+  }
+  /* OpenCL requires building the program even if it is created from a binary */
+  OCL_CALL (clBuildProgram, program, 1, &device, build_opt, NULL, NULL);
+
+  /* Create a kernel from the program */
+  if (kernel)
+    clReleaseKernel(kernel);
+  kernel = clCreateKernel(program, kernel_name, &status);
+  if (status != CL_SUCCESS) {
+    fprintf(stderr, "error calling clCreateKernel\n");
+    goto error;
+  }
+
+exit:
+  free(ker_path);
+  cl_file_map_delete(fm);
+  return status;
+error:
+  prevFileName = NULL;
+  goto exit;
+}
+
+#define GET_PLATFORM_STR_INFO(LOWER_NAME, NAME) \
+  { \
+    size_t param_value_size; \
+    OCL_CALL (clGetPlatformInfo, platform, CL_PLATFORM_##NAME, 0, 0, &param_value_size); \
+    std::vector<char> param_value(param_value_size); \
+    OCL_CALL (clGetPlatformInfo, platform, CL_PLATFORM_##NAME, \
+              param_value_size, param_value.empty() ? NULL : &param_value.front(), \
+              &param_value_size); \
+    std::string str; \
+    if (!param_value.empty()) \
+      str = std::string(&param_value.front(), param_value_size-1); \
+    printf("platform_" #LOWER_NAME " \"%s\"\n", str.c_str()); \
+  }
+
+#include <cstring>
+#define GET_DEVICE_STR_INFO(LOWER_NAME, NAME) \
+    std::string LOWER_NAME ##Str; \
+    OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_##NAME, 0, 0, &param_value_size); \
+    { \
+      std::vector<char> param_value(param_value_size); \
+      OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_##NAME, \
+                param_value_size, param_value.empty() ? NULL : &param_value.front(), \
+                &param_value_size); \
+      if (!param_value.empty()) \
+        LOWER_NAME ##Str = std::string(&param_value.front(), param_value_size-1); \
+    } \
+    printf("device_" #LOWER_NAME " \"%s\"\n", LOWER_NAME ##Str.c_str());
+
+int
+cl_ocl_init(void)
+{
+  cl_int status = CL_SUCCESS;
+  cl_uint platform_n;
+  size_t i;
+#ifdef HAS_EGL
+  bool hasGLExt = false;
+#endif
+  cl_context_properties *props = NULL;
+
+  /* Get the platform number */
+  OCL_CALL (clGetPlatformIDs, 0, NULL, &platform_n);
+  printf("platform number %u\n", platform_n);
+  assert(platform_n >= 1);
+
+  /* Get a valid platform */
+  OCL_CALL (clGetPlatformIDs, 1, &platform, &platform_n);
+  GET_PLATFORM_STR_INFO(profile, PROFILE);
+  GET_PLATFORM_STR_INFO(name, NAME);
+  GET_PLATFORM_STR_INFO(vendor, VENDOR);
+  GET_PLATFORM_STR_INFO(version, VERSION);
+  GET_PLATFORM_STR_INFO(extensions, EXTENSIONS);
+
+  /* Get the device (only GPU device is supported right now) */
+  try {
+    OCL_CALL (clGetDeviceIDs, platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+    {
+      size_t param_value_size;
+      GET_DEVICE_STR_INFO(profile, PROFILE);
+      GET_DEVICE_STR_INFO(name, NAME);
+      GET_DEVICE_STR_INFO(vendor, VENDOR);
+      GET_DEVICE_STR_INFO(version, VERSION);
+      GET_DEVICE_STR_INFO(extensions, EXTENSIONS);
+      GET_DEVICE_STR_INFO(opencl_c_version, OPENCL_C_VERSION);
+#ifdef HAS_EGL
+      if (std::strstr(extensionsStr.c_str(), "cl_khr_gl_sharing")) {
+        hasGLExt = true;
+      }
+#endif
+    }
+  } catch (...) {
+     fprintf(stderr, "error calling clGetDeviceIDs\n");
+     status = CL_DEVICE_NOT_FOUND;
+     goto error;
+  }
+
+#ifdef HAS_EGL
+  if (hasGLExt) {
+    int i = 0;
+    props = new cl_context_properties[7];
+    props[i++] = CL_CONTEXT_PLATFORM;
+    props[i++] = (cl_context_properties)platform;
+    if (init_egl_window(EGL_WINDOW_WIDTH, EGL_WINDOW_HEIGHT)) {
+      props[i++] = CL_EGL_DISPLAY_KHR;
+      props[i++] = (cl_context_properties)eglGetCurrentDisplay();
+      props[i++] = CL_GL_CONTEXT_KHR;
+      props[i++] = (cl_context_properties)eglGetCurrentContext();
+    }
+    props[i++] = 0;
+  }
+#endif
+  /* Now create a context */
+  ctx = clCreateContext(props, 1, &device, NULL, NULL, &status);
+  if (status != CL_SUCCESS) {
+    fprintf(stderr, "error calling clCreateContext\n");
+    goto error;
+  }
+
+  /* List all image formats currently supported by the context */
+  cl_image_format fmt[256];
+  cl_uint fmt_n;
+  clGetSupportedImageFormats(ctx, 0, CL_MEM_OBJECT_IMAGE2D, 256, fmt, &fmt_n);
+  printf("%u image formats are supported\n", fmt_n);
+  for (i = 0; i < fmt_n; ++i)
+    printf("[%s %s]\n",
+        cl_test_channel_order_string(fmt[i].image_channel_order),
+        cl_test_channel_type_string(fmt[i].image_channel_data_type));
+
+  /* We are going to push NDRange kernels here */
+  queue = clCreateCommandQueue(ctx, device, 0, &status);
+  if (status != CL_SUCCESS) {
+    fprintf(stderr, "error calling clCreateCommandQueue\n");
+    goto error;
+  }
+
+error:
+  if (props)
+    delete[] props;
+  return status;
+}
+
+int
+cl_test_init(const char *file_name, const char *kernel_name, int format)
+{
+  cl_int status = CL_SUCCESS;
+
+  /* Initialize OCL */
+  if ((status = cl_ocl_init()) != CL_SUCCESS)
+    goto error;
+
+  /* Load the kernel */
+  if ((status = cl_kernel_init(file_name, kernel_name, format, NULL)) != CL_SUCCESS)
+    goto error;
+
+error:
+  return status;
+}
+
+void
+cl_kernel_destroy(bool needDestroyProgram)
+{
+  if (kernel) {
+    clReleaseKernel(kernel);
+    kernel = NULL;
+  }
+  if (needDestroyProgram && program) {
+    clReleaseProgram(program);
+    program = NULL;
+  }
+}
+
+void
+cl_ocl_destroy(void)
+{
+  clReleaseCommandQueue(queue);
+  clReleaseContext(ctx);
+#ifdef HAS_EGL
+  if (eglContext != NULL) {
+    cl_ocl_destroy_egl_window();
+    eglContext = NULL;
+  }
+#endif
+}
+
+void
+cl_test_destroy(void)
+{
+  cl_kernel_destroy();
+  cl_ocl_destroy();
+  printf("%i memory leaks\n", clReportUnfreedIntel());
+  assert(clReportUnfreedIntel() == 0);
+}
+
+void
+cl_buffer_destroy(void)
+{
+  int i;
+  for (i = 0; i < MAX_BUFFER_N; ++i) {
+    if (buf_data[i] != NULL) {
+      clUnmapBufferIntel(buf[i]);
+      buf_data[i] = NULL;
+    }
+    if (buf[i] != NULL) {
+      clReleaseMemObject(buf[i]);
+      buf[i] = NULL;
+    }
+  }
+}
+
+void
+cl_report_perf_counters(cl_mem perf)
+{
+  cl_int status = CL_SUCCESS;
+  uint32_t *start = NULL, *end = NULL;
+  uint32_t i;
+  if (perf == NULL)
+    return;
+  start = (uint32_t*) clMapBufferIntel(perf, &status);
+  assert(status == CL_SUCCESS && start != NULL);
+  end = start + 128;
+
+  printf("BEFORE\n");
+  for (i = 0; i < 6*8; ++i) {
+    if (i % 8 == 0) printf("\n");
+    printf("[%3u 0x%8x] ", i, start[i]);
+  }
+  printf("\n\n");
+
+  printf("AFTER\n");
+  for (i = 0; i < 6*8; ++i) {
+    if (i % 8 == 0) printf("\n");
+    printf("[%3u 0x%8x] ", i, end[i]);
+  }
+  printf("\n\n");
+
+  printf("DIFF\n");
+  for (i = 0; i < 6*8; ++i) {
+    if (i % 8 == 0) printf("\n");
+    printf("[%3u %8i] ", i, end[i] - start[i]);
+  }
+  printf("\n\n");
+
+  clUnmapBufferIntel(perf);
+}
+
+struct bmphdr {
+  //   2 bytes of magic here, "BM", total header size is 54 bytes!
+  int filesize;		//   4 total file size incl header
+  short as0, as1;		//   8 app specific
+  int bmpoffset;		//  12 offset of bmp data 
+  int headerbytes;	//  16 bytes in header from this point (40 actually)
+  int width;		//  20 
+  int height;		//  24 
+  short nplanes;		//  26 no of color planes
+  short bpp;		//  28 bits/pixel
+  int compression;	//  32 BI_RGB = 0 = no compression
+  int sizeraw;		//  36 size of raw bmp file, excluding header, incl padding
+  int hres;		//  40 horizontal resolution, pixels/meter
+  int vres;		//  44
+  int npalcolors;		//  48 No of colors in palette
+  int nimportant;		//  52 No of important colors
+  // raw b, g, r data here, dword aligned per scan line
+};
+
+int *cl_read_bmp(const char *filename, int *width, int *height)
+{
+  struct bmphdr hdr;
+  char *bmppath = cl_do_kiss_path(filename, device);
+  FILE *fp = fopen(bmppath, "rb");
+  assert(fp);
+
+  char magic[2];
+  int ret;
+  ret = fread(&magic[0], 1, 2, fp);
+  (void) ret; /* silence unused-variable warnings when asserts are compiled out */
+  assert(2 == ret);
+  assert(magic[0] == 'B' && magic[1] == 'M');
+
+  ret = fread(&hdr, sizeof(hdr), 1, fp);
+  assert(1 == ret);
+
+  assert(hdr.width > 0 && hdr.height > 0 && hdr.nplanes == 1 && hdr.compression == 0);
+
+  int *rgb32 = (int *) malloc(hdr.width * hdr.height * sizeof(int));
+  assert(rgb32);
+  int x, y;
+
+  int *dst = rgb32;
+  for (y = 0; y < hdr.height; y++) {
+    for (x = 0; x < hdr.width; x++) {
+      assert(!feof(fp));
+      int b = (getc(fp) & 0x0ff);
+      int g = (getc(fp) & 0x0ff);
+      int r = (getc(fp) & 0x0ff);
+      *dst++ = (r | (g << 8) | (b << 16) | 0xff000000);	/* abgr */
+    }
+    while (x & 3) {
+      getc(fp);
+      x++;
+    }		// each scanline padded to dword
+    // printf("read row %d\n", y);
+    // fflush(stdout);
+  }
+  fclose(fp);
+  *width = hdr.width;
+  *height = hdr.height;
+  free(bmppath);
+  return rgb32;
+}
+
+void cl_write_bmp(const int *data, int width, int height, const char *filename)
+{
+  int x, y;
+
+  FILE *fp = fopen(filename, "wb");
+  assert(fp);
+
+  char *raw = (char *) malloc(width * height * sizeof(int));	// at most
+  assert(raw);
+  char *p = raw;
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++) {
+      int c = *data++;
+      *p++ = ((c >> 16) & 0xff);
+      *p++ = ((c >> 8) & 0xff);
+      *p++ = ((c >> 0) & 0xff);
+    }
+    while (x & 3) {
+      *p++ = 0;
+      x++;
+    } // pad to dword
+  }
+  int sizeraw = p - raw;
+  int scanline = (width * 3 + 3) & ~3;
+  assert(sizeraw == scanline * height);
+
+  struct bmphdr hdr;
+
+  hdr.filesize = scanline * height + sizeof(hdr) + 2;
+  hdr.as0 = 0;
+  hdr.as1 = 0;
+  hdr.bmpoffset = sizeof(hdr) + 2;
+  hdr.headerbytes = 40;
+  hdr.width = width;
+  hdr.height = height;
+  hdr.nplanes = 1;
+  hdr.bpp = 24;
+  hdr.compression = 0;
+  hdr.sizeraw = sizeraw;
+  hdr.hres = 0;		// 2834;
+  hdr.vres = 0;		// 2834;
+  hdr.npalcolors = 0;
+  hdr.nimportant = 0;
+
+  /* Now write bmp file */
+  char magic[2] = { 'B', 'M' };
+  fwrite(&magic[0], 1, 2, fp);
+  fwrite(&hdr, 1, sizeof(hdr), fp);
+  fwrite(raw, 1, hdr.sizeraw, fp);
+
+  fclose(fp);
+  free(raw);
+}
+
+static const float pixel_threshold = 0.05f;
+static const float max_error_ratio = 0.001f;
+
+int cl_check_image(const int *img, int w, int h, const char *bmp)
+{
+  int refw, refh;
+  int *ref = cl_read_bmp(bmp, &refw, &refh);
+  if (ref == NULL || refw != w || refh != h) return 0;
+  const int n = w*h;
+  int discrepancy = 0;
+  for (int i = 0; i < n; ++i) {
+    const float r = (float) (img[i] & 0xff);
+    const float g = (float) ((img[i] >> 8) & 0xff);
+    const float b = (float) ((img[i] >> 16) & 0xff);
+    const float rr = (float) (ref[i] & 0xff);
+    const float rg = (float) ((ref[i] >> 8) & 0xff);
+    const float rb = (float) ((ref[i] >> 16) & 0xff);
+    const float dr = fabs(r-rr) / (1.f/255.f + std::max(r,rr));
+    const float dg = fabs(g-rg) / (1.f/255.f + std::max(g,rg));
+    const float db = fabs(b-rb) / (1.f/255.f + std::max(b,rb));
+    const float err = sqrtf(dr*dr+dg*dg+db*db);
+    if (err > pixel_threshold) discrepancy++;
+  }
+  free(ref);
+  return (float(discrepancy) / float(n) > max_error_ratio) ? 0 : 1;
+}
+
+const float cl_FLT_ULP(float float_number)
+{
+  SF floatBin, ulpBin, ulpBinBase;
+  floatBin.f = float_number;
+
+  ulpBin.spliter.sign     = ulpBinBase.spliter.sign     = 0;
+  ulpBin.spliter.exponent = ulpBinBase.spliter.exponent = floatBin.spliter.exponent;
+  ulpBin.spliter.mantissa = 0x1;
+  ulpBinBase.spliter.mantissa = 0x0;
+  
+  return ulpBin.f - ulpBinBase.f;
+}
+
+const int cl_INT_ULP(int int_number)
+{
+  return 0;
+}
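
cl_FLT_ULP() above returns the spacing between consecutive floats that share
the exponent of its argument. The generated math tests use it roughly as in
the following sketch when comparing a GPU result against the CPU reference;
the ulp_budget parameter stands in for the ULPSIZE_FACTOR logic in
utest_generator.py and its value is up to the caller:

    #include <cmath>

    const float cl_FLT_ULP(float float_number);       // defined above in this file

    static bool within_ulp(float gpu, float cpu, float ulp_budget)
    {
      if (std::isinf(cpu)) return std::isinf(gpu);    // infinities must match
      if (std::isnan(cpu)) return std::isnan(gpu);    // any NaN is accepted for a NaN reference
      const float ulp = cl_FLT_ULP(cpu == 0.0f ? 1.0f : cpu);
      return std::fabs(gpu - cpu) <= ulp_budget * ulp;
    }
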
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
new file mode 100644
index 0000000..de4d277
--- /dev/null
+++ b/utests/utest_helper.hpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file utest_helper.hpp
+ *
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+#ifndef __UTEST_HELPER_HPP__
+#define __UTEST_HELPER_HPP__
+
+#include "CL/cl.h"
+#include "CL/cl_intel.h"
+#include "utest.hpp"
+#include "utest_assert.hpp"
+#include "utest_error.h"
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+
+#ifdef HAS_EGL
+#define EGL_WINDOW_WIDTH 256
+#define EGL_WINDOW_HEIGHT 256
+#include  <GL/gl.h>
+#include  <EGL/egl.h>
+#include  <EGL/eglext.h>
+#include <CL/cl_gl.h>
+
+extern EGLDisplay  eglDisplay;
+extern EGLContext  eglContext;
+extern EGLSurface  eglSurface;
+#endif
+
+#define OCL_THROW_ERROR(FN, STATUS) \
+  do { \
+    char msg[2048]; \
+    sprintf(msg, "error calling %s with error %s \n", #FN, err_msg[-STATUS]); \
+    OCL_ASSERTM(false, msg); \
+  } while (0)
+
+#define OCL_CALL(FN, ...) \
+  do { \
+    int status = FN(__VA_ARGS__); \
+    if (status != CL_SUCCESS) OCL_THROW_ERROR(FN, status); \
+  } while (0)
+
+#define OCL_CREATE_KERNEL(NAME) \
+  do { \
+    OCL_CALL (cl_kernel_init, NAME".cl", NAME, SOURCE, NULL); \
+  } while (0)
+
+#define OCL_DESTROY_KERNEL_KEEP_PROGRAM(KEEP_PROGRAM) \
+  do { \
+    cl_kernel_destroy(!(KEEP_PROGRAM)); \
+  } while(0)
+
+#define OCL_CREATE_KERNEL_FROM_FILE(FILE_NAME, KERNEL_NAME) \
+  do { \
+    OCL_CALL(cl_kernel_init, FILE_NAME".cl", KERNEL_NAME, SOURCE, NULL); \
+  } while (0)
+
+#define OCL_FLUSH() \
+  do { \
+    OCL_CALL(clFlush, queue); \
+  } while(0)
+
+#define OCL_FINISH() \
+  do { \
+    OCL_CALL(clFinish, queue); \
+  } while(0)
+
+#define OCL_CALL2(FN, RET, ...) \
+  do { \
+    cl_int status; \
+    RET = FN(__VA_ARGS__, &status);\
+    if (status != CL_SUCCESS) OCL_THROW_ERROR(FN, status); \
+  } while (0)
+
+#define OCL_CREATE_BUFFER(BUFFER, FLAGS, SIZE, DATA) \
+        OCL_CALL2(clCreateBuffer, BUFFER, ctx, FLAGS, SIZE, DATA)
+
+#define OCL_CREATE_USER_EVENT(EVENT) \
+    OCL_CALL2(clCreateUserEvent, EVENT, ctx)
+
+#define OCL_SET_USER_EVENT_STATUS(EVENT, STATUS) \
+    OCL_CALL(clSetUserEventStatus, EVENT, STATUS)
+
+#define OCL_CREATE_IMAGE(IMAGE, FLAGS, FORMAT, DESC, DATA) \
+    OCL_CALL2(clCreateImage, IMAGE, ctx, FLAGS, FORMAT, DESC, DATA)
+
+#define OCL_READ_IMAGE(IMAGE, ORIGIN, REGION, DATA) \
+    OCL_CALL(clEnqueueReadImage, queue, IMAGE, CL_TRUE, ORIGIN, REGION, 0, 0, DATA, 0, NULL, NULL)
+
+#define OCL_WRITE_IMAGE(IMAGE, ORIGIN, REGION, DATA) \
+    OCL_CALL(clEnqueueWriteImage, queue, IMAGE, CL_TRUE, ORIGIN, REGION, 0, 0, DATA, 0, NULL, NULL)
+
+#define OCL_CREATE_GL_IMAGE(IMAGE, FLAGS, TARGET, LEVEL, TEXTURE) \
+    OCL_CALL2(clCreateFromGLTexture, IMAGE, ctx, FLAGS, TARGET, LEVEL, TEXTURE)
+
+#define OCL_ENQUEUE_ACQUIRE_GL_OBJECTS(ID) \
+    OCL_CALL(clEnqueueAcquireGLObjects, queue, 1, &buf[ID], 0, 0, 0)
+
+#define OCL_SWAP_EGL_BUFFERS() \
+  eglSwapBuffers(eglDisplay, eglSurface);
+
+#define OCL_CREATE_SAMPLER(SAMPLER, ADDRESS_MODE, FILTER_MODE)          \
+    OCL_CALL2(clCreateSampler, SAMPLER, ctx, 0, ADDRESS_MODE, FILTER_MODE)
+
+#define OCL_MAP_BUFFER(ID) \
+    OCL_CALL2(clMapBufferIntel, buf_data[ID], buf[ID])
+
+#define OCL_UNMAP_BUFFER(ID) \
+  do { \
+    if (buf[ID] != NULL) { \
+      OCL_CALL (clUnmapBufferIntel, buf[ID]); \
+      buf_data[ID] = NULL; \
+    } \
+  } while (0)
+
+#define OCL_MAP_BUFFER_GTT(ID) \
+    OCL_CALL2(clMapBufferGTTIntel, buf_data[ID], buf[ID])
+
+#define OCL_UNMAP_BUFFER_GTT(ID) \
+  do { \
+    if (buf[ID] != NULL) { \
+      OCL_CALL (clUnmapBufferGTTIntel, buf[ID]); \
+      buf_data[ID] = NULL; \
+    } \
+  } while (0)
+
+#define OCL_NDRANGE(DIM_N) \
+    OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, DIM_N, NULL, globals, locals, 0, NULL, NULL)
+
+#define OCL_SET_ARG(ID, SIZE, ARG) \
+    OCL_CALL (clSetKernelArg, kernel, ID, SIZE, ARG)
+
+#define OCL_CHECK_IMAGE(DATA, W, H, FILENAME) \
+  if (cl_check_image(DATA, W, H, FILENAME) == 0) \
+    OCL_ASSERTM(false, "image mismatch")
+
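+/*
+ * Typical test-case usage of the macros above (illustrative sketch only; the
+ * kernel name, n and the work-group size are hypothetical):
+ *
+ *   OCL_CREATE_KERNEL("compiler_copy_buffer");   // builds compiler_copy_buffer.cl
+ *   OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ *   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ *   globals[0] = n; locals[0] = 16;
+ *   OCL_NDRANGE(1);
+ *   OCL_MAP_BUFFER(0);                           // results visible via buf_data[0]
+ *   OCL_UNMAP_BUFFER(0);
+ */
+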
+enum { MAX_BUFFER_N = 16 };
+extern cl_platform_id platform;
+extern cl_device_id device;
+extern cl_context ctx;
+extern cl_program program;
+extern cl_kernel kernel;
+extern cl_command_queue queue;
+extern cl_mem buf[MAX_BUFFER_N];
+extern void* buf_data[MAX_BUFFER_N];
+extern size_t globals[3];
+extern size_t locals[3];
+
+enum {
+  SOURCE = 0,
+  LLVM = 1,
+  BIN = 2
+};
+
+/* SF/FLOAT give bit-level access to a float, splitting it into mantissa, exponent and sign fields */
+typedef struct
+{
+  unsigned int mantissa:23;
+  unsigned int exponent:8;
+  unsigned int sign:1;
+} FLOAT;
+
+typedef union
+{
+  float f;
+  unsigned int i;
+  FLOAT spliter;
+} SF;
+
+/* Init OpenCL */
+extern int cl_ocl_init(void);
+
+/* Init program and kernel for the test */
+extern int cl_kernel_init(const char *file_name,
+                const char *kernel_name, int format, const char * build_opt);
+
+/* Get the file path */
+extern char* cl_do_kiss_path(const char *file, cl_device_id device);
+
+/* Initialize the global variables declared above */
+extern int cl_test_init(const char *file_name, const char *kernel_name, int format);
+
+/* Unmap and release all the created buffers */
+extern void cl_buffer_destroy(void);
+
+/* Release OCL queue, context and device */
+extern void cl_ocl_destroy(void);
+
+/* Release kernel and program */
+extern void cl_kernel_destroy(bool needDestroyProgram = true);
+
+/* Release everything allocated in cl_test_init */
+extern void cl_test_destroy(void);
+
+/* Nicely output the performance counters */
+extern void cl_report_perf_counters(cl_mem perf);
+
+/* Read a bmp from file */
+extern int *cl_read_bmp(const char *filename, int *width, int *height);
+
+/* Write a bmp to a file */
+extern void cl_write_bmp(const int *data, int width, int height, const char *filename);
+
+/* Check data from img against bmp file located at "bmp" */
+extern int cl_check_image(const int *img, int w, int h, const char *bmp);
+
+/* Compute the ULP of a float value */
+extern const float cl_FLT_ULP(float float_number);
+
+/* Calculator ULP of each INT value */
+extern const int cl_INT_ULP(int int_number);
+
+#endif /* __UTEST_HELPER_HPP__ */
+
diff --git a/utests/utest_math_gen.py b/utests/utest_math_gen.py
new file mode 100755
index 0000000..30a9b24
--- /dev/null
+++ b/utests/utest_math_gen.py
@@ -0,0 +1,577 @@
+#!/usr/bin/python
+from utest_generator import *
+import os,sys
+
+#base_input_values = [80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0,6,-6,1500.24,-1500.24]
+#extend_input_values = [FLT_MAX_POSI,FLT_MIN_NEGA,FLT_MIN_POSI,FLT_MAX_NEGA,80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0,6,-6,1500.24,-1500.24]
+
+#func:
+#    gpufuncName
+#    cpuFuncName
+#    fileName: 'builtin_'+name
+#    inputtype: a 2-D list, since a function may take more than one input argument
+#    outputtype: a list
+#    values
+#    ulp
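+#    Example (mirrors the acos call generated below):
+#      acosUtests = func('acos','acos',[acos_input_type],acos_output_type,[acos_input_values],'4 * FLT_ULP')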
+
+# reduce x to [-1, 1] so that pi*x falls within [-pi, pi]
+reduce1='''
+static float reduce1( float x )
+{
+  SF fx, fy;
+  fx.f = fy.f = x;
+  int n;
+
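+  // halve x by decrementing its exponent (valid for normal, non-zero x)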
+  fy.spliter.exponent = fx.spliter.exponent - 1;
+  n = (int)fy.f;
+
+  fx.f = fx.f - 2.0 * n;
+
+  // reduce to [-1.0, 1.0]
+  fx.f = (fx.f < -1)?(fx.f + 2.0):((fx.f > 1)?(fx.f - 2.0):fx.f);
+
+  return fx.f;
+}
+'''
+# define function: cospi
+cospi='''
+static float cospi(float x){
+  float r = x;
+  if ( x > 1 || x < -1) r = reduce1(x);
+
+  // reduce to [0.0, 1.0]
+  if (r < 0)
+    r = fabs(r);
+
+  if (r >= 0 && r <= 0.25)
+    return  cosf(r * M_PI);
+  else if (r > 0.25 && r <= 0.5)
+    return  sinf((0.5 - r) * M_PI);
+  else if (r > 0.5 && r <= 0.75)
+    return sinf(-(r-0.5) * M_PI);
+  else if (r > 0.75 && r <= 1.0){
+    return -cosf((1 -  r) * M_PI);}
+
+  // Error return
+  return 0xffffffff;
+}
+'''
+# define function: sinpi
+sinpi='''
+static float sinpi(float x){
+  float r = x;
+  if ( x > 1 || x < -1) r = reduce1(x);
+
+  // reduce to [-0.5, 0.5]
+  if (r < -0.5)
+    r = -1 - r;
+  else if (r > 0.5)
+    r = 1 - r;
+
+  if (r > 0.25 && r <= 0.5)
+    return  cosf((0.5 - r) * M_PI);
+  else if (r >= 0 && r <= 0.25)
+    return  sinf(r * M_PI);
+  else if (r >= -0.25 && r < 0)
+    return -sinf(r * -M_PI);
+  else if (r >= -0.5 && r < -0.25){
+    return -cosf((0.5 + r) * M_PI);}
+
+  // Error return
+  return 0xffffffff;
+}
+'''
+
+base_input_values = [ 0, 1, 3.14, 5.15, 6.01, 7.89]
+base_input_values1 = [ 1, 3.14, 5.15, 6.01, 7.89]
+def main():
+  ##### gentype acos(gentype)
+  acos_input_values = base_input_values
+  acos_input_type = ['float','float2','float4','float8','float16']
+  acos_output_type = ['float','float2','float4','float8','float16']
+  acosUtests = func('acos','acos',[acos_input_type],acos_output_type,[acos_input_values],'4 * FLT_ULP')
+  
+  ##### gentype acosh(gentype)
+  acosh_input_values = base_input_values
+  acosh_input_type = ['float','float2','float4','float8','float16']
+  acosh_output_type = ['float','float2','float4','float8','float16']
+  acoshUtests = func('acosh','acosh',[acosh_input_type],acosh_output_type,[acosh_input_values],'4 * FLT_ULP')
+  
+  ##### gentype acospi(gentype x)
+  acospi_input_values = base_input_values
+  acospi_input_type = ['float','float2','float4','float8','float16']
+  acospi_output_type = ['float','float2','float4','float8','float16']
+  acospi_cpu_func='''
+static float acospi(float x){
+  return acos(x)/M_PI;
+} '''
+  acospiUtests = func('acospi','acospi',[acospi_input_type],acospi_output_type,[acospi_input_values],'4 * FLT_ULP',acospi_cpu_func)
+  
+  ##### gentype asin(gentype)
+  asin_input_values = base_input_values
+  asin_input_type = ['float','float2','float4','float8','float16']
+  asin_output_type = ['float','float2','float4','float8','float16']
+  asinUtests = func('asin','asin',[asin_input_type],asin_output_type,[asin_input_values],'4 * FLT_ULP')
+  
+  ##### gentype asinh(gentype)
+  asinh_input_values = base_input_values
+  asinh_input_type = ['float','float2','float4','float8','float16']
+  asinh_output_type = ['float','float2','float4','float8','float16']
+  asinhUtests = func('asinh','asinh',[asinh_input_type],asinh_output_type,[asinh_input_values],'4 * FLT_ULP')
+  
+  ##### gentype asinpi(gentype x)
+  asinpi_input_values = base_input_values
+  asinpi_input_type = ['float','float2','float4','float8','float16']
+  asinpi_output_type = ['float','float2','float4','float8','float16']
+  asinpi_cpu_func='''
+static float asinpi(float x){
+  return asin(x)/M_PI;
+} '''
+  asinpiUtests = func('asinpi','asinpi',[asinpi_input_type],asinpi_output_type,[asinpi_input_values],'4 * FLT_ULP',asinpi_cpu_func)
+  
+  ##### gentype atan(gentype y_over_x)
+  atan_input_values = base_input_values
+  atan_input_type = ['float','float2','float4','float8','float16']
+  atan_output_type = ['float','float2','float4','float8','float16']
+  atanUtests = func('atan','atan',[atan_input_type],atan_output_type,[atan_input_values],'5 * FLT_ULP')
+  
+  ##### gentype atan2(gentype y, gentype x)
+  atan2_base_values = base_input_values
+  atan2_input_values1 = []
+  atan2_input_values2 = []
+  atan2_input_values1,atan2_input_values2=gene2ValuesLoop(atan2_input_values1,atan2_input_values2,atan2_base_values)
+  atan2_input_type1 = ['float','float2','float4','float8','float16']
+  atan2_input_type2 = ['float','float2','float4','float8','float16']
+  atan2_output_type = ['float','float2','float4','float8','float16']
+  atan2Utests = func('atan2','atan2',[atan2_input_type1,atan2_input_type2],atan2_output_type,[atan2_input_values1,atan2_input_values2],'6 * FLT_ULP')
+  
+  ##### gentype atanh(gentype)
+  atanh_input_values = base_input_values
+  atanh_input_type = ['float','float2','float4','float8','float16']
+  atanh_output_type = ['float','float2','float4','float8','float16']
+  atanhUtests = func('atanh','atanh',[atanh_input_type],atanh_output_type,[atanh_input_values],'5 * FLT_ULP')
+  
+  ##### gentype atanpi(gentype x)
+  atanpi_input_values = base_input_values
+  atanpi_input_type = ['float','float2','float4','float8','float16']
+  atanpi_output_type = ['float','float2','float4','float8','float16']
+  atanpi_cpu_func='''
+static float atanpi(float x){
+  return atan(x)/M_PI;
+} '''
+  atanpiUtests = func('atanpi','atanpi',[atanpi_input_type],atanpi_output_type,[atanpi_input_values],'4 * FLT_ULP',atanpi_cpu_func)
+  
+#  ##### gentype atan2pi(gentype y, gentype x)
+#  atan2pi_base_values = base_input_values
+#  atan2pi_input_values1 = []
+#  atan2pi_input_values2 = []
+#  atan2pi_input_values1,atan2pi_input_values2=gene2ValuesLoop(atan2pi_input_values1,atan2pi_input_values2,atan2pi_base_values)
+#  atan2pi_input_type1 = ['float','float2','float4','float8','float16']
+#  atan2pi_input_type2 = ['float','float2','float4','float8','float16']
+#  atan2pi_output_type = ['float','float2','float4','float8','float16']
+#  atan2pi_cpu_func='''
+#static float atan2pi(float y, float x){
+#  return atan2(y,x)/M_PI;
+#} '''
+#  atan2piUtests = func('atan2pi','atan2pi',[atan2pi_input_type1,atan2pi_input_type2],atan2pi_output_type,[atan2pi_input_values1,atan2pi_input_values2],'6 * FLT_ULP',atan2pi_cpu_func)
+  
+  ##### gentype cbrt(gentype)
+  cbrt_input_values = base_input_values
+  cbrt_input_type = ['float','float2','float4','float8','float16']
+  cbrt_output_type = ['float','float2','float4','float8','float16']
+  cbrtUtests = func('cbrt','cbrt',[cbrt_input_type],cbrt_output_type,[cbrt_input_values],'4 * FLT_ULP')
+  
+  ##### gentype ceil(gentype)
+  ceil_input_values = base_input_values
+  ceil_input_type = ['float','float2','float4','float8','float16']
+  ceil_output_type = ['float','float2','float4','float8','float16']
+  ceilUtests = func('ceil','ceil',[ceil_input_type],ceil_output_type,[ceil_input_values],'0 * FLT_ULP')
+  
+  ##### gentype copysign(gentype x, gentype y)
+  copysign_base_values = base_input_values
+  copysign_input_values1 = []
+  copysign_input_values2 = []
+  copysign_input_values1,copysign_input_values2=gene2ValuesLoop(copysign_input_values1,copysign_input_values2,copysign_base_values)
+  copysign_input_type1 = ['float','float2','float4','float8','float16']
+  copysign_input_type2 = ['float','float2','float4','float8','float16']
+  copysign_output_type = ['float','float2','float4','float8','float16']
+  copysignUtests = func('copysign','copysign',[copysign_input_type1,copysign_input_type2],copysign_output_type,[copysign_input_values1,copysign_input_values2],'0 * FLT_ULP')
+  
+  ##### gentype cos(gentype)
+  cos_input_values = base_input_values
+  cos_input_type = ['float','float2','float4','float8','float16']
+  cos_output_type = ['float','float2','float4','float8','float16']
+  cosUtests = func('cos','cos',[cos_input_type],cos_output_type,[cos_input_values],'4 * FLT_ULP')
+  
+  ##### gentype cosh(gentype)
+  cosh_input_values = base_input_values
+  cosh_input_type = ['float','float2','float4','float8','float16']
+  cosh_output_type = ['float','float2','float4','float8','float16']
+  coshUtests = func('cosh','cosh',[cosh_input_type],cosh_output_type,[cosh_input_values],'4 * FLT_ULP')
+  
+  ##### gentype cospi(gentype x)
+  cospi_input_values = base_input_values
+  cospi_input_type = ['float','float2','float4','float8','float16']
+  cospi_output_type = ['float','float2','float4','float8','float16']
+  cospi_cpu_func=reduce1+cospi
+  cospiUtests = func('cospi','cospi',[cospi_input_type],cospi_output_type,[cospi_input_values],'2 * FLT_ULP',cospi_cpu_func)
+  
+#  ##### gentype erf(gentype)
+#  erf_input_values = base_input_values
+#  erf_input_type = ['float','float2','float4','float8','float16']
+#  erf_output_type = ['float','float2','float4','float8','float16']
+#  erfUtests = func('erf','erf',[erf_input_type],erf_output_type,[erf_input_values],'16 * FLT_ULP')
+
+#  ##### gentype erfc(gentype)
+#  erfc_input_values = base_input_values
+#  erfc_input_type = ['float','float2','float4','float8','float16']
+#  erfc_output_type = ['float','float2','float4','float8','float16']
+#  erfcUtests = func('erfc','erfc',[erfc_input_type],erfc_output_type,[erfc_input_values],'16 * FLT_ULP')
+  
+  ##### gentype exp(gentype x)
+  exp_input_values = base_input_values
+  exp_input_type = ['float','float2','float4','float8','float16']
+  exp_output_type = ['float','float2','float4','float8','float16']
+  expUtests = func('exp','exp',[exp_input_type],exp_output_type,[exp_input_values],'4 * FLT_ULP')
+  
+  ##### gentype exp2(gentype)
+  exp2_input_values = base_input_values
+  exp2_input_type = ['float','float2','float4','float8','float16']
+  exp2_output_type = ['float','float2','float4','float8','float16']
+  exp2Utests = func('exp2','exp2',[exp2_input_type],exp2_output_type,[exp2_input_values],'4 * FLT_ULP')
+  
+  ##### gentype exp10(gentype)
+  exp10_input_values = base_input_values
+  exp10_input_type = ['float','float2','float4','float8','float16']
+  exp10_output_type = ['float','float2','float4','float8','float16']
+  exp10Utests = func('exp10','exp10',[exp10_input_type],exp10_output_type,[exp10_input_values],'4 * FLT_ULP')
+  
+  ##### gentype expm1(gentype x)
+  expm1_input_values = base_input_values
+  expm1_input_type = ['float','float2','float4','float8','float16']
+  expm1_output_type = ['float','float2','float4','float8','float16']
+  expm1Utests = func('expm1','expm1',[expm1_input_type],expm1_output_type,[expm1_input_values],'4 * FLT_ULP')
+  
+  ##### gentype fabs(gentype)
+  fabs_input_values = base_input_values
+  fabs_input_type = ['float','float2','float4','float8','float16']
+  fabs_output_type = ['float','float2','float4','float8','float16']
+  fabsUtests = func('fabs','fabs',[fabs_input_type],fabs_output_type,[fabs_input_values],'0 * FLT_ULP')
+  
+  ##### gentype fdim(gentype x, gentype y)
+  fdim_base_values = base_input_values
+  fdim_input_values1 = []
+  fdim_input_values2 = []
+  fdim_input_values1,fdim_input_values2=gene2ValuesLoop(fdim_input_values1,fdim_input_values2,fdim_base_values)
+  fdim_input_type1 = ['float','float2','float4','float8','float16']
+  fdim_input_type2 = ['float','float2','float4','float8','float16']
+  fdim_output_type = ['float','float2','float4','float8','float16']
+  fdimUtests = func('fdim','fdim',[fdim_input_type1,fdim_input_type2],fdim_output_type,[fdim_input_values1,fdim_input_values2],'0 * FLT_ULP')
+  
+  ##### gentype floor(gentype)
+  floor_input_values = base_input_values
+  floor_input_type = ['float','float2','float4','float8','float16']
+  floor_output_type = ['float','float2','float4','float8','float16']
+  floorUtests = func('floor','floor',[floor_input_type],floor_output_type,[floor_input_values],'0 * FLT_ULP')
+  
+  ##### gentype fmax(gentype x, gentype y)
+  fmax_base_values = base_input_values
+  fmax_input_values1 = []
+  fmax_input_values2 = []
+  fmax_input_values1,fmax_input_values2=gene2ValuesLoop(fmax_input_values1,fmax_input_values2,fmax_base_values)
+  fmax_input_type1 = ['float','float2','float4','float8','float16']
+  fmax_input_type2 = ['float','float2','float4','float8','float16']
+  fmax_output_type = ['float','float2','float4','float8','float16']
+  fmaxUtests = func('fmax','fmax',[fmax_input_type1,fmax_input_type2],fmax_output_type,[fmax_input_values1,fmax_input_values2],'0 * FLT_ULP')
+  
+  ##### gentypef fmax(gentypef x, float y)
+#  fmax_gentypef_base_values = base_input_values
+#  fmax_gentypef_input_values1 = []
+#  fmax_gentypef_input_values2 = []
+#  fmax_gentypef_input_values2,fmax_gentypef_input_values1=gene2ValuesLoop(fmax_gentypef_input_values1,fmax_gentypef_input_values2,fmax_gentypef_base_values)
+#  fmax_gentypef_input_type1 = ['float','float2','float4','float8','float16']
+#  fmax_gentypef_input_type2 = ['float','float','float','float','float']
+#  fmax_gentypef_output_type = ['float','float2','float4','float8','float16']
+#  ##### gentypef fmax(gentypef x, float y)
+#  fmax_gentypefUtests = func('gentypef_fmax','gentypef_fmax',[fmax_gentypef_input_type1,fmax_gentypef_input_type2],fmax_gentypef_output_type,[fmax_gentypef_input_values1,fmax_gentypef_input_values2],'0 * FLT_ULP')
+  
+  ##### gentype fmin(gentype x, gentype y)
+  fmin_base_values = base_input_values
+  fmin_input_values1 = []
+  fmin_input_values2 = []
+  fmin_input_values1,fmin_input_values2=gene2ValuesLoop(fmin_input_values1,fmin_input_values2,fmin_base_values)
+  fmin_input_type1 = ['float','float2','float4','float8','float16']
+  fmin_input_type2 = ['float','float2','float4','float8','float16']
+  fmin_output_type = ['float','float2','float4','float8','float16']
+  fminUtests = func('fmin','fmin',[fmin_input_type1,fmin_input_type2],fmin_output_type,[fmin_input_values1,fmin_input_values2],'0 * FLT_ULP')
+  
+#  ##### gentypef fmin(gentypef x, float y)
+#  fmin_gentypef_base_values = base_input_values
+#  fmin_gentypef_input_values1 = []
+#  fmin_gentypef_input_values2 = []
+#  fmin_gentypef_input_values2,fmin_gentypef_input_values1=gene2ValuesLoop(fmin_gentypef_input_values1,fmin_gentypef_input_values2,fmin_gentypef_base_values)
+#  fmin_gentypef_input_type1 = ['float','float2','float4','float8','float16']
+#  fmin_gentypef_input_type2 = ['float','float','float','float','float']
+#  fmin_gentypef_output_type = ['float','float2','float4','float8','float16']
+#  ##### gentypef fmin(gentypef x, float y)
+#  fmin_gentypefUtests = func('gentypef_fmin','gentypef_fmin',[fmin_gentypef_input_type1,fmin_gentypef_input_type2],fmin_gentypef_output_type,[fmin_gentypef_input_values1,fmin_gentypef_input_values2],'0 * FLT_ULP')
+#  
+  ##### gentype fmod(gentype x, gentype y)
+  fmod_base_values = base_input_values
+  fmod_input_values1 = []
+  fmod_input_values2 = []
+  fmod_input_values1,fmod_input_values2=gene2ValuesLoop(fmod_input_values1,fmod_input_values2,fmod_base_values)
+  fmod_input_type1 = ['float','float2','float4','float8','float16']
+  fmod_input_type2 = ['float','float2','float4','float8','float16']
+  fmod_output_type = ['float','float2','float4','float8','float16']
+  fmodUtests = func('fmod','fmod',[fmod_input_type1,fmod_input_type2],fmod_output_type,[fmod_input_values1,fmod_input_values2],'0 * FLT_ULP')
+  
+  ##### gentype hypot(gentype x, gentype y)
+  hypot_base_values = base_input_values
+  hypot_input_values1 = []
+  hypot_input_values2 = []
+  hypot_input_values1,hypot_input_values2=gene2ValuesLoop(hypot_input_values1,hypot_input_values2,hypot_base_values)
+  hypot_input_type1 = ['float','float2','float4','float8','float16']
+  hypot_input_type2 = ['float','float2','float4','float8','float16']
+  hypot_output_type = ['float','float2','float4','float8','float16']
+  hypotUtests = func('hypot','hypot',[hypot_input_type1,hypot_input_type2],hypot_output_type,[hypot_input_values1,hypot_input_values2],'4 * FLT_ULP')
+  
+  ##### intn ilogb(floatn x)
+  ilogb_input_values = base_input_values
+  ilogb_input_type = ['float','float2','float4','float8','float16']
+  ilogb_output_type = ['int','int2','int4','int8','int16']
+  ilogbUtests = func('ilogb','ilogb',[ilogb_input_type],ilogb_output_type,[ilogb_input_values],'0 * INT_ULP')
+
+  ##### gentype lgamma(gentype x)
+  lgamma_input_values = base_input_values
+  lgamma_input_type = ['float','float2','float4','float8','float16']
+  lgamma_output_type = ['float','float2','float4','float8','float16']
+  lgammaUtests = func('lgamma','lgamma',[lgamma_input_type],lgamma_output_type,[lgamma_input_values],'4 * FLT_ULP')
+
+  ##### gentype log(gentype)
+  log_input_values = base_input_values
+  log_input_type = ['float','float2','float4','float8','float16']
+  log_output_type = ['float','float2','float4','float8','float16']
+  logUtests = func('log','log',[log_input_type],log_output_type,[log_input_values],'4 * FLT_ULP')
+  
+  ##### gentype log2(gentype)
+  log2_input_values = base_input_values
+  log2_input_type = ['float','float2','float4','float8','float16']
+  log2_output_type = ['float','float2','float4','float8','float16']
+  log2Utests = func('log2','log2',[log2_input_type],log2_output_type,[log2_input_values],'4 * FLT_ULP')
+  
+  ##### gentype log10(gentype)
+  log10_input_values = base_input_values
+  log10_input_type = ['float','float2','float4','float8','float16']
+  log10_output_type = ['float','float2','float4','float8','float16']
+  log10Utests = func('log10','log10',[log10_input_type],log10_output_type,[log10_input_values],'4 * FLT_ULP')
+  
+  ##### gentype log1p(gentype x)
+  log1p_input_values = base_input_values
+  log1p_input_type = ['float','float2','float4','float8','float16']
+  log1p_output_type = ['float','float2','float4','float8','float16']
+  log1pUtests = func('log1p','log1p',[log1p_input_type],log1p_output_type,[log1p_input_values],'4 * FLT_ULP')
+  
+  ##### gentype logb(gentype x)
+  logb_input_values = base_input_values
+  logb_input_type = ['float','float2','float4','float8','float16']
+  logb_output_type = ['float','float2','float4','float8','float16']
+  logbUtests = func('logb','logb',[logb_input_type],logb_output_type,[logb_input_values],'0 * FLT_ULP')
+  
+  ##### gentype maxmag(gentype x, gentype y)
+  maxmag_base_values = base_input_values
+  maxmag_input_values1 = []
+  maxmag_input_values2 = []
+  maxmag_input_values1,maxmag_input_values2=gene2ValuesLoop(maxmag_input_values1,maxmag_input_values2,maxmag_base_values)
+  maxmag_input_type1 = ['float','float2','float4','float8','float16']
+  maxmag_input_type2 = ['float','float2','float4','float8','float16']
+  maxmag_output_type = ['float','float2','float4','float8','float16']
+  maxmag_cpu_func='''
+static float maxmag(float x, float y){
+  if(fabs(x) > fabs(y))
+    return x;
+  else if (fabs(x) < fabs(y))
+    return y;
+  else
+    return fmax(x,y);
+} '''
+  maxmagUtests = func('maxmag','maxmag',[maxmag_input_type1,maxmag_input_type2],maxmag_output_type,[maxmag_input_values1,maxmag_input_values2],'0 * FLT_ULP',maxmag_cpu_func)
+  
+  ##### gentype minmag(gentype x, gentype y)
+  minmag_base_values = base_input_values
+  minmag_input_values1 = []
+  minmag_input_values2 = []
+  minmag_input_values1,minmag_input_values2=gene2ValuesLoop(minmag_input_values1,minmag_input_values2,minmag_base_values)
+  minmag_input_type1 = ['float','float2','float4','float8','float16']
+  minmag_input_type2 = ['float','float2','float4','float8','float16']
+  minmag_output_type = ['float','float2','float4','float8','float16']
+  minmag_cpu_func='''
+static float minmag(float x, float y){
+  if(fabs(x) < fabs(y))
+    return x;
+  else if (fabs(x) > fabs(y))
+    return y;
+  else
+    return fmin(x,y);
+} '''
+  minmagUtests = func('minmag','minmag',[minmag_input_type1,minmag_input_type2],minmag_output_type,[minmag_input_values1,minmag_input_values2],'0 * FLT_ULP',minmag_cpu_func)
+  
+#  ##### floatn nan(uintn nancode)
+#  nan_input_values = base_input_values
+#  nan_input_type = ['uint','uint2','uint4','uint8','uint16']
+#  nan_output_type = ['float','float2','float4','float8','float16']
+#  nanUtests = func('nan','nan',[nan_input_type],nan_output_type,[nan_input_values],'0 * FLT_ULP')
+  
+  ##### gentype nextafter(gentype x, gentype y)
+  nextafter_base_values = base_input_values
+  nextafter_input_values1 = []
+  nextafter_input_values2 = []
+  nextafter_input_values1,nextafter_input_values2=gene2ValuesLoop(nextafter_input_values1,nextafter_input_values2,nextafter_base_values)
+  nextafter_input_type1 = ['float','float2','float4','float8','float16']
+  nextafter_input_type2 = ['float','float2','float4','float8','float16']
+  nextafter_output_type = ['float','float2','float4','float8','float16']
+  nextafterUtests = func('nextafter','nextafterf',[nextafter_input_type1,nextafter_input_type2],nextafter_output_type,[nextafter_input_values1,nextafter_input_values2],'0 * FLT_ULP')
+  
+  ##### gentype pow(gentype x, gentype y)
+  pow_base_values = base_input_values1
+  pow_input_values1 = []
+  pow_input_values2 = []
+  pow_input_values1,pow_input_values2=gene2ValuesLoop(pow_input_values1,pow_input_values2,pow_base_values)
+  pow_input_type1 = ['float','float2','float4','float8','float16']
+  pow_input_type2 = ['float','float2','float4','float8','float16']
+  pow_output_type = ['float','float2','float4','float8','float16']
+  powUtests = func('pow','powf',[pow_input_type1,pow_input_type2],pow_output_type,[pow_input_values1,pow_input_values2],'16 * FLT_ULP')
+  
+  ##### floatn pown(floatn x, intn y)
+  pown_input_values1 = [FLT_MAX_POSI,FLT_MIN_NEGA,FLT_MIN_POSI,FLT_MAX_NEGA,80, -80, 3.14, -3.14, 0.5, 1, 0.0,1500.24,-1500.24]
+  pown_input_values2 = [-1,-2,-3,4,5,6,7,8,10,12,14,16,12]
+  pown_input_type1 = ['float','float2','float4','float8','float16']
+  pown_input_type2 = ['int','int2','int4','int8','int16']
+  pown_output_type = ['float','float2','float4','float8','float16']
+  pown_cpu_func='''
+static float pown(float x, int y){
+    return pow(x,y);
+} '''
+  pownUtests = func('pown','pown',[pown_input_type1,pown_input_type2],pown_output_type,[pown_input_values1,pown_input_values2],'16 * FLT_ULP', pown_cpu_func)
+  
+  ##### gentype powr(gentype x, gentype y)
+  powr_input_values1 = [80, -80, 3.14, -3.14, 0.5, 1, -1, 0.0,6,1500.24,-1500.24]
+  powr_input_values2 = [5,6,7,8,10,11,12,13,14,0,12]
+  powr_input_type1 = ['float','float2','float4','float8','float16']
+  powr_input_type2 = ['float','float2','float4','float8','float16']
+  powr_output_type = ['float','float2','float4','float8','float16']
+  powr_cpu_func='''
+static float powr(float x, float y){
+    return powf(x,y);
+} '''
+  powrUtests = func('powr','powr',[powr_input_type1,powr_input_type2],powr_output_type,[powr_input_values1,powr_input_values2],'16 * FLT_ULP', powr_cpu_func)
+  
+  ##### gentype remainder(gentype x, gentype y)
+  remainder_base_values = base_input_values
+  remainder_input_values1 = []
+  remainder_input_values2 = []
+  remainder_input_values1,remainder_input_values2=gene2ValuesLoop(remainder_input_values1,remainder_input_values2,remainder_base_values)
+  remainder_input_type1 = ['float','float2','float4','float8','float16']
+  remainder_input_type2 = ['float','float2','float4','float8','float16']
+  remainder_output_type = ['float','float2','float4','float8','float16']
+  remainderUtests = func('remainder','remainder',[remainder_input_type1,remainder_input_type2],remainder_output_type,[remainder_input_values1,remainder_input_values2],'0 * FLT_ULP')
+  
+  ##### gentype rint(gentype x)
+  rint_input_values = base_input_values
+  rint_input_type = ['float','float2','float4','float8','float16']
+  rint_output_type = ['float','float2','float4','float8','float16']
+  rintUtests = func('rint','rint',[rint_input_type],rint_output_type,[rint_input_values],'0 * FLT_ULP')
+  
+  ##### floatn rootn(floatn x, intn y)
+  rootn_input_values1 = [0.0, 0.0012,  0.5, 1, 3.14, 12345]
+  rootn_input_values2 = [-1, 1, -20, 20, -123, 456]
+  rootn_input_type1 = ['float','float2','float4','float8','float16']
+  rootn_input_type2 = ['int','int2','int4','int8','int16']
+  rootn_output_type = ['float','float2','float4','float8','float16']
+  rootn_cpu_func='''
+static float rootn(float x, int y){
+    return pow(x,1.0/y);
+} '''
+  rootnUtests = func('rootn','rootn',[rootn_input_type1,rootn_input_type2],rootn_output_type,[rootn_input_values1,rootn_input_values2],'4 * FLT_ULP',rootn_cpu_func)
+  
+  ##### gentype round(gentype x)
+  round_input_values = base_input_values
+  round_input_type = ['float','float2','float4','float8','float16']
+  round_output_type = ['float','float2','float4','float8','float16']
+  roundUtests = func('round','round',[round_input_type],round_output_type,[round_input_values],'0 * FLT_ULP')
+  
+  ##### gentype rsqrt(gentype)
+  rsqrt_input_values = base_input_values
+  rsqrt_input_type = ['float','float2','float4','float8','float16']
+  rsqrt_output_type = ['float','float2','float4','float8','float16']
+  rsqrt_cpu_func='''
+static float rsqrt(float x)
+{ return 1/sqrt(x);} '''
+  rsqrtUtests = func('rsqrt','rsqrt',[rsqrt_input_type],rsqrt_output_type,[rsqrt_input_values],'4 * FLT_ULP', rsqrt_cpu_func)
+
+ 
+  ##### gentype sin(gentype)
+  sin_input_values = base_input_values
+  sin_input_type = ['float','float2','float4','float8','float16']
+  sin_output_type = ['float','float2','float4','float8','float16']
+  sinUtests = func('sin','sin',[sin_input_type],sin_output_type,[sin_input_values],'4 * FLT_ULP')
+  
+#  ##### gentype sincos(gentype)
+#  sincos_input_values1 = [FLT_MAX_POSI,FLT_MIN_NEGA,FLT_MIN_POSI,FLT_MAX_NEGA,80, -80, 3.14, -3.14, -0.5, 0.5, 1, -1, 0.0,6,-6,1500.24,-1500.24]
+#  sincos_input_values2 = []
+#  sincos_input_type1 = ['float','float2','float4','float8','float16']
+#  sincos_input_type2 = ['float','float2','float4','float8','float16']
+#  sincos_output_type = ['float','float2','float4','float8','float16']
+#  ###### gentype sincos(gentype)
+#  #  sincosUtests = func('sincos','sincos',[sincos_input_type1,sincos_input_type2],sincos_output_type,[sincos_input_values1,sincos_input_values2],'4 * FLT_ULP')
+  
+  ##### gentype sinh(gentype)
+  sinh_input_values = base_input_values
+  sinh_input_type = ['float','float2','float4','float8','float16']
+  sinh_output_type = ['float','float2','float4','float8','float16']
+  sinhUtests = func('sinh','sinh',[sinh_input_type],sinh_output_type,[sinh_input_values],'4 * FLT_ULP')
+  
+  ##### gentype sinpi(gentype x)
+  sinpi_input_values = [0, 1, 3.14, -0.88, -0.12, -0.5, 0.5, -0.49, 0.49, 0.51, -0.51, -0.1, 0.1]
+  sinpi_input_type = ['float','float2','float4','float8','float16']
+  sinpi_output_type = ['float','float2','float4','float8','float16']
+  sinpi_cpu_func=reduce1+sinpi
+  sinpiUtests = func('sinpi','sinpi',[sinpi_input_type],sinpi_output_type,[sinpi_input_values],'4 * FLT_ULP',sinpi_cpu_func)
+  
+  ##### gentype sqrt(gentype)
+  sqrt_input_values = base_input_values
+  sqrt_input_type = ['float','float2','float4','float8','float16']
+  sqrt_output_type = ['float','float2','float4','float8','float16']
+  sqrtUtests = func('sqrt','sqrt',[sqrt_input_type],sqrt_output_type,[sqrt_input_values],'4 * FLT_ULP')
+  
+  ##### gentype tan(gentype)
+  tan_input_values = base_input_values
+  tan_input_type = ['float','float2','float4','float8','float16']
+  tan_output_type = ['float','float2','float4','float8','float16']
+  tanUtests = func('tan','tan',[tan_input_type],tan_output_type,[tan_input_values],'5 * FLT_ULP')
+  
+  ##### gentype tanh(gentype)
+  tanh_input_values = base_input_values
+  tanh_input_type = ['float','float2','float4','float8','float16']
+  tanh_output_type = ['float','float2','float4','float8','float16']
+  tanhUtests = func('tanh','tanh',[tanh_input_type],tanh_output_type,[tanh_input_values],'5 * FLT_ULP')
+  
+  ##### gentype tanpi(gentype x)
+  tanpi_input_values = [ 0, 3.14, 5.15, 6.01, 7.89]
+  tanpi_input_type = ['float','float2','float4','float8','float16']
+  tanpi_output_type = ['float','float2','float4','float8','float16']
+  tanpi_cpu_func=reduce1+sinpi+cospi+'''
+static float tanpi(float x){
+  return sinpi(x)/cospi(x);
+}
+'''
+  tanpiUtests = func('tanpi','tanpi',[tanpi_input_type],tanpi_output_type,[tanpi_input_values],'400 * FLT_ULP',tanpi_cpu_func)
+  
+  ##### gentype trunc(gentype)
+  trunc_input_values = base_input_values
+  trunc_input_type = ['float','float2','float4','float8','float16']
+  trunc_output_type = ['float','float2','float4','float8','float16']
+  truncUtests = func('trunc','trunc',[trunc_input_type],trunc_output_type,[trunc_input_values],'0 * FLT_ULP')
+
+if __name__ == "__main__":
+  main()
diff --git a/utests/utest_run.cpp b/utests/utest_run.cpp
new file mode 100644
index 0000000..cd4356a
--- /dev/null
+++ b/utests/utest_run.cpp
@@ -0,0 +1,118 @@
+/* 
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia at intel.com>
+ */
+
+/**
+ * \file utest_run.cpp
+ * \author Benjamin Segovia <benjamin.segovia at intel.com>
+ *
+ * Just run the unit tests. The user may optionally select a subset of them.
+ */
+#include "utest_helper.hpp"
+#include "utest_exception.hpp"
+#include <iostream>
+#include <getopt.h>
+
+static const char *shortopts = "c:lanh";
+struct option longopts[] = {
+{"casename", required_argument, NULL, 'c'},
+{"list", no_argument, NULL, 'l'},
+{"all", no_argument, NULL, 'a'},
+{"allnoissue", no_argument, NULL, 'n'},
+{"help", no_argument, NULL, 'h'},
+{0, 0, 0, 0},
+};
+
+void usage()
+{
+    std::cout << "\
+Usage:\n\
+  ./utest_run <option>\n\
+\n\
+  option:\n\
+    -c <casename>: run the sub-case named 'casename'\n\
+    -l           : list all available case names\n\
+    -a           : run all test cases\n\
+    -n           : run all test cases without known issues (default option)\n\
+    -h           : display this usage\n\
+\
+    "<< std::endl;
+}
+
+int main(int argc, char *argv[])
+{
+
+  int c = 0;
+  cl_ocl_init();
+
+  c = getopt_long (argc, argv, shortopts, longopts, NULL);
+
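+  /* With no arguments, default to running all cases without known issues;
+   * a single non-option argument is treated as a case name. */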
+  if (argc == 1)
+    c = 'n';
+  if (argc == 2 && c < 1) {
+    c = 'c';
+    optarg = argv[1];
+  }
+
+  do {
+    switch (c)
+    {
+      case 'c':
+        try {
+          UTest::run(optarg);
+        }
+        catch (Exception e){
+          std::cout << "  " << e.what() << "    [SUCCESS]" << std::endl;
+        }
+
+        break;
+
+      case 'l':
+        UTest::listAllCases();
+        break;
+
+      case 'a':
+        try {
+          UTest::runAll();
+        }
+        catch (Exception e){
+          std::cout << "  " << e.what() << "    [SUCCESS]" << std::endl;
+        }
+
+        break;
+
+      case 'n':
+        try {
+          UTest::runAllNoIssue();
+        }
+        catch (Exception e){
+          std::cout << "  " << e.what() << "    [SUCCESS]" << std::endl;
+        }
+
+        break;
+
+      case 'h':
+      default:
+        usage();
+        exit(1);
+    }
+  } while ((c = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1);
+
+  cl_ocl_destroy();
+}
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git


